Re: [v3 PATCH 3/6] arm64: mm: make __create_pgd_mapping() and helpers non-void

From: Yang Shi
Date: Mon Mar 17 2025 - 13:54:03 EST

On 3/14/25 4:51 AM, Ryan Roberts wrote:
On 04/03/2025 22:19, Yang Shi wrote:
A later patch will enhance __create_pgd_mapping() and related helpers
to split the kernel linear mapping; this requires them to have a return
value. So make __create_pgd_mapping() and its helpers non-void functions.

Also move the BUG_ON() out of the page table alloc helper, since failing
to split the kernel linear mapping is not fatal and can be handled by the
callers in a later patch. Call BUG_ON() after
__create_pgd_mapping_locked() returns to keep the current callers'
behavior intact.

Suggested-by: Ryan Roberts <ryan.roberts@xxxxxxx>
Signed-off-by: Yang Shi <yang@xxxxxxxxxxxxxxxxxxxxxx>
---
arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
1 file changed, 86 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4df5bc5b1b8..dccf0877285b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -189,11 +189,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
} while (ptep++, addr += PAGE_SIZE, addr != end);
}
-static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, phys_addr_t phys,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
{
unsigned long next;
pmd_t pmd = READ_ONCE(*pmdp);
@@ -208,6 +208,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pte_phys = pgtable_alloc(PAGE_SHIFT);
+ if (!pte_phys)
+ return -ENOMEM;
nit: personally I'd prefer to see a "goto out" and funnel everything to a
single return statement. You do that in some functions (via loop break),
but it would be cleaner to be consistent.

If pgtable_alloc() is modified to return int (see my comment at the bottom),
this becomes:

ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
if (ret)
goto out;

OK
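
Something along these lines then, with all exits funneled through a single
label (a rough sketch only, not tested, assuming the int-returning
allocator with a phys_addr_t out-parameter suggested at the bottom):

static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
			       unsigned long end, phys_addr_t phys,
			       pgprot_t prot,
			       int (*pgtable_alloc)(int, phys_addr_t *),
			       int flags)
{
	int ret = 0;
	...
	if (pmd_none(pmd)) {
		...
		ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
		if (ret)
			/* fixmap not taken yet, so skip the clear below */
			goto out;
		ptep = pte_set_fixmap(pte_phys);
		...
	}
	...
	pte_clear_fixmap();
out:
	return ret;
}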

ptep = pte_set_fixmap(pte_phys);
init_clear_pgtable(ptep);
ptep += pte_index(addr);
@@ -239,13 +241,16 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
* walker.
*/
pte_clear_fixmap();
+
+ return 0;
}
-static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags)
{
unsigned long next;
+ int ret = 0;
do {
pmd_t old_pmd = READ_ONCE(*pmdp);
@@ -264,22 +269,27 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
READ_ONCE(pmd_val(*pmdp))));
} else {
- alloc_init_cont_pte(pmdp, addr, next, phys, prot,
+ ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
pgtable_alloc, flags);
+ if (ret)
+ break;
BUG_ON(pmd_val(old_pmd) != 0 &&
pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
}
phys += next - addr;
} while (pmdp++, addr = next, addr != end);
+
+ return ret;
}
-static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
+ unsigned long end, phys_addr_t phys,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags)
{
unsigned long next;
+ int ret = 0;
pud_t pud = READ_ONCE(*pudp);
pmd_t *pmdp;
@@ -295,6 +305,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pmd_phys = pgtable_alloc(PMD_SHIFT);
+ if (!pmd_phys)
+ return -ENOMEM;
pmdp = pmd_set_fixmap(pmd_phys);
init_clear_pgtable(pmdp);
pmdp += pmd_index(addr);
@@ -314,21 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
- init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+ ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+ if (ret)
+ break;
pmdp += pmd_index(next) - pmd_index(addr);
phys += next - addr;
} while (addr = next, addr != end);
pmd_clear_fixmap();
+
+ return ret;
}
-static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
{
unsigned long next;
+ int ret = 0;
p4d_t p4d = READ_ONCE(*p4dp);
pud_t *pudp;
@@ -340,6 +357,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc(PUD_SHIFT);
+ if (!pud_phys)
+ return -ENOMEM;
pudp = pud_set_fixmap(pud_phys);
init_clear_pgtable(pudp);
pudp += pud_index(addr);
@@ -369,8 +388,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
READ_ONCE(pud_val(*pudp))));
} else {
- alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
pgtable_alloc, flags);
+ if (ret)
+ break;
BUG_ON(pud_val(old_pud) != 0 &&
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
@@ -379,14 +400,17 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
} while (pudp++, addr = next, addr != end);
pud_clear_fixmap();
+
+ return ret;
}
-static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
{
unsigned long next;
+ int ret = 0;
pgd_t pgd = READ_ONCE(*pgdp);
p4d_t *p4dp;
@@ -398,6 +422,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
p4d_phys = pgtable_alloc(P4D_SHIFT);
+ if (!p4d_phys)
+ return -ENOMEM;
p4dp = p4d_set_fixmap(p4d_phys);
init_clear_pgtable(p4dp);
p4dp += p4d_index(addr);
@@ -412,8 +438,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
next = p4d_addr_end(addr, end);
- alloc_init_pud(p4dp, addr, next, phys, prot,
+ ret = alloc_init_pud(p4dp, addr, next, phys, prot,
pgtable_alloc, flags);
+ if (ret)
+ break;
BUG_ON(p4d_val(old_p4d) != 0 &&
p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
@@ -422,23 +450,26 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
} while (p4dp++, addr = next, addr != end);
p4d_clear_fixmap();
+
+ return ret;
}
-static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
{
unsigned long addr, end, next;
pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
+ int ret = 0;
/*
* If the virtual and physical address don't have the same offset
* within a page, we cannot map the region as the caller expects.
*/
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
- return;
+ return -EINVAL;
phys &= PAGE_MASK;
addr = virt & PAGE_MASK;
@@ -446,29 +477,38 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
do {
next = pgd_addr_end(addr, end);
- alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
+ ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
flags);
+ if (ret)
+ break;
phys += next - addr;
} while (pgdp++, addr = next, addr != end);
+
+ return ret;
}
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
{
+ int ret;
+
mutex_lock(&fixmap_lock);
- __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
+ ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
pgtable_alloc, flags);
+ BUG_ON(ret);
This function now returns an error, but also BUGs on ret != 0. For this
patch, I'd suggest keeping this function as void.

You mean __create_pgd_mapping(), right?

But I believe there is a pre-existing bug in arch_add_memory(). That's called at
runtime so if __create_pgd_mapping() fails and BUGs, it will take down a running
system.

Yes, it is the current behavior.

With this foundational patch, we can fix that with an additional patch to pass
along the error code instead of BUGing in that case. arch_add_memory() would
need to unwind whatever __create_pgd_mapping() managed to do before the memory
allocation failure (presumably unmapping and freeing any allocated tables). I'm
happy to do this as a follow up patch.

Yes, the allocated page tables need to be freed. Thank you for taking it.
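
For reference, I'd expect the follow-up to end up looking roughly like this
in arch_add_memory() (just a sketch; it reuses __remove_pgd_mapping(), which
the hotplug teardown path already uses, on the assumption it copes with a
partially created mapping):

	ret = __create_pgd_mapping(swapper_pg_dir, start,
				   __phys_to_virt(start), size,
				   params->pgprot, __pgd_pgtable_alloc,
				   flags);
	if (ret) {
		/* unwind whatever was mapped and free the tables */
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
		return ret;
	}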

mutex_unlock(&fixmap_lock);
+
+ return ret;
}
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
extern __alias(__create_pgd_mapping_locked)
-void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
- phys_addr_t size, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags);
+int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags);
create_kpti_ng_temp_pgd() now returns an error instead of BUGing on
allocation failure, but I don't see a change to handle that error. You'll
want to update __kpti_install_ng_mappings() to BUG on error.

Yes, I missed that. It should BUG on error.
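
i.e. something like this at the call site in __kpti_install_ng_mappings()
(a sketch; the argument list is abbreviated from the current tree and
would need checking):

	ret = create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc),
				      KPTI_NG_TEMP_VA, PAGE_SIZE,
				      PAGE_KERNEL, kpti_ng_pgd_alloc, 0);
	BUG_ON(ret);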

#endif
static phys_addr_t __pgd_pgtable_alloc(int shift)
@@ -476,13 +516,17 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
- BUG_ON(!ptr);
+ if (!ptr)
+ return 0;
0 is a valid (though unlikely) physical address. I guess you could
technically encode it like ERR_PTR(), but since you are returning a
phys_addr_t and not a pointer, perhaps it would be clearer to make this
return int and accept a pointer to a phys_addr_t, which it populates on
success?

Actually I did something similar in the first place, but just returned the virt address, then did something if it returned NULL. That made the code a little messier since we needed to convert the virt address to a phys address (because __create_pgd_mapping() and the helpers require a phys address) and change the function definitions.

But I noticed 0 should not be a valid phys address, if I remember correctly. I also noticed early_pgtable_alloc() calls memblock_phys_alloc_range(), which returns 0 on failure; if 0 were a valid phys address, it should not do that, right? And the memblock range 0 - memstart_addr is actually removed from memblock (see arm64_memblock_init()), so IIUC 0 should not be a valid phys address. So the patch ended up as is.

If this assumption doesn't stand, I think your suggestion makes sense.
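
For reference, the out-parameter variant you suggest would look roughly
like this (sketch only, reusing the existing body of the function):

static int __pgd_pgtable_alloc(int shift, phys_addr_t *pa)
{
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);

	if (!ptr)
		return -ENOMEM;

	*pa = __pa(ptr);
	return 0;
}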

+
return __pa(ptr);
}
static phys_addr_t pgd_pgtable_alloc(int shift)
{
phys_addr_t pa = __pgd_pgtable_alloc(shift);
+ if (!pa)
+ goto out;
This would obviously need to be fixed up as per above.

struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
/*
@@ -498,6 +542,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
else if (shift == PMD_SHIFT)
BUG_ON(!pagetable_pmd_ctor(ptdesc));
+out:
return pa;
}
You have left early_pgtable_alloc() to panic() on allocation failure. Given
we can now unwind the stack with an error code, I think it would be more
consistent to also allow early_pgtable_alloc() to return an error.

early_pgtable_alloc() is just used for painting the linear mapping at the early boot stage; if it fails, I don't think unwinding the stack is feasible or worth it. Did I miss something?

Thanks,
Yang

Thanks,
Ryan