Re: [PATCH 3/4] arm64: mm: support large block mapping when rodata=full

From: Dev Jain
Date: Tue Jul 29 2025 - 08:35:20 EST



On 25/07/25 3:41 am, Yang Shi wrote:
[----- snip -----]
#ifdef CONFIG_ARM64_PAN
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3d5fb37424ab..f63b39613571 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -480,6 +480,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
int flags);
#endif
+#define INVALID_PHYS_ADDR -1
+
static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
enum pgtable_type pgtable_type)
{
@@ -487,7 +489,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
phys_addr_t pa;
- BUG_ON(!ptdesc);
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
pa = page_to_phys(ptdesc_page(ptdesc));
switch (pgtable_type) {
@@ -509,15 +513,29 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
}
static phys_addr_t __maybe_unused
-pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+split_pgtable_alloc(enum pgtable_type pgtable_type)
{
return __pgd_pgtable_alloc(&init_mm, pgtable_type);
}
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+{
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(&init_mm, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
+}
+
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(NULL, pgtable_type);
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(NULL, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
}
/*
@@ -552,6 +570,254 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
pgd_pgtable_alloc_special_mm, flags);
}
+static DEFINE_MUTEX(pgtable_split_lock);

Thanks for taking a separate lock.

+
+static int split_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end)
+{
+ pte_t *ptep;
+ unsigned long next;
+ unsigned int nr;
+ unsigned long span;
+
+ ptep = pte_offset_kernel(pmdp, addr);
+
+ do {
+ pte_t *_ptep;
+
+ nr = 0;
+ next = pte_cont_addr_end(addr, end);
+ if (next < end)
+ nr = max(nr, ((end - next) / CONT_PTE_SIZE));
+ span = nr * CONT_PTE_SIZE;
+
+ _ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+ ptep += pte_index(next) - pte_index(addr) + nr * CONT_PTES;
+
+ if (((addr | next) & ~CONT_PTE_MASK) == 0)
+ continue;
+
+ if (!pte_cont(__ptep_get(_ptep)))
+ continue;
+
+ for (int i = 0; i < CONT_PTES; i++, _ptep++)
+ __set_pte(_ptep, pte_mknoncont(__ptep_get(_ptep)));
+ } while (addr = next + span, addr != end);
+
+ return 0;
+}
+
+static int split_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ unsigned int nr;
+ unsigned long span;
+ int ret = 0;
+
+ do {
+ pmd_t pmd;
+
+ nr = 1;
+ next = pmd_addr_end(addr, end);
+ if (next < end)
+ nr = max(nr, ((end - next) / PMD_SIZE));
+ span = (nr - 1) * PMD_SIZE;
+
+ if (((addr | next) & ~PMD_MASK) == 0)
+ continue;
+
+ pmd = pmdp_get(pmdp);
+ if (pmd_leaf(pmd)) {
+ phys_addr_t pte_phys;
+ pte_t *ptep;
+ pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN |
+ PMD_TABLE_AF;
+ unsigned long pfn = pmd_pfn(pmd);
+ pgprot_t prot = pmd_pgprot(pmd);
+
+ pte_phys = split_pgtable_alloc(TABLE_PTE);
+ if (pte_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ pmdval |= PMD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) |
+ PTE_TYPE_PAGE);
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+ ptep = (pte_t *)phys_to_virt(pte_phys);
+ for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+ __set_pte(ptep, pfn_pte(pfn, prot));
+
+ dsb(ishst);
+
+ __pmd_populate(pmdp, pte_phys, pmdval);
+ }
+
+ ret = split_cont_pte(pmdp, addr, next);
+ if (ret)
+ break;
+ } while (pmdp += nr, addr = next + span, addr != end);
+
+ return ret;
+}
+
+static int split_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end)
+{
+ pmd_t *pmdp;
+ unsigned long next;
+ unsigned int nr;
+ unsigned long span;
+ int ret = 0;
+
+ pmdp = pmd_offset(pudp, addr);
+
+ do {
+ pmd_t *_pmdp;
+
+ nr = 0;
+ next = pmd_cont_addr_end(addr, end);
+ if (next < end)
+ nr = max(nr, ((end - next) / CONT_PMD_SIZE));
+ span = nr * CONT_PMD_SIZE;
+
+ if (((addr | next) & ~CONT_PMD_MASK) == 0) {
+ pmdp += pmd_index(next) - pmd_index(addr) +
+ nr * CONT_PMDS;
+ continue;
+ }
+
+ _pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+ if (!pmd_cont(pmdp_get(_pmdp)))
+ goto split;
+
+ for (int i = 0; i < CONT_PMDS; i++, _pmdp++)
+ set_pmd(_pmdp, pmd_mknoncont(pmdp_get(_pmdp)));
+
+split:
+ ret = split_pmd(pmdp, addr, next);
+ if (ret)
+ break;
+
+ pmdp += pmd_index(next) - pmd_index(addr) + nr * CONT_PMDS;
+ } while (addr = next + span, addr != end);
+
+ return ret;
+}
+
+static int split_pud(p4d_t *p4dp, unsigned long addr, unsigned long end)
+{
+ pud_t *pudp;
+ unsigned long next;
+ unsigned int nr;
+ unsigned long span;
+ int ret = 0;
+
+ pudp = pud_offset(p4dp, addr);
+
+ do {
+ pud_t pud;
+
+ nr = 1;
+ next = pud_addr_end(addr, end);
+ if (next < end)
+ nr = max(nr, ((end - next) / PUD_SIZE));
+ span = (nr - 1) * PUD_SIZE;
+
+ if (((addr | next) & ~PUD_MASK) == 0)
+ continue;
+
+ pud = pudp_get(pudp);
+ if (pud_leaf(pud)) {
+ phys_addr_t pmd_phys;
+ pmd_t *pmdp;
+ pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN |
+ PUD_TABLE_AF;
+ unsigned long pfn = pud_pfn(pud);
+ pgprot_t prot = pud_pgprot(pud);
+ unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+
+ pmd_phys = split_pgtable_alloc(TABLE_PMD);
+ if (pmd_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ pudval |= PUD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
+ PMD_TYPE_SECT);
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+ pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+ for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+ set_pmd(pmdp, pfn_pmd(pfn, prot));
+ pfn += step;
+ }
+
+ dsb(ishst);
+
+ __pud_populate(pudp, pmd_phys, pudval);
+ }
+
+ ret = split_cont_pmd(pudp, addr, next);
+ if (ret)
+ break;
+ } while (pudp += nr, addr = next + span, addr != end);
+
+ return ret;
+}
+
+static int split_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end)
+{
+ p4d_t *p4dp;
+ unsigned long next;
+ int ret = 0;
+
+ p4dp = p4d_offset(pgdp, addr);
+
+ do {
+ next = p4d_addr_end(addr, end);
+
+ ret = split_pud(p4dp, addr, next);
+ if (ret)
+ break;
+ } while (p4dp++, addr = next, addr != end);
+
+ return ret;
+}
+
+static int split_pgd(pgd_t *pgdp, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int ret = 0;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ ret = split_p4d(pgdp, addr, next);
+ if (ret)
+ break;
+ } while (pgdp++, addr = next, addr != end);
+
+ return ret;
+}
+
+int split_kernel_pgtable_mapping(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ if (!system_supports_bbml2_noabort())
+ return 0;
+
+ if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+ return -EINVAL;
+
+ mutex_lock(&pgtable_split_lock);
+ arch_enter_lazy_mmu_mode();
+ ret = split_pgd(pgd_offset_k(start), start, end);
+ arch_leave_lazy_mmu_mode();
+ mutex_unlock(&pgtable_split_lock);
+
+ return ret;
+}
+
/*

--- snip ---

I'm afraid I'll have to agree with Ryan :) Looking at the signature of split_kernel_pgtable_mapping,
one would expect this function to split all block mappings in the region, but that is just a nit.
What does not seem right to me is that we iterate over the entire range when we know *exactly* where
the split has to happen, just to save on pgd/p4d/pud loads - a saving which is probably cancelled
out by the unnecessary iterations your approach needs to skip over the intermediate blocks.

If we are concerned that most change_memory_common() operations are for a single page, then
we can do something like:

unsigned long size = end - start;
bool start_split = false, end_split = false;

if (start not aligned to block mapping)
	start_split = split(start);

/*
 * Split the end only if the start wasn't split, or
 * if it cannot be guaranteed that start and end lie
 * on the same contig block.
 */
if (!start_split || (size > CONT_PTE_SIZE))
	end_split = split(end);
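
To make that concrete, here is a rough sketch of the shape I have in mind. split_mapping_at() is a
hypothetical helper (walk to the leaf entry covering the address and split just that block/contig
range), and the alignment test against CONT_PTE_MASK is simplified - the real code would need to
check which level the address is actually mapped at:

/*
 * Rough sketch only. split_mapping_at() is a hypothetical helper that
 * walks to the leaf entry covering @addr and splits just that block or
 * contig range; it is not part of this patch.
 */
static int split_mapping_at(unsigned long addr);

int split_kernel_pgtable_mapping(unsigned long start, unsigned long end)
{
	unsigned long size = end - start;
	bool start_split = false;
	int ret = 0;

	if (!system_supports_bbml2_noabort())
		return 0;

	if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
		return -EINVAL;

	mutex_lock(&pgtable_split_lock);
	arch_enter_lazy_mmu_mode();

	/*
	 * Split at the start boundary only if it is not already aligned
	 * (simplified check; the real code would look at the actual
	 * mapping level).
	 */
	if (start & ~CONT_PTE_MASK) {
		ret = split_mapping_at(start);
		if (!ret)
			start_split = true;
	}

	/*
	 * Split at the end boundary only if the start wasn't split, or if
	 * start and end cannot be guaranteed to lie in the same contig block.
	 */
	if (!ret && (!start_split || size > CONT_PTE_SIZE))
		ret = split_mapping_at(end);

	arch_leave_lazy_mmu_mode();
	mutex_unlock(&pgtable_split_lock);

	return ret;
}

That way the common single-page change_memory_common() case does at most two walks, and each walk
only touches the entries covering the boundary it is splitting.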