[RFC PATCH v1 53/57] arm64: Runtime-fold pmd level

From: Ryan Roberts
Date: Mon Oct 14 2024 - 07:10:42 EST


For a given VA size, the number of levels of lookup depends on the page
size. With boot-time page size selection, we therefore don't know how
many levels of lookup we require until boot time. So we need to
runtime-fold some levels of lookup.

We already have code to runtime-fold the p4d and pud levels; it exists
for the LPA2 fallback paths and can be repurposed for our needs. But
the pmd level also needs to support runtime folding; for example, the
16K/36-bit and 64K/42-bit configs require only 2 levels of lookup.

So let's add the required code. However, note that until we actually add
the boot-time page size config, pgtable_l3_enabled() simply returns the
compile-time determined answer: true when CONFIG_PGTABLE_LEVELS > 2 and
false otherwise.

Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
---

***NOTE***
Any confused maintainers may want to read the cover letter here for context:
https://lore.kernel.org/all/20241014105514.3206191-1-ryan.roberts@xxxxxxx/

arch/arm64/include/asm/pgalloc.h | 16 +++-
arch/arm64/include/asm/pgtable.h | 123 +++++++++++++++++++++++--------
arch/arm64/include/asm/tlb.h | 3 +
arch/arm64/kernel/cpufeature.c | 4 +-
arch/arm64/kvm/mmu.c | 9 +--
arch/arm64/mm/fixmap.c | 2 +-
arch/arm64/mm/hugetlbpage.c | 16 ++--
arch/arm64/mm/init.c | 2 +-
arch/arm64/mm/mmu.c | 2 +-
arch/arm64/mm/ptdump.c | 3 +-
10 files changed, 126 insertions(+), 54 deletions(-)

diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 8ff5f2a2579e4..51cc2f32931d2 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -15,6 +15,7 @@

#define __HAVE_ARCH_PGD_FREE
#define __HAVE_ARCH_PUD_FREE
+#define __HAVE_ARCH_PMD_FREE
#include <asm-generic/pgalloc.h>

#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
@@ -23,7 +24,8 @@

static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
{
- set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot));
+ if (pgtable_l3_enabled())
+ set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot));
}

static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
@@ -33,6 +35,18 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN;
__pud_populate(pudp, __pa(pmdp), pudval);
}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
+ if (!pgtable_l3_enabled())
+ return;
+
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
#else
static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
{
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index fd47f70a42396..8ead41da715b0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -672,15 +672,21 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)

-#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
-static inline bool pud_sect(pud_t pud) { return false; }
-static inline bool pud_table(pud_t pud) { return true; }
-#else
-#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
- PUD_TYPE_SECT)
-#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
- PUD_TYPE_TABLE)
-#endif
+static inline bool pgtable_l3_enabled(void);
+
+static inline bool pud_sect(pud_t pud)
+{
+ if (PAGE_SIZE == SZ_64K || !pgtable_l3_enabled())
+ return false;
+ return (pud_val(pud) & PUD_TYPE_MASK) == PUD_TYPE_SECT;
+}
+
+static inline bool pud_table(pud_t pud)
+{
+ if (PAGE_SIZE == SZ_64K || !pgtable_l3_enabled())
+ return true;
+ return (pud_val(pud) & PUD_TYPE_MASK) == PUD_TYPE_TABLE;
+}

extern pgd_t init_pg_dir[];
extern pgd_t init_pg_end[];
@@ -699,12 +705,10 @@ static inline bool in_swapper_pgdir(void *addr)

static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
-#ifdef __PAGETABLE_PMD_FOLDED
- if (in_swapper_pgdir(pmdp)) {
+ if (!pgtable_l3_enabled() && in_swapper_pgdir(pmdp)) {
set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd)));
return;
}
-#endif /* __PAGETABLE_PMD_FOLDED */

WRITE_ONCE(*pmdp, pmd);

@@ -749,20 +753,27 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)

#if CONFIG_PGTABLE_LEVELS > 2

+static __always_inline bool pgtable_l3_enabled(void)
+{
+ return true;
+}
+
+static inline bool mm_pmd_folded(const struct mm_struct *mm)
+{
+ return !pgtable_l3_enabled();
+}
+#define mm_pmd_folded mm_pmd_folded
+
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))

-#define pud_none(pud) (!pud_val(pud))
-#define pud_bad(pud) (!pud_table(pud))
-#define pud_present(pud) pte_present(pud_pte(pud))
-#ifndef __PAGETABLE_PMD_FOLDED
-#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
-#else
-#define pud_leaf(pud) false
-#endif
-#define pud_valid(pud) pte_valid(pud_pte(pud))
-#define pud_user(pud) pte_user(pud_pte(pud))
-#define pud_user_exec(pud) pte_user_exec(pud_pte(pud))
+#define pud_none(pud) (pgtable_l3_enabled() && !pud_val(pud))
+#define pud_bad(pud) (pgtable_l3_enabled() && !pud_table(pud))
+#define pud_present(pud) (!pgtable_l3_enabled() || pte_present(pud_pte(pud)))
+#define pud_leaf(pud) (pgtable_l3_enabled() && pte_present(pud_pte(pud)) && !pud_table(pud))
+#define pud_valid(pud) (pgtable_l3_enabled() && pte_valid(pud_pte(pud)))
+#define pud_user(pud) (pgtable_l3_enabled() && pte_user(pud_pte(pud)))
+#define pud_user_exec(pud) (pgtable_l3_enabled() && pte_user_exec(pud_pte(pud)))

static inline bool pgtable_l4_enabled(void);

@@ -783,7 +794,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud)

static inline void pud_clear(pud_t *pudp)
{
- set_pud(pudp, __pud(0));
+ if (pgtable_l3_enabled())
+ set_pud(pudp, __pud(0));
}

static inline phys_addr_t pud_page_paddr(pud_t pud)
@@ -791,25 +803,74 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
return __pud_to_phys(pud);
}

+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
+static inline pmd_t *pud_to_folded_pmd(pud_t *pudp, unsigned long addr)
+{
+ return (pmd_t *)pudp;
+}
+
static inline pmd_t *pud_pgtable(pud_t pud)
{
return (pmd_t *)__va(pud_page_paddr(pud));
}

-/* Find an entry in the second-level page table. */
-#define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t))
+static inline phys_addr_t pmd_offset_phys(pud_t *pudp, unsigned long addr)
+{
+ BUG_ON(!pgtable_l3_enabled());
+
+ return pud_page_paddr(READ_ONCE(*pudp)) + pmd_index(addr) * sizeof(pmd_t);
+}
+
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud,
+ unsigned long addr)
+{
+ if (!pgtable_l3_enabled())
+ return pud_to_folded_pmd(pudp, addr);
+ return (pmd_t *)__va(pud_page_paddr(pud)) + pmd_index(addr);
+}
+#define pmd_offset_lockless pmd_offset_lockless

-#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
-#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr))
-#define pmd_clear_fixmap() clear_fixmap(FIX_PMD)
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long addr)
+{
+ return pmd_offset_lockless(pudp, READ_ONCE(*pudp), addr);
+}
+#define pmd_offset pmd_offset

-#define pud_page(pud) phys_to_page(__pud_to_phys(pud))
+static inline pmd_t *pmd_set_fixmap(unsigned long addr)
+{
+ if (!pgtable_l3_enabled())
+ return NULL;
+ return (pmd_t *)set_fixmap_offset(FIX_PMD, addr);
+}
+
+static inline pmd_t *pmd_set_fixmap_offset(pud_t *pudp, unsigned long addr)
+{
+ if (!pgtable_l3_enabled())
+ return pud_to_folded_pmd(pudp, addr);
+ return pmd_set_fixmap(pmd_offset_phys(pudp, addr));
+}
+
+static inline void pmd_clear_fixmap(void)
+{
+ if (pgtable_l3_enabled())
+ clear_fixmap(FIX_PMD);
+}

/* use ONLY for statically allocated translation tables */
-#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
+static inline pmd_t *pmd_offset_kimg(pud_t *pudp, u64 addr)
+{
+ if (!pgtable_l3_enabled())
+ return pud_to_folded_pmd(pudp, addr);
+ return (pmd_t *)__phys_to_kimg(pmd_offset_phys(pudp, addr));
+}
+
+#define pud_page(pud) phys_to_page(__pud_to_phys(pud))

#else

+static inline bool pgtable_l3_enabled(void) { return false; }
+
#define pud_valid(pud) false
#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; })
#define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index a947c6e784ed2..527630f0803c6 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -92,6 +92,9 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
{
struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);

+ if (!pgtable_l3_enabled())
+ return;
+
pagetable_pmd_dtor(ptdesc);
tlb_remove_ptdesc(tlb, ptdesc);
}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e5618423bb99d..663cc76569a27 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1923,8 +1923,10 @@ static int __init __kpti_install_ng_mappings(void *__unused)

if (levels == 5 && !pgtable_l5_enabled())
levels = 4;
- else if (levels == 4 && !pgtable_l4_enabled())
+ if (levels == 4 && !pgtable_l4_enabled())
levels = 3;
+ if (levels == 3 && !pgtable_l3_enabled())
+ levels = 2;

remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 248a2d7ad6dbb..146ecdaaaf647 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1370,12 +1370,11 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)

pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

-#ifndef __PAGETABLE_PMD_FOLDED
- if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
+ if (pgtable_l3_enabled() &&
+ (hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
ALIGN(hva, PUD_SIZE) <= vma->vm_end)
return PUD_SHIFT;
-#endif

if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
@@ -1487,12 +1486,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = get_vma_page_shift(vma, hva);
}

-#ifndef __PAGETABLE_PMD_FOLDED
- if (vma_shift == PUD_SHIFT) {
+ if (pgtable_l3_enabled() && vma_shift == PUD_SHIFT) {
if (!fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
vma_shift = PMD_SHIFT;
}
-#endif
if (vma_shift == CONT_PMD_SHIFT) {
vma_shift = PMD_SHIFT;
}
diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c
index a0dcf2375ccb4..f2c6678046a96 100644
--- a/arch/arm64/mm/fixmap.c
+++ b/arch/arm64/mm/fixmap.c
@@ -87,7 +87,7 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr,
p4d_t p4d = READ_ONCE(*p4dp);
pud_t *pudp;

- if (CONFIG_PGTABLE_LEVELS > 3 && !p4d_none(p4d) &&
+ if (ptg_pgtable_levels > 3 && !p4d_none(p4d) &&
p4d_page_paddr(p4d) != __pa_symbol(bm_pud)) {
/*
* We only end up here if the kernel mapping and the fixmap
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index bc98c20655bba..2add0839179e3 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -51,10 +51,9 @@ void __init arm64_hugetlb_cma_reserve(void)

static bool __hugetlb_valid_size(unsigned long size)
{
-#ifndef __PAGETABLE_PMD_FOLDED
- if (size == PUD_SIZE)
+ if (pgtable_l3_enabled() && size == PUD_SIZE)
return pud_sect_supported();
-#endif
+
if (size == CONT_PMD_SIZE || size == PMD_SIZE || size == CONT_PTE_SIZE)
return true;

@@ -100,13 +99,10 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)

*pgsize = size;

-#ifndef __PAGETABLE_PMD_FOLDED
- if (size == PUD_SIZE) {
+ if (pgtable_l3_enabled() && size == PUD_SIZE) {
if (pud_sect_supported())
contig_ptes = 1;
- } else
-#endif
- if (size == PMD_SIZE) {
+ } else if (size == PMD_SIZE) {
contig_ptes = 1;
} else if (size == CONT_PMD_SIZE) {
*pgsize = PMD_SIZE;
@@ -331,10 +327,8 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
{
unsigned long hp_size = huge_page_size(h);

-#ifndef __PAGETABLE_PMD_FOLDED
- if (hp_size == PUD_SIZE)
+ if (pgtable_l3_enabled() && hp_size == PUD_SIZE)
return PGDIR_SIZE - PUD_SIZE;
-#endif
if (hp_size == CONT_PMD_SIZE)
return PUD_SIZE - CONT_PMD_SIZE;
if (hp_size == PMD_SIZE)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 4d24034418b39..62587104f30d8 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -396,7 +396,7 @@ void __init mem_init(void)
* scratch using the virtual address range and page size.
*/
VM_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
- CONFIG_PGTABLE_LEVELS);
+ ptg_pgtable_levels);

if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
extern int sysctl_overcommit_memory;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ad7fd3fda705a..b78a341cd9e70 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1046,7 +1046,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
} while (addr = next, addr < end);

- if (CONFIG_PGTABLE_LEVELS <= 2)
+ if (!pgtable_l3_enabled())
return;

if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 6986827e0d645..045a4188afc10 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -230,7 +230,8 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,

/* check if the current level has been folded dynamically */
if ((level == 1 && mm_p4d_folded(st->mm)) ||
- (level == 2 && mm_pud_folded(st->mm)))
+ (level == 2 && mm_pud_folded(st->mm)) ||
+ (level == 3 && mm_pmd_folded(st->mm)))
level = 0;

if (level >= 0)
--
2.43.0