Re: [PATCH v3 4/6] mm/vmalloc: Extend page table walk to support larger page_shift sizes and eliminate page table rewalk
From: Wen Jiang
Date: Mon Jun 08 2026 - 02:26:44 EST
On Fri, 5 Jun 2026 at 14:02, Dev Jain <dev.jain@xxxxxxx> wrote:
>
>
>
> On 28/05/26 9:09 am, Wen Jiang wrote:
> > On Wed, 27 May 2026 at 13:59, Dev Jain <dev.jain@xxxxxxx> wrote:
> >>
> >>
> >>
> >> On 22/05/26 11:01 am, Wen Jiang wrote:
> >>> From: "Barry Song (Xiaomi)" <baohua@xxxxxxxxxx>
> >>>
> >>> vmap_pages_range_noflush_walk() (formerly vmap_small_pages_range_noflush())
> >>> provides a clean interface by taking struct page **pages and mapping them
> >>> via direct PTE iteration. This avoids the page table rewalk seen when
> >>> using vmap_range_noflush() for page_shift values other than PAGE_SHIFT.
> >>>
> >>> Extend it to support larger page_shift values, and add PMD- and
> >>> contiguous-PTE mappings as well. Rename it to vmap_pages_range_noflush_walk()
> >>> since it now handles more than just small pages.
> >>>
> >>> For vmalloc() allocations with VM_ALLOW_HUGE_VMAP, we no longer need to
> >>> iterate over pages one by one via vmap_range_noflush(), which would
> >>> otherwise lead to page table rewalk. The code is now unified with the
> >>> PAGE_SHIFT case by simply calling vmap_pages_range_noflush_walk().
> >>>
> >>> Signed-off-by: Barry Song (Xiaomi) <baohua@xxxxxxxxxx>
> >>> Signed-off-by: Wen Jiang <jiangwen6@xxxxxxxxxx>
> >>> Tested-by: Xueyuan Chen <xueyuan.chen21@xxxxxxxxx>
> >>> ---
> >>> mm/vmalloc.c | 71 +++++++++++++++++++++++++++++-----------------------
> >>> 1 file changed, 40 insertions(+), 31 deletions(-)
> >>>
> >>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> >>> index 53fd4ee460ea4..deb764abc0571 100644
> >>> --- a/mm/vmalloc.c
> >>> +++ b/mm/vmalloc.c
> >>> @@ -543,8 +543,10 @@ void vunmap_range(unsigned long addr, unsigned long end)
> >>>
> >>> static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
> >>> unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> >>> - pgtbl_mod_mask *mask)
> >>> + pgtbl_mod_mask *mask, unsigned int shift)
> >>> {
> >>> + unsigned long pfn, size;
> >>> + unsigned int steps;
> >>> int err = 0;
> >>> pte_t *pte;
> >>>
> >>> @@ -575,9 +577,10 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
> >>> break;
> >>> }
> >>>
> >>> - set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
> >>> - (*nr)++;
> >>> - } while (pte++, addr += PAGE_SIZE, addr != end);
> >>> + pfn = page_to_pfn(page);
> >>> + size = vmap_set_ptes(pte, addr, end, pfn, prot, shift);
> >>> + steps = PFN_DOWN(size);
> >>> + } while (pte += steps, *nr += steps, addr += size, addr != end);
> >>>
> >>> lazy_mmu_mode_disable();
> >>> *mask |= PGTBL_PTE_MODIFIED;
> >>> @@ -587,7 +590,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
> >>>
> >>> static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
> >>> unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> >>> - pgtbl_mod_mask *mask)
> >>> + pgtbl_mod_mask *mask, unsigned int shift)
> >>> {
> >>> pmd_t *pmd;
> >>> unsigned long next;
> >>> @@ -597,7 +600,27 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
> >>> return -ENOMEM;
> >>> do {
> >>> next = pmd_addr_end(addr, end);
> >>> - if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
> >>> +
> >>> + if (shift == PMD_SHIFT) {
> >>> + struct page *page = pages[*nr];
> >>> + phys_addr_t phys_addr;
> >>> +
> >>> + if (WARN_ON(!page))
> >>> + return -ENOMEM;
> >>> + if (WARN_ON(!pfn_valid(page_to_pfn(page))))
> >>> + return -EINVAL;
> >>
> >>
> >> So I know these !page and !pfn_valid checks have been copied from vmap_pages_pte_range,
> >> but do they mean anything?
> >>
> >> I think pfn_valid() makes sense in that someone may take a random VA/PA, convert it into a struct
> >> page and pass to vmap layer. But I don't see how anyone would pass page == NULL? At the
> >> very least, returning ENOMEM does not make sense because the pages are not being
> >> allocated by vmap() but have already been allocated.
> >
> > Hi Dev,
> >
> > vmap() is EXPORT_SYMBOL with many callers across drivers, each
> > constructing the pages array differently. The !page check guards
> > against malformed arrays at this API boundary.
> >
> > The same -ENOMEM issue also exists in vmap_pages_pte_range().
> > Should I fix both in this patchset or leave it as a separate cleanup?
> >
> >>
> >>> +
> >>> + phys_addr = page_to_phys(page);
> >>> +
> >>> + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
> >>> + shift)) {
> >>> + *mask |= PGTBL_PMD_MODIFIED;
> >>> + *nr += 1 << (shift - PAGE_SHIFT);
> >>> + continue;
> >>> + }
> >>> + }
> >>> +
> >>> + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask, shift))
> >>> return -ENOMEM;
> >>> } while (pmd++, addr = next, addr != end);
> >>> return 0;
> >>> @@ -605,7 +628,7 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
> >>>
> >>> static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
> >>> unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> >>> - pgtbl_mod_mask *mask)
> >>> + pgtbl_mod_mask *mask, unsigned int shift)
> >>> {
> >>> pud_t *pud;
> >>> unsigned long next;
> >>> @@ -615,7 +638,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
> >>> return -ENOMEM;
> >>> do {
> >>> next = pud_addr_end(addr, end);
> >>> - if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
> >>> + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask, shift))
> >>> return -ENOMEM;
> >>> } while (pud++, addr = next, addr != end);
> >>> return 0;
> >>> @@ -623,7 +646,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
> >>>
> >>> static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
> >>> unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> >>> - pgtbl_mod_mask *mask)
> >>> + pgtbl_mod_mask *mask, unsigned int shift)
> >>> {
> >>> p4d_t *p4d;
> >>> unsigned long next;
> >>> @@ -633,14 +656,14 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
> >>> return -ENOMEM;
> >>> do {
> >>> next = p4d_addr_end(addr, end);
> >>> - if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
> >>> + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask, shift))
> >>> return -ENOMEM;
> >>> } while (p4d++, addr = next, addr != end);
> >>> return 0;
> >>> }
> >>>
> >>> -static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> >>> - pgprot_t prot, struct page **pages)
> >>> +static int vmap_pages_range_noflush_walk(unsigned long addr, unsigned long end,
> >>> + pgprot_t prot, struct page **pages, unsigned int shift)
> >>> {
> >>> unsigned long start = addr;
> >>> pgd_t *pgd;
> >>> @@ -655,7 +678,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> >>> next = pgd_addr_end(addr, end);
> >>> if (pgd_bad(*pgd))
> >>> mask |= PGTBL_PGD_MODIFIED;
> >>> - err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
> >>> + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask, shift);
> >>> if (err)
> >>> break;
> >>> } while (pgd++, addr = next, addr != end);
> >>> @@ -678,27 +701,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> >>> int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
> >>> pgprot_t prot, struct page **pages, unsigned int page_shift)
> >>> {
> >>> - unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
> >>> -
> >>> WARN_ON(page_shift < PAGE_SHIFT);
> >>>
> >>> - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> >>> - page_shift == PAGE_SHIFT)
> >>> - return vmap_small_pages_range_noflush(addr, end, prot, pages);
> >>> + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC))
> >>> + page_shift = PAGE_SHIFT;
> >>>
> >>> - for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> >>> - int err;
> >>> -
> >>> - err = vmap_range_noflush(addr, addr + (1UL << page_shift),
> >>> - page_to_phys(pages[i]), prot,
> >>> - page_shift);
> >>> - if (err)
> >>> - return err;
> >>> -
> >>> - addr += 1UL << page_shift;
> >>> - }
> >>> -
> >>> - return 0;
> >>> + return vmap_pages_range_noflush_walk(addr, end, prot, pages,
> >>> + min(page_shift, PMD_SHIFT));
> >>
> >>
> >> We can easily extend to PUD huge mappings right? Not sure whether we
> >> should keep everything symmetric to how vmap_range_noflush() operates
> >> right now, since P4D mappings don't exist, but PUD looks worthwhile.
> >>
> >
> > PUD mapping requires 1GB of contiguous physical memory, but the buddy
> > allocator's MAX_PAGE_ORDER is 10 (4MB on 4K pages). So page_shift
> > passed to vmap_pages_range_noflush_walk() never exceeds PMD_SHIFT.
>
> Can we then just drop the min()? You can guard the vmap_try_huge_pmd() call
> with shift >= PMD_SHIFT - the walker has the necessary ingredients to work
> with a shift > PMD_SHIFT, so let us not confuse readers with this min() truncation.
>
Will drop the min() here.
Thanks.
> >
> > Thanks,
> > Wen
> >>> }
> >>>
> >>> int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
> >>
>