Re: [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse

From: Lance Yang

Date: Sun May 31 2026 - 05:40:07 EST

On Fri, May 22, 2026 at 09:00:01AM -0600, Nico Pache wrote:
>Pass an order and offset to collapse_huge_page to support collapsing anon
>memory to arbitrary orders within a PMD. order indicates what mTHP size we
>are attempting to collapse to, and offset indicates were in the PMD to
>start the collapse attempt.
>
>For non-PMD collapse we must leave the anon VMA write locked until after
>we collapse the mTHP-- in the PMD case all the pages are isolated, but in
>the mTHP case this is not true, and we must keep the lock to prevent
>access/changes to the page tables. This can happen if the rmap walkers hit
>a pmd_none while the PMD entry is currently unavailable due to being
>temporarily removed during the collapse phase.
>
>Acked-by: Usama Arif <usama.arif@xxxxxxxxx>
>Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
>---
> mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
> 1 file changed, 55 insertions(+), 38 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index fab35d318641..d64f42f66236 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -1214,34 +1214,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> * while allocating a THP, as that could trigger direct reclaim/compaction.
> * Note that the VMA must be rechecked after grabbing the mmap_lock again.
> */
>-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
>- int referenced, int unmapped, struct collapse_control *cc)
>+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
>+ int referenced, int unmapped, struct collapse_control *cc,
>+ unsigned int order)
> {
>+ const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
>+ const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
> LIST_HEAD(compound_pagelist);
> pmd_t *pmd, _pmd;
>- pte_t *pte;
>+ pte_t *pte = NULL;
> pgtable_t pgtable;
> struct folio *folio;
> spinlock_t *pmd_ptl, *pte_ptl;
> enum scan_result result = SCAN_FAIL;
> struct vm_area_struct *vma;
> struct mmu_notifier_range range;
>+ bool anon_vma_locked = false;
>
>- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>-
>- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>+ result = alloc_charge_folio(&folio, mm, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> mmap_read_lock(mm);
>- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>- HPAGE_PMD_ORDER);
>+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>+ &vma, cc, order);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
> }
>
>- result = find_pmd_or_thp_or_none(mm, address, &pmd);
>+ result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
>@@ -1253,8 +1255,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * released when it fails. So we jump out_nolock directly in
> * that case. Continuing to collapse causes inconsistency.
> */
>- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>- referenced, HPAGE_PMD_ORDER);
>+ result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
>+ referenced, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
>@@ -1269,20 +1271,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * mmap_lock.
> */
> mmap_write_lock(mm);
>- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>- HPAGE_PMD_ORDER);
>+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>+ &vma, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> vma_start_write(vma);
>- result = check_pmd_still_valid(mm, address, pmd);
>+ result = check_pmd_still_valid(mm, pmd_addr, pmd);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
>
> anon_vma_lock_write(vma->anon_vma);
>+ anon_vma_locked = true;
>
>- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
>- address + HPAGE_PMD_SIZE);
>+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
>+ end_addr);
> mmu_notifier_invalidate_range_start(&range);
>
> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
>@@ -1294,26 +1297,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
>- _pmd = pmdp_collapse_flush(vma, address, pmd);
>+ _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> tlb_remove_table_sync_one();
>
>- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>+ pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
> if (pte) {
>- result = __collapse_huge_page_isolate(vma, address, pte, cc,
>- HPAGE_PMD_ORDER,
>- &compound_pagelist);
>+ result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
>+ order, &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_NO_PTE_TABLE;
> }
>
> if (unlikely(result != SCAN_SUCCEED)) {
>- if (pte)
>- pte_unmap(pte);
> spin_lock(pmd_ptl);
>- BUG_ON(!pmd_none(*pmd));
>+ WARN_ON_ONCE(!pmd_none(*pmd));
> /*
> * We can only use set_pmd_at when establishing
> * hugepmds and never for establishing regular pmds that
>@@ -1321,21 +1321,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> */
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
>- anon_vma_unlock_write(vma->anon_vma);
> goto out_up_write;
> }
>
> /*
>- * All pages are isolated and locked so anon_vma rmap
>- * can't run anymore.
>+ * For PMD collapse all pages are isolated and locked so anon_vma
>+ * rmap can't run anymore. For mTHP collapse the PMD entry has been
>+ * removed and not all pages are isolated and locked, so we must hold
>+ * the lock to prevent neighboring folios from attempting to access
>+ * this PMD until its reinstalled.
> */
>- anon_vma_unlock_write(vma->anon_vma);
>+ if (is_pmd_order(order)) {
>+ anon_vma_unlock_write(vma->anon_vma);
>+ anon_vma_locked = false;
>+ }
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>- vma, address, pte_ptl,
>- HPAGE_PMD_ORDER,
>- &compound_pagelist);
>- pte_unmap(pte);
>+ vma, start_addr, pte_ptl,
>+ order, &compound_pagelist);
> if (unlikely(result != SCAN_SUCCEED))
> goto out_up_write;
>
>@@ -1345,18 +1348,32 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * write.
> */
> __folio_mark_uptodate(folio);
>- pgtable = pmd_pgtable(_pmd);
>-
> spin_lock(pmd_ptl);
>- BUG_ON(!pmd_none(*pmd));
>- pgtable_trans_huge_deposit(mm, pmd, pgtable);
>- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
>+ WARN_ON_ONCE(!pmd_none(*pmd));
>+ if (is_pmd_order(order)) {
>+ pgtable = pmd_pgtable(_pmd);
>+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
>+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
>+ } else {
>+ /*
>+ * set_ptes is called in map_anon_folio_pte_nopf with the
>+ * pmd_ptl lock still held; this is safe as the PMD is expected
>+ * to be none. The pmd entry is then repopulated below.
>+ */
>+ map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);

Emm ... is it safe to use map_anon_folio_pte_nopf() here?

At this point pmdp_collapse_flush() has cleared the PMD from the page
tables. The PTE table we are updating is only reachable through the saved
old PMD value, _pmd, until pmd_populate() below.

map_anon_folio_pte_nopf() does set_ptes() and then calls
update_mmu_cache_range(). Documentation/core-api/cachetlb.rst describes
that hook as:

"
At the end of every page fault, this routine is invoked to tell
the architecture specific code that translations now exists
in the software page tables for address space "vma->vm_mm"
at virtual address "address" for "nr" consecutive pages.
"

But that does not seem true here yet, since the PTE table is not
reachable from vma->vm_mm when update_mmu_cache_range() is called.

Should we avoid calling update_mmu_cache_range() until after the PTE
table is reinstalled with pmd_populate()?

Cheers, Lance

>+ smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>+ }
> spin_unlock(pmd_ptl);
>
> folio = NULL;
>
> result = SCAN_SUCCEED;
> out_up_write:
>+ if (anon_vma_locked)
>+ anon_vma_unlock_write(vma->anon_vma);
>+ if (pte)
>+ pte_unmap(pte);
> mmap_write_unlock(mm);
> out_nolock:
> if (folio)
>@@ -1536,7 +1553,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> /* collapse_huge_page expects the lock to be dropped before calling */
> mmap_read_unlock(mm);
> result = collapse_huge_page(mm, start_addr, referenced,
>- unmapped, cc);
>+ unmapped, cc, HPAGE_PMD_ORDER);
> /* collapse_huge_page will return with the mmap_lock released */
> *lock_dropped = true;
> }
>--
>2.54.0
>
>