Re: [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse

From: David Hildenbrand (Arm)

Date: Sun May 31 2026 - 16:04:53 EST

On 5/31/26 11:39, Lance Yang wrote:
>
> On Fri, May 22, 2026 at 09:00:01AM -0600, Nico Pache wrote:
>> Pass an order and offset to collapse_huge_page to support collapsing anon
>> memory to arbitrary orders within a PMD. order indicates what mTHP size we
>> are attempting to collapse to, and offset indicates where in the PMD to
>> start the collapse attempt.
>>
>> For non-PMD collapse we must leave the anon VMA write locked until after
>> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
>> the mTHP case this is not true, and we must keep the lock to prevent
>> access/changes to the page tables. This can happen if the rmap walkers hit
>> a pmd_none while the PMD entry is currently unavailable due to being
>> temporarily removed during the collapse phase.
>>
>> Acked-by: Usama Arif <usama.arif@xxxxxxxxx>
>> Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
>> ---
>> mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
>> 1 file changed, 55 insertions(+), 38 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index fab35d318641..d64f42f66236 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1214,34 +1214,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>> * while allocating a THP, as that could trigger direct reclaim/compaction.
>> * Note that the VMA must be rechecked after grabbing the mmap_lock again.
>> */
>> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> - int referenced, int unmapped, struct collapse_control *cc)
>> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
>> + int referenced, int unmapped, struct collapse_control *cc,
>> + unsigned int order)
>> {
>> + const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
>> + const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>> LIST_HEAD(compound_pagelist);
>> pmd_t *pmd, _pmd;
>> - pte_t *pte;
>> + pte_t *pte = NULL;
>> pgtable_t pgtable;
>> struct folio *folio;
>> spinlock_t *pmd_ptl, *pte_ptl;
>> enum scan_result result = SCAN_FAIL;
>> struct vm_area_struct *vma;
>> struct mmu_notifier_range range;
>> + bool anon_vma_locked = false;
>>
>> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>> + result = alloc_charge_folio(&folio, mm, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>>
>> mmap_read_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> }
>>
>> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> + result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> @@ -1253,8 +1255,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * released when it fails. So we jump out_nolock directly in
>> * that case. Continuing to collapse causes inconsistency.
>> */
>> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> - referenced, HPAGE_PMD_ORDER);
>> + result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
>> + referenced, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>> }
>> @@ -1269,20 +1271,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * mmap_lock.
>> */
>> mmap_write_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>> /* check if the pmd is still valid */
>> vma_start_write(vma);
>> - result = check_pmd_still_valid(mm, address, pmd);
>> + result = check_pmd_still_valid(mm, pmd_addr, pmd);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>>
>> anon_vma_lock_write(vma->anon_vma);
>> + anon_vma_locked = true;
>>
>> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
>> - address + HPAGE_PMD_SIZE);
>> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
>> + end_addr);
>> mmu_notifier_invalidate_range_start(&range);
>>
>> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
>> @@ -1294,26 +1297,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * Parallel GUP-fast is fine since GUP-fast will back off when
>> * it detects PMD is changed.
>> */
>> - _pmd = pmdp_collapse_flush(vma, address, pmd);
>> + _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
>> spin_unlock(pmd_ptl);
>> mmu_notifier_invalidate_range_end(&range);
>> tlb_remove_table_sync_one();
>>
>> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>> + pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>> if (pte) {
>> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> + result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
>> + order, &compound_pagelist);
>> spin_unlock(pte_ptl);
>> } else {
>> result = SCAN_NO_PTE_TABLE;
>> }
>>
>> if (unlikely(result != SCAN_SUCCEED)) {
>> - if (pte)
>> - pte_unmap(pte);
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>> /*
>> * We can only use set_pmd_at when establishing
>> * hugepmds and never for establishing regular pmds that
>> @@ -1321,21 +1321,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> */
>> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>> spin_unlock(pmd_ptl);
>> - anon_vma_unlock_write(vma->anon_vma);
>> goto out_up_write;
>> }
>>
>> /*
>> - * All pages are isolated and locked so anon_vma rmap
>> - * can't run anymore.
>> + * For PMD collapse all pages are isolated and locked so anon_vma
>> + * rmap can't run anymore. For mTHP collapse the PMD entry has been
>> + * removed and not all pages are isolated and locked, so we must hold
>> + * the lock to prevent neighboring folios from attempting to access
>> + * this PMD until its reinstalled.
>> */
>> - anon_vma_unlock_write(vma->anon_vma);
>> + if (is_pmd_order(order)) {
>> + anon_vma_unlock_write(vma->anon_vma);
>> + anon_vma_locked = false;
>> + }
>>
>> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>> - vma, address, pte_ptl,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> - pte_unmap(pte);
>> + vma, start_addr, pte_ptl,
>> + order, &compound_pagelist);
>> if (unlikely(result != SCAN_SUCCEED))
>> goto out_up_write;
>>
>> @@ -1345,18 +1348,32 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * write.
>> */
>> __folio_mark_uptodate(folio);
>> - pgtable = pmd_pgtable(_pmd);
>> -
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>> + if (is_pmd_order(order)) {
>> + pgtable = pmd_pgtable(_pmd);
>> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
>> + } else {
>> + /*
>> + * set_ptes is called in map_anon_folio_pte_nopf with the
>> + * pmd_ptl lock still held; this is safe as the PMD is expected
>> + * to be none. The pmd entry is then repopulated below.
>> + */
>> + map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
>
> Emm ... is it safe to use map_anon_folio_pte_nopf() here?
>
> At this point pmdp_collapse_flush() has cleared the PMD from the page
> tables. The PTE table we are updating is only reachable through the saved
> old PMD value, _pmd, until pmd_populate() below.
>
> map_anon_folio_pte_nopf() does set_ptes() and then calls
> update_mmu_cache_range(). Documentation/core-api/cachetlb.rst describes
> that hook as:
>
> "
> At the end of every page fault, this routine is invoked to tell
> the architecture specific code that translations now exists
> in the software page tables for address space "vma->vm_mm"
> at virtual address "address" for "nr" consecutive pages.
> "
>
> But that does not seem true here yet, since the PTE table is not
> reachable from vma->vm_mm when update_mmu_cache_range() is called.
>
> Should we avoid calling update_mmu_cache_range() until after the PTE
> table is reinstalled with pmd_populate()?

I recall that update_mmu_cache* users mostly care about updating folio flags,
for the folio derived from the PTE ... or flushing caches for the user address.

So intuitively I would say "the architecture code doesn't care that the PMD
table will only be visible to HW shortly after". The important thing should be
that it will definitely happen, and that nothing else is currently there or can be
there?

--
Cheers,

David