Re: [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse

From: Ryan Roberts
Date: Tue Dec 17 2024 - 14:24:37 EST


On 16/12/2024 16:51, Dev Jain wrote:
> Abstract away taking the mmap_lock exclusively, copying page contents, and
> setting the PMD, into vma_collapse_anon_folio_pmd().
>
> Signed-off-by: Dev Jain <dev.jain@xxxxxxx>
> ---
> mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
> 1 file changed, 66 insertions(+), 53 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 078794aa3335..88beebef773e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> return SCAN_SUCCEED;
> }
>
> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped, int order,
> - struct collapse_control *cc)
> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
> + struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
> + struct folio *folio)
> {
> + struct mmu_notifier_range range;
> + spinlock_t *pmd_ptl, *pte_ptl;
> LIST_HEAD(compound_pagelist);
> - pmd_t *pmd, _pmd;
> - pte_t *pte;
> pgtable_t pgtable;
> - struct folio *folio;
> - spinlock_t *pmd_ptl, *pte_ptl;
> - int result = SCAN_FAIL;
> - struct vm_area_struct *vma;
> - struct mmu_notifier_range range;
> -
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> - /*
> - * Before allocating the hugepage, release the mmap_lock read lock.
> - * The allocation can take potentially a long time if it involves
> - * sync compaction, and we do not need to hold the mmap_lock during
> - * that. We will recheck the vma after taking it again in write mode.
> - */
> - mmap_read_unlock(mm);
> -
> - result = alloc_charge_folio(&folio, mm, order, cc);
> - if (result != SCAN_SUCCEED)
> - goto out_nolock;
> -
> - mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> - if (result != SCAN_SUCCEED) {
> - mmap_read_unlock(mm);
> - goto out_nolock;
> - }
> -
> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> - if (result != SCAN_SUCCEED) {
> - mmap_read_unlock(mm);
> - goto out_nolock;
> - }
> -
> - if (unmapped) {
> - /*
> - * __collapse_huge_page_swapin will return with mmap_lock
> - * released when it fails. So we jump out_nolock directly in
> - * that case. Continuing to collapse causes inconsistency.
> - */
> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced, order);
> - if (result != SCAN_SUCCEED)
> - goto out_nolock;
> - }
> + int result;
> + pmd_t _pmd;
> + pte_t *pte;
>
> mmap_read_unlock(mm);
> /*
> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> +
> + result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - &compound_pagelist, order);
> + &compound_pagelist, HPAGE_PMD_ORDER);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_PMD_NULL;
> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> deferred_split_folio(folio, false);
> spin_unlock(pmd_ptl);
>
> - folio = NULL;
> -
> result = SCAN_SUCCEED;
> out_up_write:
> mmap_write_unlock(mm);
> + return result;
> +}
> +
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> + int referenced, int unmapped, int order,
> + struct collapse_control *cc)
> +{
> + struct vm_area_struct *vma;
> + int result = SCAN_FAIL;
> + struct folio *folio;
> + pmd_t *pmd;
> +
> + /*
> + * Before allocating the hugepage, release the mmap_lock read lock.
> + * The allocation can take potentially a long time if it involves
> + * sync compaction, and we do not need to hold the mmap_lock during
> + * that. We will recheck the vma after taking it again in write mode.
> + */
> + mmap_read_unlock(mm);
> +
> + result = alloc_charge_folio(&folio, mm, order, cc);
> + if (result != SCAN_SUCCEED)
> + goto out_nolock;
> +
> + mmap_read_lock(mm);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> + if (result != SCAN_SUCCEED) {
> + mmap_read_unlock(mm);
> + goto out_nolock;
> + }
> +
> + result = find_pmd_or_thp_or_none(mm, address, &pmd);
> + if (result != SCAN_SUCCEED) {
> + mmap_read_unlock(mm);
> + goto out_nolock;
> + }
> +
> + if (unmapped) {
> + /*
> + * __collapse_huge_page_swapin will return with mmap_lock
> + * released when it fails. So we jump out_nolock directly in
> + * that case. Continuing to collapse causes inconsistency.
> + */
> + result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> + referenced, order);
> + if (result != SCAN_SUCCEED)
> + goto out_nolock;
> + }
> +
> + if (order == HPAGE_PMD_ORDER)
> + result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);

I think the locking is broken here? collapse_huge_page() used to enter with the
mmamp read lock and exit without the lock held at all. After the change, this is
only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
read lock still held. Perhaps:

else
mmap_read_unlock(mm);

> +
> + if (result == SCAN_SUCCEED)
> + folio = NULL;
> +
> out_nolock:
> if (folio)
> folio_put(folio);