[PATCH 10/12] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()

From: Hugh Dickins
Date: Mon May 29 2023 - 02:27:37 EST


Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
It does need mmap_read_lock(), but it does not need mmap_write_lock(),
nor vma_start_write() nor i_mmap lock nor anon_vma lock. All racing
paths rely on pte_offset_map_lock() and pmd_lock(), so use those.
Follow the pattern in retract_page_tables(): using pte_free_defer()
removes the need for tlb_remove_table_sync_one() here.
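
In outline, condensed from the diff below, the sequence over the old page
table becomes (all names as used in this patch):

	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	/* check, then clear, the PTEs under ptl */
	pte_unmap_unlock(start_pte, ptl);

	pml = pmd_lock(mm, pmd);
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);

	pte_free_defer(mm, pmd_pgtable(pgt_pmd));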

Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been
acquired and the page looks suitable: from then on its state is stable.
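
As in the diff below, that re-check once the page is locked is simply:

	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	...
	}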

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries. i_mmap lock did
stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual. Their TLB
flush can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.
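
Condensed from the diff below: the notifier range is opened before step 2,
each mapped entry is cleared while still under the PTE lock, and the TLB
flush is left to the later pmdp_collapse_flush():

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	...
	/* Must clear entry, or a racing truncate may re-remove it */
	pte_clear(mm, addr, pte);
	page_remove_rmap(hpage + i, vma, false);
	...
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	...
	mmu_notifier_invalidate_range_end(&range);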

Some cleanup while rearranging: rename "count" to "nr_ptes";
and "step 2" does not need to duplicate the checks in "step 1".

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
mm/khugepaged.c | 131 +++++++++++++++---------------------------------
1 file changed, 41 insertions(+), 90 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4fd408154692..2999500abdd5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
return ret;
}

-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
{
@@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
};

VM_BUG_ON(!PageTransHuge(hpage));
- mmap_assert_write_locked(vma->vm_mm);
+ mmap_assert_locked(vma->vm_mm);

if (do_set_pmd(&vmf, hpage))
return SCAN_FAIL;
@@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}

-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- * - modifying terminal entries (ones that point to a data page, not to another
- * page table)
- * - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t pmd;
- struct mmu_notifier_range range;
-
- mmap_assert_write_locked(mm);
- if (vma->vm_file)
- lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
- /*
- * All anon_vmas attached to the VMA have the same root and are
- * therefore locked by the same lock.
- */
- if (vma->anon_vma)
- lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
- pmd = pmdp_collapse_flush(vma, addr, pmdp);
- tlb_remove_table_sync_one();
- mmu_notifier_invalidate_range_end(&range);
- mm_dec_nr_ptes(mm);
- page_table_check_pte_clear_range(mm, addr, pmd);
- pte_free(mm, pmd_pgtable(pmd));
-}
-
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@@ -1563,16 +1521,17 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ struct mmu_notifier_range range;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd;
- spinlock_t *ptl;
- int count = 0, result = SCAN_FAIL;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml, *ptl;
+ int nr_ptes = 0, result = SCAN_FAIL;
int i;

- mmap_assert_write_locked(mm);
+ mmap_assert_locked(mm);

/* Fast check before locking page if already PMD-mapped */
result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
@@ -1612,6 +1571,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}

+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
switch (result) {
case SCAN_SUCCEED:
break;
@@ -1625,27 +1585,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}

- /* Lock the vma before taking i_mmap and page table locks */
- vma_start_write(vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);

- /*
- * We need to lock the mapping so that from here on, only GUP-fast and
- * hardware page walks can access the parts of the page tables that
- * we're operating on.
- * See collapse_and_free_pmd().
- */
- i_mmap_lock_write(vma->vm_file->f_mapping);
-
- /*
- * This spinlock should be unnecessary: Nobody else should be accessing
- * the page tables under spinlock protection here, only
- * lockless_pages_from_mm() and the hardware page walker can access page
- * tables while all the high-level locks are held in write mode.
- */
result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
- if (!start_pte)
- goto drop_immap;
+ if (!start_pte) /* mmap_lock + page lock should prevent this */
+ goto abort;

/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1671,40 +1618,44 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
*/
if (hpage + i != page)
goto abort;
- count++;
+ nr_ptes++;
}

- /* step 2: adjust rmap */
+ /* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
- struct page *page;
-
if (pte_none(*pte))
continue;
- page = vm_normal_page(vma, addr, *pte);
- if (WARN_ON_ONCE(page && is_zone_device_page(page)))
- goto abort;
- page_remove_rmap(page, vma, false);
+
+ /* Must clear entry, or a racing truncate may re-remove it */
+ pte_clear(mm, addr, pte);
+ page_remove_rmap(hpage + i, vma, false);
}

pte_unmap_unlock(start_pte, ptl);

/* step 3: set proper refcount and mm_counters. */
- if (count) {
- page_ref_sub(hpage, count);
- add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ if (nr_ptes) {
+ page_ref_sub(hpage, nr_ptes);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -nr_ptes);
}

- /* step 4: remove pte entries */
- /* we make no change to anon, but protect concurrent anon page lookup */
- if (vma->anon_vma)
- anon_vma_lock_write(vma->anon_vma);
+ /* step 4: remove page table */

- collapse_and_free_pmd(mm, vma, haddr, pmd);
+ /* Huge page lock is still held, so page table must remain empty */
+ pml = pmd_lock(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);

- if (vma->anon_vma)
- anon_vma_unlock_write(vma->anon_vma);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ mmu_notifier_invalidate_range_end(&range);
+
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
/* step 5: install pmd entry */
@@ -1718,9 +1669,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
return result;

abort:
- pte_unmap_unlock(start_pte, ptl);
-drop_immap:
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ mmu_notifier_invalidate_range_end(&range);
goto drop_hpage;
}

@@ -2842,9 +2793,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
BUG_ON(*prev);
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
- mmap_write_unlock(mm);
+ mmap_locked = true;
goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_PMD_NULL:
--
2.35.3