[PATCH v3 09/13] mm/khugepaged: retract_page_tables() without mmap or vma lock

From: Hugh Dickins
Date: Wed Jul 12 2023 - 00:41:17 EST


Simplify shmem and file THP collapse's retract_page_tables(), and relax
its locking: to improve its success rate and to lessen impact on others.

Instead of its MADV_COLLAPSE case doing set_huge_pmd() at target_addr of
target_mm, leave that part of the work to madvise_collapse() calling
collapse_pte_mapped_thp() afterwards: just adjust collapse_file()'s
result code to arrange for that. That spares retract_page_tables() four
arguments; and since it will be successful in retracting all of the page
tables expected of it, no need to track and return a result code itself.

It needs i_mmap_lock_read(mapping) for traversing the vma interval tree,
but it does not need i_mmap_lock_write() for that: page_vma_mapped_walk()
allows for pte_offset_map_lock() etc to fail, and uses pmd_lock() for
THPs. retract_page_tables() just needs to use those same spinlocks to
exclude it briefly, while transitioning pmd from page table to none: so
restore its use of pmd_lock() inside of which pte lock is nested.

Users of pte_offset_map_lock() etc all now allow for them to fail:
so retract_page_tables() now has no use for mmap_write_trylock() or
vma_try_start_write(). In common with rmap and page_vma_mapped_walk(),
it does not even need the mmap_read_lock().

But those users do expect the page table to remain a good page table,
until they unlock and rcu_read_unlock(): so the page table cannot be
freed immediately, but rather by the recently added pte_free_defer().

Use the (usually a no-op) pmdp_get_lockless_sync() to send an interrupt
when PAE, and pmdp_collapse_flush() did not already do so: to make sure
that the start,pmdp_get_lockless(),end sequence in __pte_offset_map()
cannot pick up a pmd entry with mismatched pmd_low and pmd_high.

retract_page_tables() can be enhanced to replace_page_tables(), which
inserts the final huge pmd without mmap lock: going through an invalid
state instead of pmd_none() followed by fault. But that enhancement
does raise some more questions: leave it until a later release.

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
mm/khugepaged.c | 184 ++++++++++++++++++++------------------------------
1 file changed, 75 insertions(+), 109 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 78c8d5d8b628..3bb05147961b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1615,9 +1615,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
break;
case SCAN_PMD_NONE:
/*
- * In MADV_COLLAPSE path, possible race with khugepaged where
- * all pte entries have been removed and pmd cleared. If so,
- * skip all the pte checks and just update the pmd mapping.
+ * All pte entries have been removed and pmd cleared.
+ * Skip all the pte checks and just update the pmd mapping.
*/
goto maybe_install_pmd;
default:
@@ -1748,123 +1747,88 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl
mmap_write_unlock(mm);
}

-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
- struct mm_struct *target_mm,
- unsigned long target_addr, struct page *hpage,
- struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
- int target_result = SCAN_FAIL;

- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- int result = SCAN_FAIL;
- struct mm_struct *mm = NULL;
- unsigned long addr = 0;
- pmd_t *pmd;
- bool is_target = false;
+ struct mmu_notifier_range range;
+ struct mm_struct *mm;
+ unsigned long addr;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml;
+ spinlock_t *ptl;
+ bool skipped_uffd = false;

/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth investing
- * mmap_write_lock(mm) as PMD-mapping is likely to be split
- * later.
- *
- * Note that vma->anon_vma check is racy: it can be set up after
- * the check but before we took mmap_lock by the fault path.
- * But page lock would prevent establishing any new ptes of the
- * page, so we are safe.
- *
- * An alternative would be drop the check, but check that page
- * table is clear before calling pmdp_collapse_flush() under
- * ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too. It would also probably require locking
- * the anon_vma.
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
*/
- if (READ_ONCE(vma->anon_vma)) {
- result = SCAN_PAGE_ANON;
- goto next;
- }
+ if (READ_ONCE(vma->anon_vma))
+ continue;
+
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
- vma->vm_end < addr + HPAGE_PMD_SIZE) {
- result = SCAN_VMA_CHECK;
- goto next;
- }
- mm = vma->vm_mm;
- is_target = mm == target_mm && addr == target_addr;
- result = find_pmd_or_thp_or_none(mm, addr, &pmd);
- if (result != SCAN_SUCCEED)
- goto next;
- /*
- * We need exclusive mmap_lock to retract page table.
- *
- * We use trylock due to lock inversion: we need to acquire
- * mmap_lock while holding page lock. Fault path does it in
- * reverse order. Trylock is a way to avoid deadlock.
- *
- * Also, it's not MADV_COLLAPSE's job to collapse other
- * mappings - let khugepaged take care of them later.
- */
- result = SCAN_PTE_MAPPED_HUGEPAGE;
- if ((cc->is_khugepaged || is_target) &&
- mmap_write_trylock(mm)) {
- /* trylock for the same lock inversion as above */
- if (!vma_try_start_write(vma))
- goto unlock_next;
-
- /*
- * Re-check whether we have an ->anon_vma, because
- * collapse_and_free_pmd() requires that either no
- * ->anon_vma exists or the anon_vma is locked.
- * We already checked ->anon_vma above, but that check
- * is racy because ->anon_vma can be populated under the
- * mmap lock in read mode.
- */
- if (vma->anon_vma) {
- result = SCAN_PAGE_ANON;
- goto unlock_next;
- }
- /*
- * When a vma is registered with uffd-wp, we can't
- * recycle the pmd pgtable because there can be pte
- * markers installed. Skip it only, so the rest mm/vma
- * can still have the same file mapped hugely, however
- * it'll always mapped in small page size for uffd-wp
- * registered ranges.
- */
- if (hpage_collapse_test_exit(mm)) {
- result = SCAN_ANY_PROCESS;
- goto unlock_next;
- }
- if (userfaultfd_wp(vma)) {
- result = SCAN_PTE_UFFD_WP;
- goto unlock_next;
- }
- collapse_and_free_pmd(mm, vma, addr, pmd);
- if (!cc->is_khugepaged && is_target)
- result = set_huge_pmd(vma, addr, pmd, hpage);
- else
- result = SCAN_SUCCEED;
-
-unlock_next:
- mmap_write_unlock(mm);
- goto next;
- }
- /*
- * Calling context will handle target mm/addr. Otherwise, let
- * khugepaged try again later.
- */
- if (!is_target) {
- khugepaged_add_pte_mapped_thp(mm, addr);
+ vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
+
+ mm = vma->vm_mm;
+ if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
+ continue;
+
+ if (hpage_collapse_test_exit(mm))
+ continue;
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ continue;
+
+ /* PTEs were notified when unmapped; but now for the PMD? */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ addr, addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ pml = pmd_lock(mm, pmd);
+ ptl = pte_lockptr(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /*
+ * Huge page lock is still held, so normally the page table
+ * must remain empty; and we have already skipped anon_vma
+ * and userfaultfd_wp() vmas. But since the mmap_lock is not
+ * held, it is still possible for a racing userfaultfd_ioctl()
+ * to have inserted ptes or markers. Now that we hold ptlock,
+ * repeating the anon_vma check protects from one category,
+ * and repeating the userfaultfd_wp() check from another.
+ */
+ if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
+ skipped_uffd = true;
+ } else {
+ pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
+ pmdp_get_lockless_sync();
+ }
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ if (!skipped_uffd) {
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));
}
-next:
- if (is_target)
- target_result = result;
}
- i_mmap_unlock_write(mapping);
- return target_result;
+ i_mmap_unlock_read(mapping);
}

/**
@@ -2259,9 +2223,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,

/*
* Remove pte page tables, so we can re-fault the page as huge.
+ * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
*/
- result = retract_page_tables(mapping, start, mm, addr, hpage,
- cc);
+ retract_page_tables(mapping, start);
+ if (cc && !cc->is_khugepaged)
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
unlock_page(hpage);

/*
--
2.35.3