Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault

From: Dev Jain
Date: Thu Sep 05 2024 - 05:53:59 EST



On 9/5/24 15:11, Kefeng Wang wrote:


On 2024/9/5 16:26, Kirill A. Shutemov wrote:
On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
replace it with a PMD-mapped THP. Change the helpers introduced in the
previous patch to flush TLB entry corresponding to the hugezeropage,
and preserve PMD uffd-wp marker. In case of failure, fallback to
splitting the PMD.

We met same issue, and a very similar function in our kernel,

Signed-off-by: Dev Jain <dev.jain@xxxxxxx>
---
  include/linux/huge_mm.h |  6 ++++
  mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
  mm/memory.c             |  5 +--
  3 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..fdd2cf473a3c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -9,6 +9,12 @@
  #include <linux/kobject.h>
    vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+               unsigned long haddr, struct folio **foliop,
+               unsigned long addr);
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+         struct vm_area_struct *vma, unsigned long haddr,
+         pgtable_t pgtable);

Why? I don't see users outside huge_memory.c.

  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
            pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
            struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58125fbcc532..150163ad77d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
-                  unsigned long haddr, struct folio **foliop,
-                  unsigned long addr)
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+               unsigned long haddr, struct folio **foliop,
+               unsigned long addr)
  {
      struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
  @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
      count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
  }
  -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
-            struct vm_area_struct *vma, unsigned long haddr,
-            pgtable_t pgtable)
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+         struct vm_area_struct *vma, unsigned long haddr,
+         pgtable_t pgtable)
  {
-    pmd_t entry;
+    pmd_t entry, old_pmd;
+    bool is_pmd_none = pmd_none(*vmf->pmd);
        entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
      entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
      folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
      folio_add_lru_vma(folio, vma);
-    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+    if (!is_pmd_none) {
+        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+        if (pmd_uffd_wp(old_pmd))
+            entry = pmd_mkuffd_wp(entry);
+    }
+    if (pgtable)
+        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
      set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
      update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
      add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-    mm_inc_nr_ptes(vma->vm_mm);
+    if (is_pmd_none)
+        mm_inc_nr_ptes(vma->vm_mm);
  }
    static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
      spin_unlock(vmf->ptl);
  }
  +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
+                         unsigned long haddr,
+                         struct folio *folio)

Why the helper is needed? Cannot it be just opencodded in
do_huge_zero_wp_pmd()?

+{
+    struct vm_area_struct *vma = vmf->vma;
+    vm_fault_t ret = 0;
+
+    ret = check_stable_address_space(vma->vm_mm);
+    if (ret)
+        goto out;
+    map_pmd_thp(folio, vmf, vma, haddr, NULL);
+out:
+    return ret;
+}
+
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
+{
+    struct vm_area_struct *vma = vmf->vma;
+    gfp_t gfp = vma_thp_gfp_mask(vma);
+    struct mmu_notifier_range range;
+    struct folio *folio = NULL;
+    vm_fault_t ret = 0;
+
+    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
+                  vmf->address);
+    if (ret)
+        goto out;
+
+    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+                haddr + HPAGE_PMD_SIZE);
+    mmu_notifier_invalidate_range_start(&range);
+    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+        goto unlock;
+    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
+    if (!ret)
+        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
+unlock:
+    spin_unlock(vmf->ptl);
+    mmu_notifier_invalidate_range_end(&range);

the folio need to released when !pmd_same() and check_stable_address_space() return VM_FAULT_SIGBUS.

Yes, thanks.

+out:
+    return ret;
+}
+