Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation

From: Lance Yang
Date: Thu Jun 26 2025 - 09:53:51 EST

On 2025/6/26 21:16, David Hildenbrand wrote:
On 26.06.25 14:44, Lance Yang wrote:

On 2025/6/26 17:29, Lance Yang wrote:
Before I send out the real patch, I'd like to get some quick feedback to
ensure I've understood the discussion correctly ;)

Does this look like the right direction?

diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..5ebffe2137e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
   #endif
   }
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
-            struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+            struct page_vma_mapped_walk *pvmw,
+            enum ttu_flags flags, pte_t pte)
   {
       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
-    int max_nr = folio_nr_pages(folio);
-    pte_t pte = ptep_get(ptep);
+    unsigned long end_addr, addr = pvmw->address;
+    struct vm_area_struct *vma = pvmw->vma;
+    unsigned int max_nr;
+
+    if (flags & TTU_HWPOISON)
+        return 1;
+    if (!folio_test_large(folio))
+        return 1;
+    /* We may only batch within a single VMA and a single page table. */
+    end_addr = pmd_addr_end(addr, vma->vm_end);
+    max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+    /* We only support lazyfree batching for now ... */
       if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
-        return false;
+        return 1;
       if (pte_unused(pte))
-        return false;
-    if (pte_pfn(pte) != folio_pfn(folio))
-        return false;
+        return 1;
+
+    /* ... where we must be able to batch the whole folio. */
+    if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
+        return 1;
-    return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
-                   NULL, NULL) == max_nr;
+    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+                 NULL, NULL, NULL);
+    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
   }
   /*
@@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
               if (pte_dirty(pteval))
                   folio_mark_dirty(folio);
           } else if (likely(pte_present(pteval))) {
-            if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
-                can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
-                nr_pages = folio_nr_pages(folio);
+            nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
               end_addr = address + nr_pages * PAGE_SIZE;
               flush_cache_range(vma, address, end_addr);
@@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
               hugetlb_remove_rmap(folio);
           } else {
               folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
-            folio_ref_sub(folio, nr_pages - 1);
           }
           if (vma->vm_flags & VM_LOCKED)
               mlock_drain_local();
-        folio_put(folio);
-        /* We have already batched the entire folio */
-        if (nr_pages > 1)
+        folio_put_refs(folio, nr_pages);
+
+        /*
+         * If we are sure that we batched the entire folio and cleared
+         * all PTEs, we can just optimize and stop right here.
+         */
+        if (nr_pages == folio_nr_pages(folio))
               goto walk_done;
           continue;
   walk_abort:
--
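
(A side note on the refcounting in the last hunk, in case it looks odd:
the two old calls and the single new one drop the same number of
references -- illustration only:)

/*
 *   Before: folio_ref_sub(folio, nr_pages - 1);  // drop nr_pages - 1 refs
 *           folio_put(folio);                    // drop the final ref
 *   After:  folio_put_refs(folio, nr_pages);     // drop all nr_pages at once
 *
 * Like folio_put(), folio_put_refs() frees the folio if that was the
 * last reference, so the behaviour stays the same.
 */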

Oops ... while testing on my machine, I found that the logic doesn't
behave as expected, because I mixed up max_nr (the available scan room
in the page table) with folio_nr_pages(folio) :(
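
To make the mix-up concrete (hypothetical numbers, purely for
illustration):

/*
 * Say a 64 KiB mTHP (16 pages) is fully mapped well inside one page
 * table, with 496 PTE slots left between the start of its mapping and
 * the next PMD boundary:
 *
 *   folio_nr_pages(folio)                                    = 16
 *   max_nr = (pmd_addr_end(addr, vma->vm_end) - addr) >> PAGE_SHIFT
 *                                                            = 496
 *
 * The sketch above tests "max_nr != folio_nr_pages(folio)", which is
 * true here (496 != 16), so it falls back to nr_pages = 1 even though
 * the whole folio could have been batched.  What it should ask is
 * whether the folio still fits in the scan room, i.e.
 * "max_nr < nr_pages".
 */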

With the following change:

diff --git a/mm/rmap.c b/mm/rmap.c
index 5ebffe2137e4..b1407348e14e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1850,9 +1850,9 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
              enum ttu_flags flags, pte_t pte)
  {
      const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+    unsigned int max_nr, nr_pages = folio_nr_pages(folio);
      unsigned long end_addr, addr = pvmw->address;
      struct vm_area_struct *vma = pvmw->vma;
-    unsigned int max_nr;
      if (flags & TTU_HWPOISON)
          return 1;
@@ -1870,12 +1870,13 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
          return 1;
      /* ... where we must be able to batch the whole folio. */

Why is that still required? :)

Sorry ... I was still stuck in the "all-or-nothing" mindset ...

So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
check and just let folio_pte_batch handle whatever partial batch it
safely can.


-    if (pte_pfn(pte) != folio_pfn(folio) || max_nr != folio_nr_pages(folio))
+    if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
          return 1;
-    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
-                 NULL, NULL, NULL);
-    return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
+    max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
+                 fpb_flags, NULL, NULL, NULL);
+
+    return (max_nr != nr_pages) ? 1 : max_nr;

Why is that still required? :)

Then simply return the number of PTEs that consecutively map to the
large folio. Right?
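
Concretely, something like this (untested -- just a sketch assembled
from the diffs above to confirm I've understood the direction; it also
drops the pte_pfn()/folio_pfn() pre-check, on the assumption that
page_vma_mapped_walk() already guarantees the PTE maps this folio and
that folio_pte_batch() never batches past the folio boundary anyway):

static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
            struct page_vma_mapped_walk *pvmw,
            enum ttu_flags flags, pte_t pte)
{
    const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
    unsigned long end_addr, addr = pvmw->address;
    struct vm_area_struct *vma = pvmw->vma;
    unsigned int max_nr;

    if (flags & TTU_HWPOISON)
        return 1;
    if (!folio_test_large(folio))
        return 1;

    /* We may only batch within a single VMA and a single page table. */
    end_addr = pmd_addr_end(addr, vma->vm_end);
    max_nr = (end_addr - addr) >> PAGE_SHIFT;

    /* We only support lazyfree batching for now ... */
    if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
        return 1;
    if (pte_unused(pte))
        return 1;

    /*
     * Return however many PTEs folio_pte_batch() manages to batch; the
     * caller copes with a partial batch via nr_pages, so there is no
     * need to insist on covering the whole folio.
     */
    return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
                NULL, NULL, NULL);
}

The caller side from the earlier diff should be able to stay as it is,
since it already drops the references with folio_put_refs(folio,
nr_pages) and only takes the walk_done shortcut when nr_pages ==
folio_nr_pages(folio).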