[PATCH v4 12/12] mm/rmap: enable batch unmapping of anonymous folios

From: Dev Jain

Date: Tue May 26 2026 - 02:40:58 EST

Enable batched clearing of ptes, and batched setting of swap ptes, when
unmapping anonymous folios.

Processing all ptes of a large folio in one go helps us batch across
atomics (add_mm_counter etc), barriers (in the function
__folio_try_share_anon_rmap), repeated calls to page_vma_mapped_walk(),
to name a few. In general, batching helps us to execute similar code
together, making the execution of the program more memory and
CPU friendly.

On arm64-contpte, batching also helps us avoid redundant ptep_get() calls
and TLB flushes while breaking the contpte mapping.

The handling of anon-exclusivity is very similar to commit cac1db8c3aad
("mm: optimize mprotect() by PTE batching"). Since folio_unmap_pte_batch()
won't look at the bits of the underlying page, we need to process
sub-batches of ptes pointing to pages which are the same w.r.t.
exclusivity, and set only those ptes to swap ptes in one go. Hence move
page_anon_exclusive_batch() from mprotect.c to internal.h and reuse it.

arch_unmap_one() is only defined for sparc64; I am not sure about the
nuances of retrieving the pfn via pte_pfn() versus deriving it from the
physical address (paddr = pte_val(oldpte) & _PAGE_PADDR_4V).

(Also, pte_next_pfn() can't even be called from arch_unmap_one() because
that file does not include pgtable.h.) So just disable batching for the
"sparc64-anon-swapbacked" case for now.

We need to take care of rmap accounting (folio_remove_rmap_ptes) and
reference accounting (folio_put_refs) when anon folio unmap succeeds.
In case we partially batch the large folio and fail, we need to correctly
do the accounting for pages which were successfully unmapped. So, put
this accounting code (finish_folio_unmap()) in __ttu_anon_folio() itself,
instead of doing some horrible goto jumping at the callsite
of ttu_anon_folio().

If the batch length is less than the number of pages in the folio, then
the walk must subsequently skip over the ptes of this batch, which have
now been converted to swap ptes.

The page_vma_mapped_walk() API ensures this: check_pte() returns true
only if the pte maps a pfn in [pvmw->pfn, pvmw->pfn + nr_pages). There
is no pfn underlying a swap pte, so check_pte() returns false and we
keep skipping until we hit a present pte, which is where we want to
resume unmapping.

Signed-off-by: Dev Jain <dev.jain@xxxxxxx>
---
mm/internal.h | 17 +++++++++
mm/mprotect.c | 17 ---------
mm/rmap.c | 98 +++++++++++++++++++++++++++++++++++++++------------
3 files changed, 92 insertions(+), 40 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b6..87a61742d1920 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -393,6 +393,23 @@ static inline unsigned int folio_pte_batch_flags(struct folio *folio,
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
unsigned int max_nr);

+/*
+ * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
+ * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
+ * that the ptes point to consecutive pages of the same anon large folio.
+ */
+static __always_inline int page_anon_exclusive_batch(int start_idx, int max_len,
+ struct page *first_page, bool expected_anon_exclusive)
+{
+ int idx;
+
+ for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
+ if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
+ break;
+ }
+ return idx - start_idx;
+}
+
/**
* pte_move_swp_offset - Move the swap entry offset field of a swap pte
* forward or backward by delta
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3357058672016..950af1efdd661 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -138,23 +138,6 @@ static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
}

-/*
- * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
- * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
- * that the ptes point to consecutive pages of the same anon large folio.
- */
-static __always_inline int page_anon_exclusive_batch(int start_idx, int max_len,
- struct page *first_page, bool expected_anon_exclusive)
-{
- int idx;
-
- for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
- if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
- break;
- }
- return idx - start_idx;
-}
-
/*
* This function is a result of trying our very best to retain the
* "avoid the write-fault handler" optimization. In can_change_pte_writable(),
diff --git a/mm/rmap.c b/mm/rmap.c
index b1639bad8e27f..e02d81840018c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1958,13 +1958,14 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
end_addr = pmd_addr_end(addr, vma->vm_end);
max_nr = (end_addr - addr) >> PAGE_SHIFT;

- /* We only support lazyfree or file folios batching for now ... */
- if (folio_test_anon(folio) && folio_test_swapbacked(folio))
- return 1;

if (pte_unused(pte))
return 1;

+ if (__is_defined(__HAVE_ARCH_UNMAP_ONE) && folio_test_anon(folio) &&
+ folio_test_swapbacked(folio))
+ return 1;
+
/*
* If unmap fails, we need to restore the ptes. To avoid accidentally
* upgrading write permissions for ptes that were not originally
@@ -2136,8 +2137,9 @@ static inline bool ttu_lazyfree_folio(struct vm_area_struct *vma,
return true;
}

-static inline void set_swp_pte_at(struct mm_struct *mm, unsigned long address,
- pte_t *ptep, swp_entry_t entry, pte_t pteval, bool anon_exclusive)
+static inline void set_swp_ptes(struct mm_struct *mm, unsigned long address,
+ pte_t *ptep, swp_entry_t entry, pte_t pteval, bool anon_exclusive,
+ unsigned long nr_pages)
{
pte_t swp_pte = swp_entry_to_pte(entry);

@@ -2151,24 +2153,37 @@ static inline void set_swp_pte_at(struct mm_struct *mm, unsigned long address,
swp_pte = pte_swp_mkuffd_wp(swp_pte);
} else {
/* Device-exclusive entry */
+ VM_WARN_ON(nr_pages != 1);
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
}

- set_pte_at(mm, address, ptep, swp_pte);
+ for (int i = 0; i < nr_pages; ++i, ++ptep, address += PAGE_SIZE) {
+ set_pte_at(mm, address, ptep, swp_pte);
+ swp_pte = pte_next_swp_offset(swp_pte);
+ }
}

-static inline bool ttu_anon_folio(struct vm_area_struct *vma, struct folio *folio,
+static inline void finish_folio_unmap(struct vm_area_struct *vma,
+ struct folio *folio, struct page *subpage,
+ unsigned long nr_pages)
+{
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
+ folio_put_refs(folio, nr_pages);
+}
+
+static inline bool __ttu_anon_folio(struct vm_area_struct *vma, struct folio *folio,
struct page *subpage, unsigned long address, pte_t *ptep,
- pte_t pteval)
+ pte_t pteval, unsigned long nr_pages, bool anon_exclusive)
{
- bool anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage);
swp_entry_t entry = page_swap_entry(subpage);
struct mm_struct *mm = vma->vm_mm;

- if (folio_dup_swap_pages(folio, subpage, 1) < 0)
+ if (folio_dup_swap_pages(folio, subpage, nr_pages) < 0)
return false;

/*
@@ -2177,13 +2192,14 @@ static inline bool ttu_anon_folio(struct vm_area_struct *vma, struct folio *foli
* so we'll not check/care.
*/
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- folio_put_swap_pages(folio, subpage, 1);
+ VM_WARN_ON(nr_pages != 1);
+ folio_put_swap_pages(folio, subpage, nr_pages);
return false;
}

/* See folio_try_share_anon_rmap(): clear PTE first. */
- if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) {
- folio_put_swap_pages(folio, subpage, 1);
+ if (anon_exclusive && folio_try_share_anon_rmap_ptes(folio, subpage, nr_pages)) {
+ folio_put_swap_pages(folio, subpage, nr_pages);
return false;
}

@@ -2194,9 +2210,49 @@ static inline bool ttu_anon_folio(struct vm_area_struct *vma, struct folio *foli
spin_unlock(&mmlist_lock);
}

- dec_mm_counter(mm, MM_ANONPAGES);
- inc_mm_counter(mm, MM_SWAPENTS);
- set_swp_pte_at(mm, address, ptep, entry, pteval, anon_exclusive);
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ add_mm_counter(mm, MM_SWAPENTS, nr_pages);
+ set_swp_ptes(mm, address, ptep, entry, pteval, anon_exclusive, nr_pages);
+ finish_folio_unmap(vma, folio, subpage, nr_pages);
+ return true;
+}
+
+/*
+ * Unmap an anonymous folio from the pagetables of a process. The function
+ * may partially succeed in unmapping some pages out of nr_pages, in which
+ * case it will restore the remaining ptes and return false. Returns true
+ * if all of nr_pages were unmapped.
+ */
+static inline bool ttu_anon_folio(struct vm_area_struct *vma, struct folio *folio,
+ struct page *first_page, unsigned long address, pte_t *ptep,
+ pte_t pteval, unsigned long nr_pages)
+{
+ bool expected_anon_exclusive;
+ int sub_batch_idx = 0;
+ int len, ret;
+
+ for (;;) {
+ expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
+ len = page_anon_exclusive_batch(sub_batch_idx, nr_pages,
+ first_page, expected_anon_exclusive);
+ ret = __ttu_anon_folio(vma, folio, first_page + sub_batch_idx,
+ address, ptep, pteval, len, expected_anon_exclusive);
+ if (!ret) {
+ /* restore the remaining ptes which got cleared */
+ set_ptes(vma->vm_mm, address, ptep, pteval, nr_pages);
+ return ret;
+ }
+
+ nr_pages -= len;
+ if (!nr_pages)
+ break;
+
+ pteval = pte_advance_pfn(pteval, len);
+ address += len * PAGE_SIZE;
+ sub_batch_idx += len;
+ ptep += len;
+ }
+
return true;
}

@@ -2392,11 +2448,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}

if (!ttu_anon_folio(vma, folio, subpage, address,
- pvmw.pte, pteval)) {
- set_pte_at(mm, address, pvmw.pte, pteval);
+ pvmw.pte, pteval, nr_pages)) {
goto walk_abort;
}
- goto finish_unmap;
+ continue;
} else {
/*
* This is a locked file-backed folio,
@@ -2412,10 +2467,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
add_mm_counter(mm, mm_counter_file(folio), -nr_pages);
}
finish_unmap:
- folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
- if (vma->vm_flags & VM_LOCKED)
- mlock_drain_local();
- folio_put_refs(folio, nr_pages);
+ finish_folio_unmap(vma, folio, subpage, nr_pages);

/*
* If we are sure that we batched the entire folio and cleared
--
2.34.1