[PATCH] mm: process_mrelease: skip LRU movement and expedite clean file folio reclaim

From: Minchan Kim

Date: Tue Apr 28 2026 - 19:39:06 EST


During process_mrelease(), unmapping file-backed folios spends a
significant portion of CPU time in folio_mark_accessed() to maintain
accurate LRU state (~55% of unmap time). Furthermore, clean file folios
are left in the page cache, delaying their reclamation.

This patch bundles two optimizations targeting exclusive file folios
during such emergency memory reclaim:
1. Skips LRU handling for exclusive file folios in zap_present_folio_ptes()
to save CPU time.
2. Proactively evicts clean file folios in free_pages_and_caches() to free
up memory quickly.

To avoid polluting mm/swap_state.c (which deals purely with swap) with file
cache eviction logic, we replace free_pages_and_swap_cache() with a more
generic free_pages_and_caches() in mm/swap.c where the file eviction helper
is defined.

[Side Effects & Rationale]
A concern raised during discussion is that skipping LRU handling effectively
breaks aging for exclusively mapped file pages that might have otherwise been
activated. This implies that if the victim process is immediately restarted,
it may suffer from cache misses and refaults because its files were not
preserved in the active state.

It was also noted during discussion that if skipping LRU movement and evicting
the cache hurts a workload because the victim restarts immediately, that points
to sub-optimal kill target selection by the userspace policy (e.g., LMKD)
rather than a flaw in this expedited reclamation mechanism. Userspace is
expected to use this targeted reclaim API for victims that are not expected to
restart soon.

One might argue that standard memory reclaim by kswapd under heavy pressure
would have already encountered these pages and aged them accordingly. However,
this may not always be the case and ultimately depends on timing. We cannot
reliably predict whether kswapd has processed them, nor can we know the future
access patterns of a dying process.

process_mrelease() is an emergency operation triggered under extreme memory
pressure. In these scenarios, recovering memory as quickly as possible is the
highest priority to avoid further kills or system jank. Spending half of the
unmap time on LRU maintenance for pages belonging to a victim process is a bad
trade-off. We prioritize immediate CPU savings and faster memory recovery over
potential future cache hits for the victim's files.

Fundamentally, anonymous pages are already freed immediately during process
termination because they cannot be preserved. This patch makes the behavior for
exclusive file-backed pages consistent by proactively evicting them, completing
a symmetric and expedited reclamation path for both types of memory.

To handle shared address spaces safely and protect existing users from behavior
changes, these optimizations are applied only when the caller passes the new
PROCESS_MRELEASE_REAP_KILL flag. The request is propagated to the unmap path
via zap_details->ignore_access, and to the freeing path via a new
try_evict_file_folios flag on struct mmu_gather.

This provides deterministic and synchronous memory recovery that userspace
killers like Android's LMKD rely on, while protecting normal workloads from
broken aging.

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
include/asm-generic/tlb.h | 2 ++
include/linux/mm.h | 1 +
include/linux/swap.h | 4 +++-
mm/memory.c | 13 +++++++++++-
mm/mmu_gather.c | 4 +++-
mm/oom_kill.c | 10 ++++++----
mm/swap.c | 42 +++++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 24 ----------------------
8 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4aeac0c3d3f0..e0c7d3719c18 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -339,6 +339,8 @@ struct mmu_gather {
*/
unsigned int need_flush_all : 1;

+ unsigned int try_evict_file_folios : 1;
+
/*
* we have removed page directories
*/
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..6f41f42d549a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2800,6 +2800,7 @@ struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
bool reclaim_pt; /* Need reclaim page tables? */
+ bool ignore_access; /* Skip folio_mark_accessed during unmap */
zap_flags_t zap_flags; /* Extra flags for zapping */
};

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62fc7499b408..66435fe26b0f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -414,7 +414,10 @@ extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#endif

+struct mm_struct;
void check_move_unevictable_folios(struct folio_batch *fbatch);
+void free_pages_and_caches(struct encoded_page **pages, int nr,
+ bool try_evict_file_folios);

extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);
@@ -433,7 +436,6 @@ static inline unsigned long total_swapcache_pages(void)

void free_swap_cache(struct folio *folio);
void free_folio_and_swap_cache(struct folio *folio);
-void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..4f41a88c5259 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1640,6 +1640,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
bool delay_rmap = false;

if (!folio_test_anon(folio)) {
+ bool skip_mark_accessed;
+
ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
if (pte_dirty(ptent)) {
folio_mark_dirty(folio);
@@ -1648,7 +1650,16 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
*force_flush = true;
}
}
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
+
+ /*
+ * For process_mrelease() reclaim, skip LRU handling for exclusively
+ * mapped file-backed folios: they are about to be freed, so moving
+ * them around the LRU is pointless.
+ */
+ skip_mark_accessed = details && details->ignore_access &&
+ !folio_maybe_mapped_shared(folio);
+ if (likely(!skip_mark_accessed) && pte_young(ptent) &&
+ likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
rss[mm_counter(folio)] -= nr;
} else {
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index fe5b6a031717..6b737d179e9e 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -146,9 +146,10 @@ static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
+ bool try_evict = tlb->try_evict_file_folios;

for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
- __tlb_batch_free_encoded_pages(batch);
+ __tlb_batch_free_encoded_pages(batch, try_evict);
tlb->active = &tlb->local;
}

@@ -410,6 +411,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
{
tlb->mm = mm;
tlb->fullmm = fullmm;
+ tlb->try_evict_file_folios = 0;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index efa6541b1c47..092e09498996 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,7 +514,7 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

-static bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm, bool try_evict_file_folios)
{
struct vm_area_struct *vma;
bool ret = true;
@@ -556,12 +556,14 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm);
+ tlb.try_evict_file_folios = try_evict_file_folios;
+ struct zap_details details = { .ignore_access = try_evict_file_folios };
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
tlb_finish_mmu(&tlb);
ret = false;
continue;
}
- unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+ unmap_page_range(&tlb, vma, range.start, range.end, &details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -599,7 +601,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
trace_start_task_reaping(tsk->pid);

/* failed to reap part of the address space. Try again later */
- ret = __oom_reap_task_mm(mm);
+ ret = __oom_reap_task_mm(mm, false);
if (!ret)
goto out_finish;

@@ -1271,7 +1273,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
- if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm, reap_kill))
ret = -EAGAIN;
mmap_read_unlock(mm);

diff --git a/mm/swap.c b/mm/swap.c
index bb19ccbece46..7a6d3fb924aa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
}
EXPORT_SYMBOL(release_pages);

+static inline void free_file_cache(struct folio *folio)
+{
+	if (!folio_trylock(folio))
+		return;	/* could not take the folio lock: leave the folio alone */
+	mapping_evict_folio(folio->mapping, folio);
+	folio_unlock(folio);
+}
+
+/*
+ * Free a batch of encoded pages: drop anonymous folios from the swap cache,
+ * attempt to evict the others when @try_evict_file_folios is true, then put
+ * the batched reference(s) on every folio.
+ */
+void free_pages_and_caches(struct encoded_page **pages, int nr,
+		bool try_evict_file_folios)
+{
+	unsigned int refs[PAGEVEC_SIZE];
+	struct folio_batch folios;
+
+	folio_batch_init(&folios);
+	for (int i = 0; i < nr; i++) {
+		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
+		unsigned int nr_refs = 1;
+
+		if (!folio_test_anon(folio)) {
+			if (unlikely(try_evict_file_folios))
+				free_file_cache(folio);
+		} else {
+			free_swap_cache(folio);
+		}
+		/* A flagged entry is followed by one holding the ref count. */
+		if (unlikely(encoded_page_flags(pages[i]) &
+			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+			nr_refs = encoded_nr_pages(pages[++i]);
+		refs[folios.nr] = nr_refs;
+		if (!folio_batch_add(&folios, folio))
+			folios_put_refs(&folios, refs);
+	}
+	if (folios.nr)
+		folios_put_refs(&folios, refs);
+}
+
/*
* The folios which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d0eef7470be..d948c1017d35 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -400,31 +400,7 @@ void free_folio_and_swap_cache(struct folio *folio)
folio_put(folio);
}

-/*
- * Passed an array of pages, drop them all from swapcache and then release
- * them. They are removed from the LRU and freed if this is their last use.
- */
-void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
-{
- struct folio_batch folios;
- unsigned int refs[PAGEVEC_SIZE];
-
- folio_batch_init(&folios);
- for (int i = 0; i < nr; i++) {
- struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

- free_swap_cache(folio);
- refs[folios.nr] = 1;
- if (unlikely(encoded_page_flags(pages[i]) &
- ENCODED_PAGE_BIT_NR_PAGES_NEXT))
- refs[folios.nr] = encoded_nr_pages(pages[++i]);
-
- if (folio_batch_add(&folios, folio) == 0)
- folios_put_refs(&folios, refs);
- }
- if (folios.nr)
- folios_put_refs(&folios, refs);
-}

static inline bool swap_use_vma_readahead(void)
{
--
2.54.0.545.g6539524ca2-goog