[PATCH 5/5] mm/khugepaged: add khugepaged collapse hint in mglru reference checking

From: Luka Bai

Date: Sun May 31 2026 - 00:29:03 EST


From: Luka Bai <lukabai@xxxxxxxxxxx>

lru_gen_look_around() is an MGLRU optimization that reduces rmap
iteration. It is called from folio_referenced_one() when reclaim
checks whether a cold page has been referenced. While it holds the
page table lock, it also checks the nearby ptes and, because most
workloads exhibit locality, updates their generation if they were
accessed too. It then puts a pmd that it considers full of hot pages
into a Bloom filter, so that it is walked during the next aging.

walk_mm() is used by MGLRU during aging. It goes through the pmds of
an mm_struct and checks whether each pmd is set in the Bloom filter,
which is populated by lru_gen_look_around() above; a set pmd
indicates that many of its pages are frequently accessed.

Since lru_gen_look_around() and walk_mm() already find hot pmd areas,
their findings are also good sources of khugepaged collapse hints, so
generate collapse hints from there.

Note that lru_gen_look_around() is called with the ptl held, so we
do not want to call khugepaged_add_collapse_hint() directly inside
it, because it may try to allocate memory. Instead, introduce a new
struct area_access_info to carry the access info out of it, and add
the collapse hints after the ptl has been released.

Signed-off-by: Luka Bai <lukabai@xxxxxxxxxxx>
---
include/linux/khugepaged.h | 7 +++++++
include/linux/mmzone.h | 17 +++++++++++++++--
mm/khugepaged.c | 12 ++++++++++++
mm/rmap.c | 27 ++++++++++++++++++++++++++-
mm/vmscan.c | 33 +++++++++++++++++++++++++++++----
5 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 815ae87f0f8e..e0793569a9f0 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -17,6 +17,7 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
+extern int get_khp_collapse_priority(int total, int young);
extern void khugepaged_add_collapse_hint(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -62,6 +63,12 @@ static inline bool current_is_khugepaged(void)
{
return false;
}
+
+static inline int get_khp_collapse_priority(int total, int young)
+{
+ return 0; /* stub: both counts are ignored, priority is always 0 */
+}
+
static inline void khugepaged_add_collapse_hint(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1331a7b93f33..643dd500c121 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -441,6 +441,18 @@ enum lruvec_flags {

#endif /* !__GENERATING_BOUNDS_H */

+/*
+ * Young and total counts gathered while scanning a memory area,
+ * plus the largest folio_order() recorded during that scan. Filled
+ * in by the MGLRU scanners and consumed to build collapse hints.
+ */
+struct area_access_info {
+ unsigned long address; /* set by lru_gen_look_around(), not walk_pte_range() */
+ int total;
+ int young;
+ int max_order;
+};
+
/*
* Evictable folios are divided into multiple generations. The youngest and the
* oldest generation numbers, max_seq and min_seq, are monotonically increasing.
@@ -689,7 +701,8 @@ struct lru_gen_memcg {

void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr,
+ struct area_access_info **acc_info_ptr);

void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -712,7 +725,7 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
}

static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw,
- unsigned int nr)
+ unsigned int nr, struct area_access_info **acc_info_ptr)
{
return false;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2f21c0b6ab46..50c363846720 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -3031,6 +3031,18 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
return result;
}

+/*
+ * Map access counts to a collapse priority: 0 when at least half of @total
+ * is @young, NR_KHUGEPAGED_PRIORITY_LEVEL - 1 otherwise. A legal priority is
+ * always returned, so the caller must first make sure the pmd qualifies.
+ */
+int get_khp_collapse_priority(int total, int young)
+{
+ if (young * 2 >= total)
+ return 0;
+ return NR_KHUGEPAGED_PRIORITY_LEVEL - 1;
+}
+
/*
* khugepaged_add_collapse_hint - enqueue a collapse hint
* @mm: target mm
diff --git a/mm/rmap.c b/mm/rmap.c
index 1c77d5dc06e9..1cd111e7b299 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -75,6 +75,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
#include <linux/oom.h>
+#include <linux/khugepaged.h>

#include <asm/tlb.h>

@@ -911,6 +912,12 @@ struct folio_referenced_arg {
struct mem_cgroup *memcg;
};

+/*
+ * acc_info is currently only used to track access patterns for khugepaged
+ * collapse hints. 3 entries are enough for most cases, and it is harmless
+ * to miss some hints once the array is full.
+ */
+#define NR_ACC_INFO_EACH_ITER 3
/*
* arg: folio_referenced_arg will be passed
*/
@@ -921,6 +928,8 @@ static bool folio_referenced_one(struct folio *folio,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int ptes = 0, referenced = 0;
unsigned int nr;
+ struct area_access_info acc_info[NR_ACC_INFO_EACH_ITER] = {0};
+ int acc_info_count = 0;

while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
@@ -979,8 +988,16 @@ static bool folio_referenced_one(struct folio *folio,
* simplest approach is to disable this look-around optimization.
*/
if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) {
- if (lru_gen_look_around(&pvmw, nr))
+ struct area_access_info *acc_info_ptr = NULL;
+
+ /* If the acc_info is full, skip the remaining ones */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ acc_info_count < NR_ACC_INFO_EACH_ITER)
+ acc_info_ptr = &acc_info[acc_info_count];
+ if (lru_gen_look_around(&pvmw, nr, &acc_info_ptr))
referenced++;
+ if (acc_info_ptr && acc_info_ptr != &acc_info[acc_info_count])
+ acc_info_count++;
} else if (pvmw.pte) {
if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
referenced++;
@@ -1019,6 +1036,14 @@ static bool folio_referenced_one(struct folio *folio,
pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
}

+ for (--acc_info_count; acc_info_count >= 0; acc_info_count--) {
+ khugepaged_add_collapse_hint(vma->vm_mm, vma,
+ acc_info[acc_info_count].address,
+ get_khp_collapse_priority(acc_info[acc_info_count].total,
+ acc_info[acc_info_count].young),
+ acc_info[acc_info_count].max_order);
+ }
+
if (!pra->mapcount)
return false; /* To break the loop */

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e8a90911bf88..a0caf5cac951 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3463,7 +3463,7 @@ static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
}

static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
- struct mm_walk *args)
+ struct mm_walk *args, struct area_access_info *acc_info)
{
int i;
bool dirty;
@@ -3472,6 +3472,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
unsigned long addr;
int total = 0;
int young = 0;
+ int max_order = 0;
struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
@@ -3522,6 +3523,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
max_nr, FPB_MERGE_YOUNG_DIRTY);
total += nr - 1;
walk->mm_stats[MM_LEAF_TOTAL] += nr - 1;
+ max_order = max(max_order, folio_order(folio));
}

if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr))
@@ -3550,6 +3552,9 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
lazy_mmu_mode_disable();
pte_unmap_unlock(pte, ptl);

+ acc_info->young = young;
+ acc_info->max_order = max_order;
+ acc_info->total = total;
return suitable_to_scan(total, young);
}

@@ -3667,6 +3672,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
vma = args->vma;
for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
pmd_t val = pmdp_get_lockless(pmd + i);
+ struct area_access_info acc_info = {0};

next = pmd_addr_end(addr, end);

@@ -3699,11 +3705,16 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,

walk->mm_stats[MM_NONLEAF_FOUND]++;

- if (!walk_pte_range(&val, addr, next, args))
+ if (!walk_pte_range(&val, addr, next, args, &acc_info))
continue;

walk->mm_stats[MM_NONLEAF_ADDED]++;

+ /* When acc_info has valid value */
+ if (acc_info.total > 0)
+ khugepaged_add_collapse_hint(vma->vm_mm, vma, addr,
+ get_khp_collapse_priority(acc_info.total, acc_info.young),
+ acc_info.max_order);
/* carry over to the next generation */
update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
}
@@ -4183,7 +4194,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* the PTE table to the Bloom filter. This forms a feedback loop between the
* eviction and the aging.
*/
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr,
+ struct area_access_info **acc_info_ptr)
{
int i;
bool dirty;
@@ -4202,6 +4214,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
struct lru_gen_mm_state *mm_state;
unsigned long max_seq;
int gen;
+ unsigned int max_order = 0;

lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4265,6 +4278,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)

nr = folio_pte_batch_flags(folio, NULL, pte, &ptent,
max_nr, FPB_MERGE_YOUNG_DIRTY);
+ max_order = max(folio_order(folio), max_order);
}

if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr))
@@ -4288,8 +4302,19 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
lazy_mmu_mode_disable();

/* feedback from rmap walkers to page table walkers */
- if (mm_state && suitable_to_scan(i, young))
+ if (mm_state && suitable_to_scan(i, young)) {
+ if (*acc_info_ptr) {
+ struct area_access_info acc_info = {
+ .address = start,
+ .total = i,
+ .young = young,
+ .max_order = max_order
+ };
+ *(*acc_info_ptr) = acc_info;
+ (*acc_info_ptr)++;
+ }
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+ }

mem_cgroup_put(memcg);


--
2.52.0