[PATCH RFC 26/32] mm/workingset: properly define the format of a folio shadow
From: Kairui Song via B4 Relay
Date: Fri May 01 2026 - 17:07:01 EST
From: Kairui Song <kasong@xxxxxxxxxxx>
The shadow of an evicted folio can be roughly divided into two parts:
- The common and mandatory pack info, which contains the memcg ID, the
workingset bit, and the node ID (pgdat).
- LRU specific eviction info, which is a "timestamp" for the Active/Inactive
LRU, and a generation sequence number for MGLRU.
The common pack part has the same layout for both the Active/Inactive LRU
and MGLRU, and the data stored in it is always exact. The eviction info
part, in contrast, may be truncated. That is OK since the eviction info
is just a hint for the LRU to determine what to do with a refaulted
folio, and in the worst case a truncated value only has a limited effect
on the system's performance.
Add some comments on this, and consolidate the macros for these two
parts.
Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
mm/workingset.c | 61 +++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 21 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index b472ac34943e..622e00ac28b6 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -184,13 +184,35 @@
* refault distance will immediately activate the refaulting page.
*/
-#define WORKINGSET_SHIFT 1
-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- WORKINGSET_SHIFT + NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
-#define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT)
-#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
-#define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON)
+/*
+ * Active/Inactive LRU, MGLRU have different info embedded in the shadow.
+ * Shadow format:
+ *            /      LRU Eviction Info      \ / LRU Pack Info \
+ *            +------------------------------+----------------+-+
+ * non-MGLRU: |SC|    eviction timestamp     | NID | MCID | W |1|
+ * MGLRU:     |SC|    seq number    |  refs  | NID | MCID | W |1|
+ *             ^                                ^      ^    ^  ^
+ * Swap Count (anon only) NUMA ID (NODES_SHIFT)-+      |    |  XA_VALUE
+ * Memory Cgroup ID (MEM_CGROUP_ID_SHIFT) -------------+    |  mark
+ * Workingset Bit (WORKINGSET_SHIFT) -----------------------+
+ *
+ * A shadow is an XA_VALUE, so 63 (64-bit) or 31 (32-bit) bits are usable.
+ *
+ * The LRU pack info part is used to identify which lruvec a folio was
+ * evicted from. This part is always accurate, so we never lose track
+ * of which lruvec a refault belongs to.
+ *
+ * Eviction info is either a snapshot of the `evictions` counter of an
+ * lruvec when the folio was evicted (lru timestamp, for active/inactive
+ * LRU), or the min_seq number when the folio was evicted (MGLRU). This
+ * part may be truncated to fit the available bits, so the info may be
+ * inaccurate, which is tolerable since it is only used as a hint.
+ */
+#define WORKINGSET_SHIFT 1
+#define LRU_PACK_BITS (NODES_SHIFT + MEM_CGROUP_ID_SHIFT + \
+ WORKINGSET_SHIFT)
+#define LRU_EVICT_BITS (BITS_PER_XA_VALUE - LRU_PACK_BITS)
+#define LRU_EVICT_BITS_ANON (LRU_EVICT_BITS - SWAP_COUNT_SHIFT)
/*
* LRU refs uses LRU_REFS_WIDTH + 2 bits, the 2 bits are PG_workingset and
@@ -212,7 +234,9 @@ static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset, bool file)
{
- eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
+ BUILD_BUG_ON(LRU_EVICT_BITS_ANON <= SWAP_COUNT_SHIFT);
+
+ eviction &= BIT(file ? LRU_EVICT_BITS : LRU_EVICT_BITS_ANON) - 1;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << WORKINGSET_SHIFT) | workingset;
@@ -257,8 +281,7 @@ static void *lru_gen_eviction(struct folio *folio)
struct pglist_data *pgdat = folio_pgdat(folio);
unsigned short memcg_id;
- BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_BITS >
- BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_BITS > LRU_EVICT_BITS_ANON);
rcu_read_lock();
memcg = folio_memcg(folio);
@@ -284,7 +307,7 @@ static bool lru_gen_test_recent(struct lruvec *lruvec,
unsigned long max_seq;
max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
- max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_BITS;
+ max_seq &= BIT((file ? LRU_EVICT_BITS : LRU_EVICT_BITS_ANON) - LRU_REFS_BITS) - 1;
return abs_diff(max_seq, token >> LRU_REFS_BITS) < MAX_NR_GENS;
}
@@ -512,7 +535,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
*/
eviction <<= bucket_order[file];
distance = ((refault - eviction) &
- (file ? EVICTION_MASK : EVICTION_MASK_ANON));
+ (BIT(file ? LRU_EVICT_BITS : LRU_EVICT_BITS_ANON) - 1));
/*
* Compare the distance to the existing workingset size. We
@@ -781,12 +804,10 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
- unsigned int timestamp_bits, timestamp_bits_anon;
struct shrinker *workingset_shadow_shrinker;
unsigned int max_order;
int ret = -ENOMEM;
- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
* Calculate the eviction bucket size to cover the longest
* actionable refault distance, which is currently half of
@@ -794,15 +815,13 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
- timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
max_order = fls_long(totalram_pages() - 1);
- if (max_order > (BITS_PER_LONG - EVICTION_SHIFT))
- bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
- if (max_order > timestamp_bits_anon)
- bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
+ if (max_order > LRU_EVICT_BITS)
+ bucket_order[WORKINGSET_FILE] = max_order - LRU_EVICT_BITS;
+ if (max_order > LRU_EVICT_BITS_ANON)
+ bucket_order[WORKINGSET_ANON] = max_order - LRU_EVICT_BITS_ANON;
pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
- timestamp_bits, timestamp_bits_anon, max_order,
+ LRU_EVICT_BITS, LRU_EVICT_BITS_ANON, max_order,
bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
--
2.54.0