[PATCH 09/11] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
From: Joshua Hahn
Date: Wed Mar 11 2026 - 15:54:05 EST
Now that memcg charging happens in the zsmalloc layer where we have both
objcg and page information, we can specify which node's memcg lruvec
zswapped memory should be accounted to.
Move MEMCG_ZSWAP_B and MEMCG_ZSWAPPED_B from enum memcg_stat_item to
enum node_stat_item. Rename their prefixes from MEMCG to NR to reflect
this move as well.
In addition, decouple the updates of node stats (vmstat) and
memcg-lruvec stats, since node stats can only track values at a
PAGE_SIZE granularity.
As a result of tracking zswap statistics at a finer granularity, the
charging from zsmalloc also gets more complicated to cover the cases
when the compressed object spans two zpdescs, which both live on
different nodes. In this case, the memcg-lruvec of both node-memcg
combinations are partially charged.
memcg-lruvec stats are now updated precisely and proportionally when
compressed objects are split across pages. Unfortunately for node stats,
only NR_ZSWAP_B can be kept accurate. NR_ZSWAPPED_B works as a good
best-effort value, but cannot proportionally account for compressed
objects split across nodes due to the coarse PAGE_SIZE granularity of
node stats. For such objects, NR_ZSWAPPED_B is accounted to the first
zpdesc's node stats.
Note that this is not a new inaccuracy, but one that is simply left
unable to be fixed as part of these changes. The small inaccuracy is
accepted in place of invasive changes across all of vmstat
infrastructure to begin tracking stats at byte granularity.
Finally, note that objcg migrations across zspages (and their subsequent
migrations across nodes) are handled in the next patch.
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
include/linux/memcontrol.h | 5 +-
include/linux/mmzone.h | 2 +
include/linux/zsmalloc.h | 6 +--
mm/memcontrol.c | 22 ++++----
mm/vmstat.c | 2 +
mm/zsmalloc.c | 104 +++++++++++++++++++++++++++----------
mm/zswap.c | 7 ++-
7 files changed, 102 insertions(+), 46 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ce2e598b5963..b03501e0c09b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -37,8 +37,6 @@ enum memcg_stat_item {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
MEMCG_NR_STAT,
};
@@ -927,6 +925,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
void mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..ae16a90491ac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -258,6 +258,8 @@ enum node_stat_item {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
NR_BALLOON_PAGES,
NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 6010d8dac9ff..fd79916c7740 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -24,11 +24,11 @@ struct zs_pool_stats {
struct zs_pool;
struct scatterlist;
struct obj_cgroup;
-enum memcg_stat_item;
+enum node_stat_item;
struct zs_pool *zs_create_pool(const char *name, bool memcg_aware,
- enum memcg_stat_item compressed_stat,
- enum memcg_stat_item uncompressed_stat);
+ enum node_stat_item compressed_stat,
+ enum node_stat_item uncompressed_stat);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1cb02d2febe8..d87bc4beff16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -333,6 +333,8 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
};
static const unsigned int memcg_stat_items[] = {
@@ -341,8 +343,6 @@ static const unsigned int memcg_stat_items[] = {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
};
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
@@ -737,9 +737,8 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx,
- int val)
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
@@ -766,6 +765,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
put_cpu();
}
+EXPORT_SYMBOL(mod_memcg_lruvec_state);
/**
* mod_lruvec_state - update lruvec memory statistics
@@ -1363,8 +1363,8 @@ static const struct memory_stat memory_stats[] = {
{ "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
#ifdef CONFIG_ZSWAP
- { "zswap", MEMCG_ZSWAP_B },
- { "zswapped", MEMCG_ZSWAPPED_B },
+ { "zswap", NR_ZSWAP_B },
+ { "zswapped", NR_ZSWAPPED_B },
#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@@ -1411,8 +1411,8 @@ static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
- case MEMCG_ZSWAP_B:
- case MEMCG_ZSWAPPED_B:
+ case NR_ZSWAP_B:
+ case NR_ZSWAPPED_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
return 1;
@@ -5482,7 +5482,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
/* Force flush to get accurate stats for charging */
__mem_cgroup_flush_stats(memcg, true);
- pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ pages = memcg_page_state(memcg, NR_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
ret = false;
@@ -5511,7 +5511,7 @@ static u64 zswap_current_read(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
mem_cgroup_flush_stats(memcg);
- return memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ return memcg_page_state(memcg, NR_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 86b14b0f77b5..389ff986ceac 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1279,6 +1279,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_HUGETLB_PAGE
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
+ [I(NR_ZSWAP_B)] = "zswap",
+ [I(NR_ZSWAPPED_B)] = "zswapped",
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 24665d7cd4a9..ab085961b0e2 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -216,8 +216,8 @@ struct zs_pool {
struct work_struct free_work;
#endif
bool memcg_aware;
- enum memcg_stat_item compressed_stat;
- enum memcg_stat_item uncompressed_stat;
+ enum node_stat_item compressed_stat;
+ enum node_stat_item uncompressed_stat;
/* protect zspage migration/compaction */
rwlock_t lock;
atomic_t compaction_in_progress;
@@ -823,6 +823,9 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
reset_zpdesc(zpdesc);
zpdesc_unlock(zpdesc);
zpdesc_dec_zone_page_state(zpdesc);
+ if (pool->memcg_aware)
+ dec_node_page_state(zpdesc_page(zpdesc),
+ pool->compressed_stat);
zpdesc_put(zpdesc);
zpdesc = next;
} while (zpdesc != NULL);
@@ -974,6 +977,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
__zpdesc_set_zsmalloc(zpdesc);
zpdesc_inc_zone_page_state(zpdesc);
+ if (pool->memcg_aware)
+ inc_node_page_state(zpdesc_page(zpdesc),
+ pool->compressed_stat);
zpdescs[i] = zpdesc;
}
@@ -985,6 +991,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
err:
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
+ if (pool->memcg_aware)
+ dec_node_page_state(zpdesc_page(zpdescs[i]),
+ pool->compressed_stat);
free_zpdesc(zpdescs[i]);
}
if (pool->memcg_aware)
@@ -1029,10 +1038,48 @@ static bool zspage_empty(struct zspage *zspage)
}
#ifdef CONFIG_MEMCG
-static void zs_charge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
- int size)
+static void __zs_mod_memcg_lruvec(struct zs_pool *pool, struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ int sign, unsigned long offset)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int compressed_size = size, original_size = PAGE_SIZE;
+ int nid = page_to_nid(zpdesc_page(zpdesc));
+ int next_nid = nid;
+
+ if (offset + size > PAGE_SIZE) {
+ struct zpdesc *next_zpdesc = get_next_zpdesc(zpdesc);
+
+ next_nid = page_to_nid(zpdesc_page(next_zpdesc));
+ if (nid != next_nid) {
+ compressed_size = PAGE_SIZE - offset;
+ original_size = (PAGE_SIZE * compressed_size) / size;
+ }
+ }
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ mod_memcg_lruvec_state(lruvec, pool->compressed_stat,
+ sign * compressed_size);
+ mod_memcg_lruvec_state(lruvec, pool->uncompressed_stat,
+ sign * original_size);
+
+ if (nid != next_nid) {
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(next_nid));
+ mod_memcg_lruvec_state(lruvec, pool->compressed_stat,
+ sign * (size - compressed_size));
+ mod_memcg_lruvec_state(lruvec, pool->uncompressed_stat,
+ sign * (PAGE_SIZE - original_size));
+ }
+ rcu_read_unlock();
+}
+
+static void zs_charge_objcg(struct zs_pool *pool, struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ unsigned long offset)
+{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
@@ -1044,18 +1091,19 @@ static void zs_charge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
VM_WARN_ON_ONCE(1);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, pool->compressed_stat, size);
- mod_memcg_state(memcg, pool->uncompressed_stat, PAGE_SIZE);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(pool, zpdesc, objcg, size, 1, offset);
+
+ /*
+ * Node-level vmstats are charged in PAGE_SIZE units. As a best-effort,
+ * always charge the uncompressed stats to the first zpdesc.
+ */
+ inc_node_page_state(zpdesc_page(zpdesc), pool->uncompressed_stat);
}
-static void zs_uncharge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
- int size)
+static void zs_uncharge_objcg(struct zs_pool *pool, struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ unsigned long offset)
{
- struct mem_cgroup *memcg;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
@@ -1063,20 +1111,24 @@ static void zs_uncharge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
obj_cgroup_uncharge(objcg, size);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, pool->compressed_stat, -size);
- mod_memcg_state(memcg, pool->uncompressed_stat, -(int)PAGE_SIZE);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(pool, zpdesc, objcg, size, -1, offset);
+
+ /*
+ * Node-level vmstats are charged in PAGE_SIZE units. As a best-effort,
+ * always uncharge the uncompressed stats from the first zpdesc.
+ */
+ dec_node_page_state(zpdesc_page(zpdesc), pool->uncompressed_stat);
}
#else
-static void zs_charge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
- int size)
+static void zs_charge_objcg(struct zs_pool *pool, struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ unsigned long offset)
{
}
-static void zs_uncharge_objcg(struct zs_pool *pool, struct obj_cgroup *objcg,
- int size)
+static void zs_uncharge_objcg(struct zs_pool *pool, struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ unsigned long offset)
{
}
#endif
@@ -1298,7 +1350,7 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
WARN_ON_ONCE(!pool->memcg_aware);
zspage->objcgs[obj_idx] = objcg;
obj_cgroup_get(objcg);
- zs_charge_objcg(pool, objcg, class->size);
+ zs_charge_objcg(pool, zpdesc, objcg, class->size, off);
}
if (!ZsHugePage(zspage))
@@ -1477,7 +1529,7 @@ static void obj_free(int class_size, unsigned long obj)
if (pool->memcg_aware && zspage->objcgs[f_objidx]) {
struct obj_cgroup *objcg = zspage->objcgs[f_objidx];
- zs_uncharge_objcg(pool, objcg, class_size);
+ zs_uncharge_objcg(pool, f_zpdesc, objcg, class_size, f_offset);
obj_cgroup_put(objcg);
zspage->objcgs[f_objidx] = NULL;
}
@@ -2191,8 +2243,8 @@ static int calculate_zspage_chain_size(int class_size)
* otherwise NULL.
*/
struct zs_pool *zs_create_pool(const char *name, bool memcg_aware,
- enum memcg_stat_item compressed_stat,
- enum memcg_stat_item uncompressed_stat)
+ enum node_stat_item compressed_stat,
+ enum node_stat_item uncompressed_stat)
{
int i;
struct zs_pool *pool;
diff --git a/mm/zswap.c b/mm/zswap.c
index d81e2db4490b..2e9352b46693 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -256,8 +256,7 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zs_pool = zs_create_pool(name, true, MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B);
+ pool->zs_pool = zs_create_pool(name, true, NR_ZSWAP_B, NR_ZSWAPPED_B);
if (!pool->zs_pool)
goto error;
@@ -1214,9 +1213,9 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
*/
if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ nr_backing = memcg_page_state(memcg, NR_ZSWAP_B);
nr_backing >>= PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B);
+ nr_stored = memcg_page_state(memcg, NR_ZSWAPPED_B);
nr_stored >>= PAGE_SHIFT;
} else {
nr_backing = zswap_total_pages();
--
2.52.0