[PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
From: Joshua Hahn
Date: Thu Feb 26 2026 - 14:35:47 EST
Now that memcg charging happens in the zsmalloc layer where we have both
objcg and page information, we can specify which node's memcg lruvec
zswapped memory should be accounted to.
Move MEMCG_ZSWAP_B and MEMCG_ZSWAPPED_B from enum memcg_stat_item to
enum node_stat_item, and add them to memcg_node_stat_items. Rename their
prefix from MEMCG to NR to reflect this move as well.
In addition, decouple the updates of node stats (vmstat) and
memcg-lruvec stats, since node stats can only track values at a
PAGE_SIZE granularity.
Finally, track the moving charges whenever a compressed object migrates
from one zspage to another.
memcg-lruvec stats are now updated precisely and proportionally when
compressed objects are split across pages. Unfortunately for node stats,
only NR_ZSWAP_B can be kept accurate. NR_ZSWAPPED_B works as a good
best-effort value, but cannot proportionally account for compressed
objects split across pages due to the coarse PAGE_SIZE granularity
of node stats. For such objects, NR_ZSWAPPED_B is accounted to the first
zpdesc's node stats.
Note that this is not a new inaccuracy, but one that is simply left
unable to be fixed as part of these changes. The small inaccuracy is
accepted in place of invasive changes across all of vmstat
infrastructure to begin tracking stats at byte granularity.
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
include/linux/memcontrol.h | 5 +--
include/linux/mmzone.h | 2 ++
mm/memcontrol.c | 18 +++++-----
mm/vmstat.c | 2 ++
mm/zsmalloc.c | 72 ++++++++++++++++++++++++++++++--------
mm/zswap.c | 4 +--
6 files changed, 76 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d3952c918fd4..ba97b86d9104 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -37,8 +37,6 @@ enum memcg_stat_item {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
MEMCG_NR_STAT,
};
@@ -932,6 +930,9 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
void mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val);
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
+
static inline void mod_memcg_page_state(struct page *page,
enum memcg_stat_item idx, int val)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..ae16a90491ac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -258,6 +258,8 @@ enum node_stat_item {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
NR_BALLOON_PAGES,
NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b662902d4e03..dc7cfff97296 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -331,6 +331,8 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
};
static const unsigned int memcg_stat_items[] = {
@@ -339,8 +341,6 @@ static const unsigned int memcg_stat_items[] = {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
};
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
@@ -726,7 +726,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
+void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
@@ -1344,8 +1344,8 @@ static const struct memory_stat memory_stats[] = {
{ "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
#ifdef CONFIG_ZSWAP
- { "zswap", MEMCG_ZSWAP_B },
- { "zswapped", MEMCG_ZSWAPPED_B },
+ { "zswap", NR_ZSWAP_B },
+ { "zswapped", NR_ZSWAPPED_B },
#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@@ -1392,8 +1392,8 @@ static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
- case MEMCG_ZSWAP_B:
- case MEMCG_ZSWAPPED_B:
+ case NR_ZSWAP_B:
+ case NR_ZSWAPPED_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
return 1;
@@ -5424,7 +5424,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
/* Force flush to get accurate stats for charging */
__mem_cgroup_flush_stats(memcg, true);
- pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ pages = memcg_page_state(memcg, NR_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
ret = false;
@@ -5453,7 +5453,7 @@ static u64 zswap_current_read(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
mem_cgroup_flush_stats(memcg);
- return memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ return memcg_page_state(memcg, NR_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 99270713e0c1..4b10610bd999 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1279,6 +1279,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_HUGETLB_PAGE
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
+ [I(NR_ZSWAP_B)] = "zswap",
+ [I(NR_ZSWAPPED_B)] = "zswapped",
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 6794927c60fb..548e7f4b8bf6 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -810,6 +810,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct zpdesc *zpdesc, *next;
+ bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
assert_spin_locked(&class->lock);
@@ -823,6 +824,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
reset_zpdesc(zpdesc);
zpdesc_unlock(zpdesc);
zpdesc_dec_zone_page_state(zpdesc);
+ if (objcg)
+ dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
zpdesc_put(zpdesc);
zpdesc = next;
} while (zpdesc != NULL);
@@ -963,11 +966,45 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
return true;
}
-static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
- int size, unsigned long offset)
+static void __zs_mod_memcg_lruvec(struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ int sign, unsigned long offset)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int compressed_size = size, original_size = PAGE_SIZE;
+ int nid = page_to_nid(zpdesc_page(zpdesc));
+ int next_nid = nid;
+
+ if (offset + size > PAGE_SIZE) {
+ struct zpdesc *next_zpdesc = get_next_zpdesc(zpdesc);
+
+ next_nid = page_to_nid(zpdesc_page(next_zpdesc));
+ if (nid != next_nid) {
+ compressed_size = PAGE_SIZE - offset;
+ original_size = (PAGE_SIZE * compressed_size) / size;
+ }
+ }
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B, sign * compressed_size);
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B, sign * original_size);
+
+ if (nid != next_nid) {
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(next_nid));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B,
+ sign * (size - compressed_size));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B,
+ sign * (PAGE_SIZE - original_size));
+ }
+ rcu_read_unlock();
+}
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset)
+{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
@@ -977,28 +1014,30 @@ static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
VM_WARN_ON_ONCE(1);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, 1);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(zpdesc, objcg, size, 1, offset);
+
+ /*
+ * Node-level vmstats are charged in PAGE_SIZE units. As a
+ * best-effort, always charge NR_ZSWAPPED_B to the first zpdesc.
+ */
+ inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B);
}
static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
int size, unsigned long offset)
{
- struct mem_cgroup *memcg;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
obj_cgroup_uncharge(objcg, size);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, -1);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(zpdesc, objcg, size, -1, offset);
+
+ /*
+ * Node-level vmstats are uncharged in PAGE_SIZE units. As a
+ * best-effort, always uncharge NR_ZSWAPPED_B to the first zpdesc.
+ */
+ dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B);
}
static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
@@ -1135,6 +1174,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
__zpdesc_set_zsmalloc(zpdesc);
zpdesc_inc_zone_page_state(zpdesc);
+ if (objcg)
+ inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
zpdescs[i] = zpdesc;
}
@@ -1149,6 +1190,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
err:
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
+ if (objcg)
+ dec_node_page_state(zpdesc_page(zpdescs[i]),
+ NR_ZSWAP_B);
free_zpdesc(zpdescs[i]);
}
cache_free_zspage(zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 97f38d0afa86..9e845e1d7214 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1214,9 +1214,9 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
*/
if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ nr_backing = memcg_page_state(memcg, NR_ZSWAP_B);
nr_backing >>= PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B);
+ nr_stored = memcg_page_state(memcg, NR_ZSWAPPED_B);
nr_stored >>= PAGE_SHIFT;
} else {
nr_backing = zswap_total_pages();
--
2.47.3