[PATCH v5 update 29/32] mm: memcontrol: prepare for reparenting non-hierarchical stats
From: Qi Zheng
Date: Sat Feb 28 2026 - 02:26:35 EST
From: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
To resolve the dying memcg issue, we need to reparent the LRU folios of a
child memcg to its parent memcg. This could cause problems for
non-hierarchical stats.
As Yosry Ahmed pointed out:
```
In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).
```
Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:
a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]
To ensure that these non-hierarchical stats work properly, we need to
reparent them after reparenting the LRU folios. To this end, this commit
makes the following preparations:
1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() run as RCU work, and implement
get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid a race
between mod_memcg_state()/mod_memcg_lruvec_state() and
reparent_state_local()
Co-developed-by: Yosry Ahmed <yosry@xxxxxxxxxx>
Signed-off-by: Yosry Ahmed <yosry@xxxxxxxxxx>
Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
Acked-by: Shakeel Butt <shakeel.butt@xxxxxxxxx>
---
kernel/cgroup/cgroup.c | 8 +--
mm/memcontrol-v1.c | 16 +++++
mm/memcontrol-v1.h | 7 ++
mm/memcontrol.c | 146 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 173 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda3179..74344e3931743 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6044,8 +6044,8 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+ struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6066,8 +6066,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_offline_wq, &css->destroy_work);
+ INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+ queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index fe42ef664f1e1..51fb4406f45cf 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1897,6 +1897,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
+/* Hand @memcg's local count of every memcg1_stats[] item over to @parent. */
+void reparent_memcg1_state_local(struct mem_cgroup *memcg,
+				 struct mem_cgroup *parent)
+{
+	for (unsigned int n = 0; n < ARRAY_SIZE(memcg1_stats); n++)
+		reparent_memcg_state_local(memcg, parent, memcg1_stats[n]);
+}
+
+/* Reparent every LRU list's local count; the callee takes a node_stat_item. */
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg,
+					struct mem_cgroup *parent)
+{
+	for (int lru = 0; lru < NR_LRU_LISTS; lru++)
+		reparent_memcg_lruvec_state_local(memcg, parent, NR_LRU_BASE + lru);
+}
+
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 4041b5027a94b..05e6ff40f7556 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -77,6 +77,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5929e397c3c31..7b61bb663042b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -226,6 +226,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
return objcg;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+/*
+ * Move @memcg's non-hierarchical (state_local) stats over to @parent.
+ * Does nothing when cgroup_subsys_on_dfl(memory_cgrp_subsys) is true.
+ *
+ * @memcg is flushed first so that its stats are read accurately, and
+ * @parent is conservatively flushed afterwards to avoid hiding a potentially
+ * large stat update (e.g. from callers of
+ * mem_cgroup_flush_stats_ratelimited()).
+ */
+static inline void reparent_state_local(struct mem_cgroup *memcg,
+					struct mem_cgroup *parent)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+		__mem_cgroup_flush_stats(memcg, true);
+		reparent_memcg1_state_local(memcg, parent);
+		reparent_memcg1_lruvec_state_local(memcg, parent);
+		__mem_cgroup_flush_stats(parent, true);
+	}
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
spin_lock_irq(&objcg_lock);
@@ -473,6 +501,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val);
+
+/* On every node, move @memcg's local count of stat @idx over to @parent. */
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+				       struct mem_cgroup *parent, int idx)
+{
+	int nid;
+
+	if (WARN_ONCE(BAD_STAT_IDX(memcg_stats_index(idx)),
+		      "%s: missing stat item %d\n", __func__, idx))
+		return;
+
+	for_each_node(nid) {
+		struct lruvec *from = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+		struct lruvec *to = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+		unsigned long nr = lruvec_page_state_local(from, idx);
+
+		__mod_memcg_lruvec_state(from, idx, -nr);
+		__mod_memcg_lruvec_state(to, idx, nr);
+	}
+}
+#endif
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -718,6 +770,42 @@ static int memcg_state_val_in_pages(int idx, int val)
return max(val * unit / PAGE_SIZE, 1UL);
}
+#ifdef CONFIG_MEMCG_V1
+/*
+ * get_non_dying_memcg_start() and get_non_dying_memcg_end() are meant to
+ * bracket the stat updates in mod_memcg_state() and mod_memcg_lruvec_state()
+ * so that they do not race with the reparenting of non-hierarchical
+ * state_locals.
+ */
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+		/* Still held on return; get_non_dying_memcg_end() drops it. */
+		rcu_read_lock();
+		/* Climb from @memcg to the nearest memcg that is not dying. */
+		while (memcg_is_dying(memcg))
+			memcg = parent_mem_cgroup(memcg);
+	}
+
+	return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+	return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+}
+#endif
+
/**
* mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
@@ -763,6 +851,64 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+
+/* Add @val to @memcg's per-CPU @idx counter; the stat index is not validated. */
+static void __mod_memcg_state(struct mem_cgroup *memcg,
+			      enum memcg_stat_item idx, int val)
+{
+	const int slot = memcg_stats_index(idx);
+	int cpu;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	cpu = get_cpu();
+	/* The counter takes the raw @val; rstat and the tracepoint the converted one. */
+	this_cpu_add(memcg->vmstats_percpu->state[slot], val);
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val, cpu);
+	trace_mod_memcg_state(memcg, idx, val);
+	put_cpu();
+}
+
+/*
+ * Add @val to the per-CPU @idx counters of both the memcg that @lruvec's
+ * per-node structure points to and that per-node structure itself. The stat
+ * index is not validated here.
+ */
+static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+				     enum node_stat_item idx, int val)
+{
+	struct mem_cgroup_per_node *pn =
+		container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	struct mem_cgroup *memcg = pn->memcg;
+	const int slot = memcg_stats_index(idx);
+	int cpu;
+
+	cpu = get_cpu();
+	/* memcg-wide counter first, then the per-node (lruvec) one. */
+	this_cpu_add(memcg->vmstats_percpu->state[slot], val);
+	this_cpu_add(pn->lruvec_stats_percpu->state[slot], val);
+
+	/* rstat and the tracepoint take the converted value. */
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val, cpu);
+	trace_mod_memcg_lruvec_state(memcg, idx, val);
+	put_cpu();
+}
+
+/* Move @memcg's local count of stat @idx over to @parent. */
+void reparent_memcg_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent, int idx)
+{
+	int i = memcg_stats_index(idx);
+	unsigned long value;
+
+	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
+		return;
+	/* Read the counter only once @idx has passed the check above. */
+	value = memcg_page_state_local(memcg, idx);
+	__mod_memcg_state(memcg, idx, -value);
+	__mod_memcg_state(parent, idx, value);
+}
#endif
static void mod_memcg_lruvec_state(struct lruvec *lruvec,
--
2.20.1