Re: [PATCH v5 29/32] mm: memcontrol: prepare for reparenting non-hierarchical stats

From: Qi Zheng

Date: Thu Feb 26 2026 - 22:12:32 EST


Hi Yosry,

On 2/26/26 11:16 PM, Yosry Ahmed wrote:
Did you measure the impact of making state_local atomic on the flush
path? It's a slow path but we've seen pain from it being too slow
before, because it extends the critical section of the rstat flush
lock.

Qi, please measure the impact on flushing and if no impact then no need to do
anything as I don't want anymore churn in this series.


Can we keep this non-atomic and use mod_memcg_lruvec_state() here? It
will update the stat on the local counter and it will be added to
state_local in the flush path when needed. We can even force another
flush in reparent_state_local () after reparenting is completed, if we
want to avoid leaving a potentially large stat update pending, as it
can be missed by mem_cgroup_flush_stats_ratelimited().

Same for reparent_memcg_state_local(), we can probably use mod_memcg_state()?

Yosry, do you mind sending the patch you are thinking about over this series?

Honestly, I'd rather squash it into this patch if possible. It avoids
churn in the history (switch to atomics and back), and is arguably
simpler than checking for regressions in the flush path.

What I have in mind is the diff below (build tested only). Qi, would
you be able to test this? It applies directly on this patch in mm-new:

Thank you so much for doing this! I'm willing to squash and test.

And I found some issues with the diff:


diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d82dbfcc28057..404565e80cbf3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -234,11 +234,18 @@ static inline void reparent_state_local(struct
mem_cgroup *memcg, struct mem_cgr
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;

+ /*
+ * Reparent stats exposed non-hierarchically. Flush @memcg's
stats first to
+ * read its stats accurately , and conservatively flush @parent's stats
+ * after reparenting to avoid hiding a potentially large stat update
+ * (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+ */
__mem_cgroup_flush_stats(memcg, true);

- /* The following counts are all non-hierarchical and need to
be reparented. */
reparent_memcg1_state_local(memcg, parent);
reparent_memcg1_lruvec_state_local(memcg, parent);
+
+ __mem_cgroup_flush_stats(parent, true);
}
#else
static inline void reparent_state_local(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
@@ -442,7 +449,7 @@ struct lruvec_stats {
long state[NR_MEMCG_NODE_STAT_ITEMS];

/* Non-hierarchical (CPU aggregated) state */
- atomic_long_t state_local[NR_MEMCG_NODE_STAT_ITEMS];
+ long state_local[NR_MEMCG_NODE_STAT_ITEMS];

/* Pending child counts during tree propagation */
long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
@@ -485,7 +492,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return 0;

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = atomic_long_read(&(pn->lruvec_stats->state_local[i]));
+ x = READ_ONCE(pn->lruvec_stats->state_local[i]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -493,6 +500,10 @@ unsigned long lruvec_page_state_local(struct
lruvec *lruvec,
return x;
}

+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx,
+ int val);
+
#ifdef CONFIG_MEMCG_V1
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
struct mem_cgroup *parent, int idx)
@@ -506,12 +517,10 @@ void reparent_memcg_lruvec_state_local(struct
mem_cgroup *memcg,
for_each_node(nid) {
struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg,
NODE_DATA(nid));
struct lruvec *parent_lruvec =
mem_cgroup_lruvec(parent, NODE_DATA(nid));
- struct mem_cgroup_per_node *parent_pn;
unsigned long value =
lruvec_page_state_local(child_lruvec, idx);

- parent_pn = container_of(parent_lruvec, struct
mem_cgroup_per_node, lruvec);
-
- atomic_long_add(value,
&(parent_pn->lruvec_stats->state_local[i]));
+ mod_memcg_lruvec_state(child_lruvec, idx, -value);

We can't use mod_memcg_lruvec_state() here, because child memcg has
already been set CSS_DYING. So in mod_memcg_lruvec_state(), we will
get parent memcg.

It seems we need to reimplement a function or add a parameter to
mod_memcg_lruvec_state() to solve the problem. What do you think?

+ mod_memcg_lruvec_state(parent_lruvec, idx, value);
}
}
#endif
@@ -598,7 +607,7 @@ struct memcg_vmstats {
unsigned long events[NR_MEMCG_EVENTS];

/* Non-hierarchical (CPU aggregated) page state & events */
- atomic_long_t state_local[MEMCG_VMSTAT_SIZE];
+ long state_local[MEMCG_VMSTAT_SIZE];
unsigned long events_local[NR_MEMCG_EVENTS];

/* Pending child counts during tree propagation */
@@ -835,7 +844,7 @@ unsigned long memcg_page_state_local(struct
mem_cgroup *memcg, int idx)
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n",
__func__, idx))
return 0;

- x = atomic_long_read(&(memcg->vmstats->state_local[i]));
+ x = READ_ONCE(memcg->vmstats->state_local[i]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -852,7 +861,8 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n",
__func__, idx))
return;

- atomic_long_add(value, &(parent->vmstats->state_local[i]));
+ mod_memcg_state(memcg, idx, -value);

Same as mod_memcg_lruvec_state().

Thanks,
Qi

+ mod_memcg_state(parent, idx, value);
}
#endif

@@ -4174,8 +4184,6 @@ struct aggregate_control {
long *aggregate;
/* pointer to the non-hierarchichal (CPU aggregated) counters */
long *local;
- /* pointer to the atomic non-hierarchichal (CPU aggregated) counters */
- atomic_long_t *alocal;
/* pointer to the pending child counters during tree propagation */
long *pending;
/* pointer to the parent's pending counters, could be NULL */
@@ -4213,12 +4221,8 @@ static void mem_cgroup_stat_aggregate(struct
aggregate_control *ac)
}

/* Aggregate counts on this level and propagate upwards */
- if (delta_cpu) {
- if (ac->local)
- ac->local[i] += delta_cpu;
- else if (ac->alocal)
- atomic_long_add(delta_cpu, &(ac->alocal[i]));
- }
+ if (delta_cpu)
+ ac->local[i] += delta_cpu;

if (delta) {
ac->aggregate[i] += delta;
@@ -4289,8 +4293,7 @@ static void mem_cgroup_css_rstat_flush(struct
cgroup_subsys_state *css, int cpu)

ac = (struct aggregate_control) {
.aggregate = memcg->vmstats->state,
- .local = NULL,
- .alocal = memcg->vmstats->state_local,
+ .local = memcg->vmstats->state_local,
.pending = memcg->vmstats->state_pending,
.ppending = parent ? parent->vmstats->state_pending : NULL,
.cstat = statc->state,
@@ -4323,8 +4326,7 @@ static void mem_cgroup_css_rstat_flush(struct
cgroup_subsys_state *css, int cpu)

ac = (struct aggregate_control) {
.aggregate = lstats->state,
- .local = NULL,
- .alocal = lstats->state_local,
+ .local = lstats->state_local,
.pending = lstats->state_pending,
.ppending = plstats ? plstats->state_pending : NULL,
.cstat = lstatc->state,