[PATCH 4/7 v3] mm/memcontrol: convert memcg to use page_counter_stock

From: Joshua Hahn

Date: Mon May 25 2026 - 15:06:24 EST

Now with all of the memcg_stock handling logic replicated in
page_counter_stock, switch memcg to use the page_counter_stock.

There are a few details that have changed:

First, the old special-casing for the !allow_spinning check to avoid
refilling and flushing of the old stock is removed. This special casing
was important previously, because refilling the stock could do a lot of
extra work by evicting one of 7 random victim memcgs in the percpu
memcg_stock slots. In the new per-counter design, refilling stock just adds
pages to the counter's own local cache without affecting other memcgs,
so the original reason for the special case no longer applies.

Also, we can now fail during page_counter_enable_stock(), if there is
not enough memory to allocate a percpu page_counter_stock. This failure
is rare and nonfatal; the system can continue to operate, with the page
counter working without stock and falling back to walking the hierarchy.

Finally, drain_all_stock is restructured to iterate CPUs in the outer
loop (rather than memcgs) to be able to schedule draining all memcgs
via a single work_on_cpu call. It reduces the number of synchronous
per-CPU work calls from O(memcgs * CPUs) to just O(CPUs).

Note that obj_stock remains untouched by these changes.

Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
mm/memcontrol.c | 78 +++++++++++++++++++++----------------------------
1 file changed, 34 insertions(+), 44 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 368efc1455e35..952c6f7430395 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2260,6 +2260,17 @@ static void schedule_drain_work(int cpu, struct work_struct *work)
queue_work_on(cpu, memcg_wq, work);
}

+static long drain_stock_on_cpu(void *arg)
+{
+ struct mem_cgroup *root_memcg = arg;
+ struct mem_cgroup *memcg;
+
+ for_each_mem_cgroup_tree(memcg, root_memcg)
+ page_counter_drain_stock_local(&memcg->memory);
+
+ return 0;
+}
+
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
@@ -2271,28 +2282,16 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
return;
- /*
- * Notify other cpus that system-wide "drain" is running
- * We do not care about races with the cpu hotplug because cpu down
- * as well as workers from this path always operate on the local
- * per-cpu data. CPU up doesn't touch memcg_stock at all.
- */
+
+ for_each_online_cpu(cpu)
+ work_on_cpu(cpu, drain_stock_on_cpu, root_memcg);
+
+ /* Drain obj_stock on all online CPUs */
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);

- if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
- is_memcg_drain_needed(memcg_st, root_memcg) &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE,
- &memcg_st->flags)) {
- if (cpu == curcpu)
- drain_local_memcg_stock(&memcg_st->work);
- else
- schedule_drain_work(cpu, &memcg_st->work);
- }
-
if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
obj_stock_flush_required(obj_st, root_memcg) &&
!test_and_set_bit(FLUSHING_CACHED_CHARGE,
@@ -2309,9 +2308,13 @@ void drain_all_stock(struct mem_cgroup *root_memcg)

static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
+ struct mem_cgroup *memcg;
+
/* no need for the local lock */
drain_obj_stock(&per_cpu(obj_stock, cpu));
- drain_stock_fully(&per_cpu(memcg_stock, cpu));
+
+ for_each_mem_cgroup(memcg)
+ page_counter_drain_stock_cpu(&memcg->memory, cpu);

return 0;
}
@@ -2586,7 +2589,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -2599,31 +2601,19 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);

retry:
- if (consume_stock(memcg, nr_pages))
- return 0;
-
- if (!allow_spinning)
- /* Avoid the refill and flush of the older stock */
- batch = nr_pages;
-
reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (!do_memsw_account() ||
- page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (page_counter_try_charge(&memcg->memory, batch, &counter))
+ page_counter_try_charge(&memcg->memsw, nr_pages, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
goto done_restock;
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}

- if (batch > nr_pages) {
- batch = nr_pages;
- goto retry;
- }
-
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
@@ -2720,9 +2710,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
return 0;

done_restock:
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
-
/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
@@ -2759,7 +2746,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* and distribute reclaim work and delay penalties
* based on how much each task is actually allocating.
*/
- current->memcg_nr_pages_over_high += batch;
+ current->memcg_nr_pages_over_high += nr_pages;
set_notify_resume(current);
break;
}
@@ -3064,7 +3051,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
- refill_stock(memcg, nr_pages);
+ memcg_uncharge(memcg, nr_pages);

css_put(&memcg->css);
}
@@ -4096,6 +4083,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)

static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ page_counter_free_stock(&memcg->memory);
+ page_counter_free_stock(&memcg->memsw);
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
@@ -4268,6 +4257,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
refcount_set(&memcg->id.ref, 1);
css_get(css);

+ /* failure is nonfatal, charges fall back to direct hierarchy */
+ page_counter_enable_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+
/*
* Ensure mem_cgroup_from_private_id() works once we're fully online.
*
@@ -4330,6 +4322,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
lru_gen_offline_memcg(memcg);

drain_all_stock(memcg);
+ page_counter_disable_stock(&memcg->memory);

mem_cgroup_private_id_put(memcg, 1);
}
@@ -5524,7 +5517,7 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)

mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);

- refill_stock(memcg, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
}

void mem_cgroup_flush_workqueue(void)
@@ -5577,12 +5570,9 @@ int __init mem_cgroup_init(void)
memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
WARN_ON(!memcg_wq);

- for_each_possible_cpu(cpu) {
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_memcg_stock);
+ for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
drain_local_obj_stock);
- }

memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
--
2.53.0-Meta