[PATCH v4 1/5] mm/page_counter: introduce per-page_counter stock

From: Joshua Hahn

Date: Tue Jun 23 2026 - 14:01:36 EST


In order to avoid expensive hierarchy walks on every memcg charge and
limit check, memcontrol uses per-cpu stocks (memcg_stock_pcp) to cache
pre-charged pages and introduce a fast path to try_charge_memcg.
However, there are a few quirks with the current implementation that
could be improved upon.

First, each memcg_stock_pcp can only cache the charges of 7 memcgs
(defined as NR_MEMCG_STOCK), which means that once a CPU starts handling
the charging of more than 7 memcgs, it randomly selects a victim memcg
to evict and drain from the CPU, which can cause unnecessarily increased
latencies and thrashing as memcgs continually evict each other's stock.

Flushing a memcg's stock on a CPU also flushes all other stock present
on that CPU, leading to poor caching for systems running multiple memcgs
competing for the same CPUs.

Finally, stock is tightly coupled with memcg, which means that all page
counters in a memcg share the same resource. This may simplify some of
the charging logic, but it prevents newly added page counters from
using a separate stock.

We can address these concerns by pushing the concept of stock down to
the page_counter level, which solves the random eviction problem by
getting rid of the 7-slot limit, and makes enabling separate stock
caches for other page_counters simpler.

Introduce a generic per-cpu stock directly in struct page_counter.
Stock can optionally be enabled per-page_counter, limiting the overhead
increase for page_counters that do not benefit greatly from caching
charges.

In this scheme, stock usage and refills happen via lockless atomic
operations, eliminating the need for asynchronous workqueues as well.
In this commit we introduce the alloc, free, and drain operations,
although they are unused for now.

Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
include/linux/page_counter.h | 15 +++++++++++++
mm/page_counter.c | 42 ++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index d649b6bbbc871..4abc7fe7c3494 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -5,8 +5,17 @@
#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/limits.h>
+#include <linux/percpu.h>
#include <asm/page.h>

+struct page_counter_stock {
+ /*
+ * Consumption/refills can only come from the owning cpu via
+ * atomic_cmpxchg. Remote access only happens on drain via atomic_xchg.
+ */
+ atomic_t nr_pages;
+};
+
struct page_counter {
/*
* Make sure 'usage' does not share cacheline with any other field in
@@ -41,6 +50,8 @@ struct page_counter {
unsigned long high;
unsigned long max;
struct page_counter *parent;
+ struct page_counter_stock __percpu *stock;
+ unsigned int batch;
} ____cacheline_internodealigned_in_smp;

#if BITS_PER_LONG == 32
@@ -99,6 +110,10 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
counter->watermark = usage;
}

+void page_counter_drain_stock(struct page_counter *counter, unsigned int cpu);
+int page_counter_alloc_stock(struct page_counter *counter, unsigned int batch);
+void page_counter_free_stock(struct page_counter *counter);
+
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
void page_counter_calculate_protection(struct page_counter *root,
struct page_counter *counter,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 661e0f2a5127a..6bb48a913a90d 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -8,6 +8,7 @@
#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
+#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
@@ -289,6 +290,47 @@ int page_counter_memparse(const char *buf, const char *max,
return 0;
}

+void page_counter_drain_stock(struct page_counter *counter, unsigned int cpu)
+{
+ struct page_counter_stock *stock;
+ int nr_pages;
+
+ if (!counter->stock)
+ return;
+
+ stock = per_cpu_ptr(counter->stock, cpu);
+ nr_pages = atomic_xchg(&stock->nr_pages, 0);
+ if (nr_pages)
+ page_counter_uncharge(counter, nr_pages);
+}
+
+int page_counter_alloc_stock(struct page_counter *counter, unsigned int batch)
+{
+ struct page_counter_stock __percpu *stock;
+
+ stock = alloc_percpu(struct page_counter_stock);
+ if (!stock)
+ return -ENOMEM;
+
+ counter->stock = stock;
+ counter->batch = batch;
+
+ return 0;
+}
+
+void page_counter_free_stock(struct page_counter *counter)
+{
+ int cpu;
+
+ if (!counter->stock)
+ return;
+
+ for_each_possible_cpu(cpu)
+ page_counter_drain_stock(counter, cpu);
+
+ free_percpu(counter->stock);
+ counter->stock = NULL;
+}

#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
--
2.53.0-Meta