[PATCH v2] mm/percpu, memcontrol: Per-memcg-lruvec percpu accounting
From: Joshua Hahn
Date: Fri Apr 03 2026 - 23:38:56 EST
enum memcg_stat_item includes memory that is tracked on a per-memcg
level, but not at a per-node (and per-lruvec) level. Diagnosing
memory pressure for memcgs in multi-NUMA systems can be difficult,
since not all of the memory accounted in memcg can be traced back
to a node. In scenarios where NUMA nodes in a memcg are asymmetrically
stressed, this difference can be invisible to the user.
Convert MEMCG_PERCPU_B from a memcg_stat_item to a memcg_node_stat_item
to give visibility into per-node breakdowns for percpu allocations.
This will get us closer to being able to know the memcg and physical
association of all memory on the system. Specifically for percpu, this
granularity will help demonstrate footprint differences on systems with
asymmetric NUMA nodes.
Because percpu memory is accounted at a sub-PAGE_SIZE level, we must
account node level statistics (accounted in PAGE_SIZE units) and
memcg-lruvec statistics separately. Account node statistics when the pcpu
pages are allocated, and account memcg-lruvec statistics when pcpu
objects are handed out.
To account these separately, expose mod_memcg_lruvec_state so it can be
used outside of memcontrol.
The memory overhead of this patch is small; it adds 16 bytes
per-cgroup-node-cpu. For an example machine with 200 CPUs split across
2 nodes and 50 cgroups in the system, we see a 312.5 kB increase. Note
that this is the same cost as any other item in memcg_node_stat_item.
Performance impact is also negligible. These are results from a kernel
module which performs 100k percpu allocations via __alloc_percpu_gfp
with GFP_KERNEL | __GFP_ACCOUNT in a cgroup, across 20 trials.
Batched performs 100k allocations followed by 100k frees, while
interleaved performs allocation --> free --> allocation ...
+-------------+----------------+--------------+--------------+
| Test | linus-upstream | patch | diff |
+-------------+----------------+--------------+--------------+
| Batched | 6586 +/- 51 | 6595 +/- 35 | +9 (0.13%) |
| Interleaved | 1053 +/- 126 | 1085 +/- 113 | +32 (+0.85%) |
+-------------+----------------+--------------+--------------+
One functional change is that there can be a tiny inconsistency between
the size of the allocation used for memcg limit checking and what is
charged to each lruvec due to dropping fractional charges when rounding.
In reality this value is very small and always errs on the side of
checking memory against a slightly higher threshold, so there is no
behavioral change visible to userspace.
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
v1 --> v2:
- Updated commit message to be more explicit about motivation, suggested
by Michal Hocko.
- Added performance and memory impacts, suggested by Michal Hocko and
Yosry Ahmed.
- Instead of completely dropping the "extra" overhead of obj_cgroup
pointers, it is now distributed among the nodes, proportional to
the number of CPUs each node has.
---
include/linux/memcontrol.h | 4 +++-
include/linux/mmzone.h | 4 +++-
mm/memcontrol.c | 12 +++++-----
mm/percpu-vm.c | 14 ++++++++++--
mm/percpu.c | 45 ++++++++++++++++++++++++++++++++++----
mm/vmstat.c | 1 +
6 files changed, 66 insertions(+), 14 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0861589695298..96dae769c60d6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -34,7 +34,6 @@ struct kmem_cache;
enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
- MEMCG_PERCPU_B,
MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
@@ -909,6 +908,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
void mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bd0134c241ce..e38d8fe8552b1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -328,6 +328,7 @@ enum node_stat_item {
#endif
NR_BALLOON_PAGES,
NR_KERNEL_FILE_PAGES,
+ NR_PERCPU_B,
NR_VM_NODE_STAT_ITEMS
};
@@ -365,7 +366,8 @@ static __always_inline bool vmstat_item_in_bytes(int idx)
* byte-precise.
*/
return (idx == NR_SLAB_RECLAIMABLE_B ||
- idx == NR_SLAB_UNRECLAIMABLE_B);
+ idx == NR_SLAB_UNRECLAIMABLE_B ||
+ idx == NR_PERCPU_B);
}
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f1..b320b6a426966 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -377,6 +377,7 @@ static const unsigned int memcg_node_stat_items[] = {
NR_UNEVICTABLE,
NR_SLAB_RECLAIMABLE_B,
NR_SLAB_UNRECLAIMABLE_B,
+ NR_PERCPU_B,
WORKINGSET_REFAULT_ANON,
WORKINGSET_REFAULT_FILE,
WORKINGSET_ACTIVATE_ANON,
@@ -428,7 +429,6 @@ static const unsigned int memcg_node_stat_items[] = {
static const unsigned int memcg_stat_items[] = {
MEMCG_SWAP,
MEMCG_SOCK,
- MEMCG_PERCPU_B,
MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
@@ -920,9 +920,8 @@ static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
put_cpu();
}
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx,
- int val)
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pn;
@@ -936,6 +935,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
get_non_dying_memcg_end();
}
+EXPORT_SYMBOL(mod_memcg_lruvec_state);
/**
* mod_lruvec_state - update lruvec memory statistics
@@ -1535,7 +1535,7 @@ static const struct memory_stat memory_stats[] = {
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
- { "percpu", MEMCG_PERCPU_B },
+ { "percpu", NR_PERCPU_B },
{ "sock", MEMCG_SOCK },
{ "vmalloc", NR_VMALLOC },
{ "shmem", NR_SHMEM },
@@ -1597,7 +1597,7 @@ static const struct memory_stat memory_stats[] = {
static int memcg_page_state_unit(int item)
{
switch (item) {
- case MEMCG_PERCPU_B:
+ case NR_PERCPU_B:
case MEMCG_ZSWAP_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 4f5937090590d..e36b639f521dd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -55,7 +55,8 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
{
unsigned int cpu;
- int i;
+ int nr_pages = page_end - page_start;
+ int i, nid;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
@@ -65,6 +66,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
__free_page(page);
}
}
+
+ for_each_node(nid)
+ mod_node_page_state(NODE_DATA(nid), NR_PERCPU_B,
+ -1L * nr_pages * nr_cpus_node(nid) * PAGE_SIZE);
}
/**
@@ -84,7 +89,8 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
gfp_t gfp)
{
unsigned int cpu, tcpu;
- int i;
+ int nr_pages = page_end - page_start;
+ int i, nid;
gfp |= __GFP_HIGHMEM;
@@ -97,6 +103,10 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
goto err;
}
}
+
+ for_each_node(nid)
+ mod_node_page_state(NODE_DATA(nid), NR_PERCPU_B,
+ nr_pages * nr_cpus_node(nid) * PAGE_SIZE);
return 0;
err:
diff --git a/mm/percpu.c b/mm/percpu.c
index b0676b8054ed0..51c160deca01a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1632,6 +1632,45 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
return true;
}
+/*
+ * pcpu_mod_memcg_lruvec - update per-node memcg percpu stats
+ * @objcg: object cgroup to charge
+ * @size: size of pcpu allocation
+ * @sign: 1 for charge, -1 for uncharge
+ *
+ * Charge percpu memory across NUMA nodes proportional to per-node CPU count.
+ * Includes the obj_cgroup pointer overhead (see pcpu_obj_full_size) from the
+ * chunk's obj_exts array, but spreads proportionally across all nodes to
+ * avoid attributing it to a single node.
+ *
+ * The "extra" size calculation is best-effort but deterministic.
+ * Charges will equal uncharges, although there may be small discrepancies
+ * due to rounding up/down.
+ */
+static void pcpu_mod_memcg_lruvec(struct obj_cgroup *objcg, size_t size,
+ int sign)
+{
+ struct mem_cgroup *memcg;
+ size_t extra = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+ int nid;
+
+ memcg = obj_cgroup_memcg(objcg);
+ for_each_node(nid) {
+ struct lruvec *lruvec;
+ unsigned int nr_cpus = nr_cpus_node(nid);
+ long charge;
+
+ if (!nr_cpus)
+ continue;
+
+ charge = nr_cpus * size +
+ mult_frac(extra, nr_cpus, num_possible_cpus());
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ mod_memcg_lruvec_state(lruvec, NR_PERCPU_B, sign * charge);
+ }
+}
+
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
struct pcpu_chunk *chunk, int off,
size_t size)
@@ -1644,8 +1683,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;
rcu_read_lock();
- mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- pcpu_obj_full_size(size));
+ pcpu_mod_memcg_lruvec(objcg, size, 1);
rcu_read_unlock();
} else {
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
@@ -1667,8 +1705,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
rcu_read_lock();
- mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- -pcpu_obj_full_size(size));
+ pcpu_mod_memcg_lruvec(objcg, size, -1);
rcu_read_unlock();
obj_cgroup_put(objcg);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b33097ab9bc81..d73c3355be715 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1296,6 +1296,7 @@ const char * const vmstat_text[] = {
#endif
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
+ [I(NR_PERCPU_B)] = "nr_percpu",
#undef I
/* system-wide enum vm_stat_item counters */
--
2.52.0