[PATCH 4/8] mm: memcontrol: track MEMCG_KMEM per NUMA node

From: Alexandre Ghiti

Date: Mon May 11 2026 - 16:28:51 EST


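Get rid of MEMCG_KMEM and replace it with a per-node stat item, NR_KMEM.
Wire up all the "generic" charge/uncharge functions by relying on the
per-node obj_cgroup objects: each obj_cgroup now records its node in
objcg->nid, which is used to account kmem to the matching lruvec.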

Note that this patch does not yet convert the kmem users to proper
per-memcg-per-node accounting; that is done in upcoming patches.

Signed-off-by: Alexandre Ghiti <alex@xxxxxxxx>
---
include/linux/memcontrol.h | 23 ++++++++++----
include/linux/mmzone.h | 1 +
mm/memcontrol.c | 64 ++++++++++++++++++++++++--------------
mm/vmstat.c | 1 +
4 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 568ab08f42af..17cf823160e4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -35,7 +35,6 @@ enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
MEMCG_PERCPU_B,
- MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
MEMCG_ZSWAP_INCOMP,
@@ -126,9 +125,10 @@ struct mem_cgroup_per_node {
struct list_head objcg_list;

#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
- /* slab stats for nmi context */
+ /* slab and kmem stats for nmi context */
atomic_t slab_reclaimable;
atomic_t slab_unreclaimable;
+ atomic_t kmem;
#endif
};

@@ -190,6 +190,7 @@ struct obj_cgroup {
struct rcu_head rcu;
};
bool is_root;
+ int nid;
};

/*
@@ -254,10 +255,6 @@ struct mem_cgroup {
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];

-#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
- /* MEMCG_KMEM for nmi context */
- atomic_t kmem_stat;
-#endif
/*
* Hint of reclaim pressure for socket memroy management. Note
* that this indicator should NOT be used in legacy cgroup mode
@@ -776,6 +773,20 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}

+static inline struct obj_cgroup *obj_cgroup_get_nid(struct obj_cgroup *objcg,
+ int nid)
+{
+ struct obj_cgroup *nid_objcg;
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ nid_objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
+ rcu_read_unlock();
+
+ return nid_objcg;
+}
+
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
return !memcg || css_tryget(&memcg->css);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..97eb168fd7f3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -326,6 +326,7 @@ enum node_stat_item {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_KMEM,
NR_BALLOON_PAGES,
NR_KERNEL_FILE_PAGES,
NR_GPU_ACTIVE, /* Pages assigned to GPU objects */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index aaaa6a8b9f15..979a847e542a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -136,6 +136,7 @@ bool mem_cgroup_kmem_disabled(void)
}

static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
+static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val);

static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -170,9 +171,11 @@ static void obj_cgroup_release(struct percpu_ref *ref)

if (nr_pages) {
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;

memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(objcg->nid));
+ mod_lruvec_state(lruvec, NR_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
memcg_uncharge(memcg, nr_pages);
@@ -423,13 +426,13 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_KMEM,
};

static const unsigned int memcg_stat_items[] = {
MEMCG_SWAP,
MEMCG_SOCK,
MEMCG_PERCPU_B,
- MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
MEMCG_ZSWAP_INCOMP,
@@ -1537,7 +1540,7 @@ struct memory_stat {
static const struct memory_stat memory_stats[] = {
{ "anon", NR_ANON_MAPPED },
{ "file", NR_FILE_PAGES },
- { "kernel", MEMCG_KMEM },
+ { "kernel", NR_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
@@ -3004,20 +3007,26 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
}

#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
-static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int nid, int val)
{
if (likely(!in_nmi())) {
- mod_memcg_state(memcg, MEMCG_KMEM, val);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+ mod_lruvec_state(lruvec, NR_KMEM, val);
} else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
/* preemption is disabled in_nmi(). */
css_rstat_updated(&memcg->css, smp_processor_id());
- atomic_add(val, &memcg->kmem_stat);
+ atomic_add(val, &pn->kmem);
}
}
#else
-static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int nid, int val)
{
- mod_memcg_state(memcg, MEMCG_KMEM, val);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+ mod_lruvec_state(lruvec, NR_KMEM, val);
}
#endif

@@ -3033,7 +3042,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,

memcg = get_mem_cgroup_from_objcg(objcg);

- account_kmem_nmi_safe(memcg, -nr_pages);
+ account_kmem_nmi_safe(memcg, objcg->nid, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -3061,7 +3070,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;

- account_kmem_nmi_safe(memcg, nr_pages);
+ account_kmem_nmi_safe(memcg, objcg->nid, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -3238,10 +3247,11 @@ static void drain_obj_stock(struct obj_stock_pcp *stock)

if (nr_pages) {
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;

memcg = get_mem_cgroup_from_objcg(old);
-
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(old->nid));
+ mod_lruvec_state(lruvec, NR_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
memcg_uncharge(memcg, nr_pages);
@@ -3250,7 +3260,7 @@ static void drain_obj_stock(struct obj_stock_pcp *stock)
}

/*
- * The leftover is flushed to the centralized per-memcg value.
+ * The leftover is flushed to the per-node per-memcg value.
* On the next attempt to refill obj stock it will be moved
* to a per-cpu stock (probably, on an other CPU), see
* refill_obj_stock().
@@ -3417,7 +3427,7 @@ void obj_cgroup_account_kmem(struct obj_cgroup *objcg, unsigned int nr_pages)

rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- account_kmem_nmi_safe(memcg, nr_pages);
+ account_kmem_nmi_safe(memcg, objcg->nid, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
rcu_read_unlock();
}
@@ -4165,6 +4175,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
objcg->is_root = true;

+ objcg->nid = nid;
objcg->memcg = memcg;
rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
obj_cgroup_get(objcg);
@@ -4369,15 +4380,6 @@ static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
{
int nid;

- if (atomic_read(&memcg->kmem_stat)) {
- int kmem = atomic_xchg(&memcg->kmem_stat, 0);
- int index = memcg_stats_index(MEMCG_KMEM);
-
- memcg->vmstats->state[index] += kmem;
- if (parent)
- parent->vmstats->state_pending[index] += kmem;
- }
-
for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
struct lruvec_stats *lstats = pn->lruvec_stats;
@@ -4408,6 +4410,18 @@ static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
if (parent)
parent->vmstats->state_pending[index] += slab;
}
+ if (atomic_read(&pn->kmem)) {
+ int kmem = atomic_xchg(&pn->kmem, 0);
+ int index = memcg_stats_index(NR_KMEM);
+
+ mod_node_page_state(NODE_DATA(nid), NR_KMEM, kmem);
+ lstats->state[index] += kmem;
+ memcg->vmstats->state[index] += kmem;
+ if (plstats)
+ plstats->state_pending[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
}
}
#else
@@ -5173,7 +5187,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
if (ug->nr_memory) {
memcg_uncharge(memcg, ug->nr_memory);
if (ug->nr_kmem) {
- mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
+ struct lruvec *lruvec =
+ mem_cgroup_lruvec(memcg, NODE_DATA(ug->objcg->nid));
+ mod_lruvec_state(lruvec, NR_KMEM, -ug->nr_kmem);
memcg1_account_kmem(memcg, -ug->nr_kmem);
}
memcg1_oom_recover(memcg);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f534972f517d..d55437d1852e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1293,6 +1293,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_HUGETLB_PAGE
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
+ [I(NR_KMEM)] = "nr_kmem",
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
[I(NR_GPU_ACTIVE)] = "nr_gpu_active",
--
2.54.0