[PATCH v2 1/9] memcg: convert task->objcg to a per-node objcgs array

From: Alexandre Ghiti

Date: Fri Jun 26 2026 - 06:25:35 EST


From: Shakeel Butt <shakeel.butt@xxxxxxxxx>

Commit 01b9da291c49 ("mm: memcontrol: convert objcg to be per-memcg
per-node type") split a memcg's single obj_cgroup into one per NUMA
node, but task_struct still cached only one objcg. On every cross-node
allocation current_obj_cgroup() returned an objcg whose nid did not
match the current CPU's node, so the stock's per-node vmstat batching
and (separately) the per-node accounting hierarchy were defeated for
multi-node workloads.

Replace task->objcg with task->objcgs: a tagged pointer to an
nr_node_ids-sized array of per-node obj_cgroup pointers. Bit 0 keeps
its meaning as CURRENT_OBJCG_UPDATE_FLAG so mem_cgroup_kmem_attach()
can still atomically mark the cache stale from another task's context
with a single set_bit().

current_obj_cgroup() now indexes the array by numa_node_id() and falls
back to root_mem_cgroup's objcg for that node when the array is NULL
(kthread or fork-time allocation failure) or the entry is NULL
(transient drain window).

current_objcg_update() refreshes every entry inside a single
rcu_read_lock() section, xchg'ing fresh per-node objcgs in and dropping
the stale references. The outer try_cmpxchg() loop on the tagged array
pointer preserves the existing race-with-kmem_attach semantics: if the
update bit is re-set mid-refresh, the whole refresh is retried.

The array is eagerly allocated in mem_cgroup_fork() for non-kthread
tasks. This keeps current_objcg_update() off the allocation path, which
matters because it runs from kmem allocation contexts that may be
atomic. Kthreads and tasks whose fork-time kcalloc() fails simply leave
task->objcgs as NULL and route kmem allocations to root_mem_cgroup, as
before. The array is freed in mem_cgroup_exit() after dropping the
per-node references.

__get_obj_cgroup_from_memcg() takes nid as an explicit parameter so it
can be reused for both folio charging (numa_node_id()) and the per-node
refresh loop.

Signed-off-by: Shakeel Butt <shakeel.butt@xxxxxxxxx>
Signed-off-by: Alexandre Ghiti <alex@xxxxxxxx>
---
include/linux/sched.h | 7 +-
mm/memcontrol.c | 148 +++++++++++++++++++++++++-----------------
2 files changed, 95 insertions(+), 60 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee06cba5c6f5..d7ea9fe38d01 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1538,8 +1538,11 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;

- /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
- struct obj_cgroup *objcg;
+ /*
+ * Per-node cache for current->cgroups->memcg->nodeinfo[nid]->objcg
+ * lookups. Tagged pointer: bit 0 = CURRENT_OBJCG_UPDATE_FLAG.
+ */
+ struct obj_cgroup **objcgs;
#endif

#ifdef CONFIG_BLK_CGROUP
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56cd4af08232..ee47427de9e2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2907,10 +2907,9 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
return folio_memcg_check(virt_to_folio(p));
}

-static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg,
+ int nid)
{
- int nid = numa_node_id();
-
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);

@@ -2926,67 +2925,73 @@ static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *me
struct obj_cgroup *objcg;

rcu_read_lock();
- objcg = __get_obj_cgroup_from_memcg(memcg);
+ objcg = __get_obj_cgroup_from_memcg(memcg, numa_node_id());
rcu_read_unlock();

return objcg;
}

-static struct obj_cgroup *current_objcg_update(void)
+static struct obj_cgroup **current_objcg_update(void)
{
struct mem_cgroup *memcg;
- struct obj_cgroup *old, *objcg = NULL;
+ struct obj_cgroup **objcgs;
+ unsigned long old_tagged;
+ int nid;

do {
- /* Atomically drop the update bit. */
- old = xchg(&current->objcg, NULL);
- if (old) {
- old = (struct obj_cgroup *)
- ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
- obj_cgroup_put(old);
-
- old = NULL;
- }
-
- /* If new objcg is NULL, no reason for the second atomic update. */
- if (!current->mm || (current->flags & PF_KTHREAD))
- return NULL;
+ old_tagged = (unsigned long)READ_ONCE(current->objcgs);
+ objcgs = (struct obj_cgroup **)
+ (old_tagged & ~CURRENT_OBJCG_UPDATE_FLAG);

/*
- * Release the objcg pointer from the previous iteration,
- * if try_cmpxcg() below fails.
+ * If there is no per-node cache (kthread or fork-time
+ * allocation failure), there is nothing to refresh. The
+ * cmpxchg below still clears the update bit so we do not
+ * keep re-entering this slow path.
*/
- if (unlikely(objcg)) {
- obj_cgroup_put(objcg);
- objcg = NULL;
+ if (objcgs) {
+ if (!current->mm || (current->flags & PF_KTHREAD)) {
+ /*
+ * The task lost its mm: drop the cached
+ * per-node references; future allocations will
+ * fall back to root_mem_cgroup.
+ */
+ for_each_node(nid)
+ obj_cgroup_put(xchg(&objcgs[nid], NULL));
+ } else {
+ /*
+ * Re-read the memcg under rcu since the task
+ * may have been asynchronously moved and the
+ * previous memcg can be offlined.
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ for_each_node(nid) {
+ struct obj_cgroup *fresh, *stale;
+
+ fresh = __get_obj_cgroup_from_memcg(memcg, nid);
+ stale = xchg(&objcgs[nid], fresh);
+ obj_cgroup_put(stale);
+ }
+ rcu_read_unlock();
+ }
}

/*
- * Obtain the new objcg pointer. The current task can be
- * asynchronously moved to another memcg and the previous
- * memcg can be offlined. So let's get the memcg pointer
- * and try get a reference to objcg under a rcu read lock.
- */
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
- objcg = __get_obj_cgroup_from_memcg(memcg);
- rcu_read_unlock();
-
- /*
- * Try set up a new objcg pointer atomically. If it
- * fails, it means the update flag was set concurrently, so
- * the whole procedure should be repeated.
+ * Publish the cleared-flag pointer. If kmem_attach raced and
+ * re-set the update bit, retry the whole refresh.
*/
- } while (!try_cmpxchg(&current->objcg, &old, objcg));
+ } while (!try_cmpxchg((unsigned long *)&current->objcgs,
+ &old_tagged, (unsigned long)objcgs));

- return objcg;
+ return objcgs;
}

__always_inline struct obj_cgroup *current_obj_cgroup(void)
{
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ struct obj_cgroup **objcgs;
int nid = numa_node_id();

if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
@@ -2997,14 +3002,16 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
if (unlikely(memcg))
goto from_memcg;

- objcg = READ_ONCE(current->objcg);
- if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
- objcg = current_objcg_update();
+ objcgs = READ_ONCE(current->objcgs);
+ if (unlikely((unsigned long)objcgs & CURRENT_OBJCG_UPDATE_FLAG))
+ objcgs = current_objcg_update();
/*
- * Objcg reference is kept by the task, so it's safe
- * to use the objcg by the current task.
+ * Per-node objcg references are kept by the task, so it's
+ * safe to use them by the current task.
*/
- return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
+ if (objcgs && (objcg = objcgs[nid]))
+ return objcg;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}

memcg = this_cpu_read(int_active_memcg);
@@ -4544,22 +4551,47 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)

static void mem_cgroup_fork(struct task_struct *task)
{
+ struct obj_cgroup **objcgs;
+
/*
- * Set the update flag to cause task->objcg to be initialized lazily
- * on the first allocation. It can be done without any synchronization
- * because it's always performed on the current task, so does
- * current_objcg_update().
+ * Kthreads do not need a per-node cache; their kmem allocations fall
+ * back to root_mem_cgroup via current_obj_cgroup().
*/
- task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
+ if (task->flags & PF_KTHREAD) {
+ task->objcgs = NULL;
+ return;
+ }
+
+ /*
+ * Eagerly allocate the per-node cache so that current_objcg_update()
+ * never has to allocate from potentially-atomic kmem allocation
+ * paths. On allocation failure this task will use root_mem_cgroup
+ * for kmem accounting.
+ *
+ * Tag with the update flag so the first kmem allocation populates
+ * the entries via current_objcg_update().
+ */
+ objcgs = kcalloc(nr_node_ids, sizeof(*objcgs), GFP_KERNEL);
+ if (objcgs)
+ task->objcgs = (struct obj_cgroup **)
+ ((unsigned long)objcgs | CURRENT_OBJCG_UPDATE_FLAG);
+ else
+ task->objcgs = NULL;
}

static void mem_cgroup_exit(struct task_struct *task)
{
- struct obj_cgroup *objcg = task->objcg;
+ struct obj_cgroup **objcgs;
+ int nid;

- objcg = (struct obj_cgroup *)
- ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
- obj_cgroup_put(objcg);
+ objcgs = (struct obj_cgroup **)
+ ((unsigned long)task->objcgs & ~CURRENT_OBJCG_UPDATE_FLAG);
+
+ if (objcgs) {
+ for_each_node(nid)
+ obj_cgroup_put(objcgs[nid]);
+ kfree(objcgs);
+ }

/*
* Some kernel allocations can happen after this point,
@@ -4567,7 +4599,7 @@ static void mem_cgroup_exit(struct task_struct *task)
* because it's always performed on the current task, so does
* current_objcg_update().
*/
- task->objcg = NULL;
+ task->objcgs = NULL;
}

#ifdef CONFIG_LRU_GEN
@@ -4599,7 +4631,7 @@ static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)

cgroup_taskset_for_each(task, css, tset) {
/* atomically set the update bit */
- set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
+ set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcgs);
}
}

--
2.54.0