[PATCH RFC 1/4] mm/memcontrol: do not drain objcg stock when spinning is not allowed

From: Harry Yoo (Oracle)

Date: Wed Jun 24 2026 - 09:16:28 EST


When kmalloc_nolock() drains the objcg stock, the stock might be holding
the last reference to the objcg. Since obj_cgroup_release() is a percpu
refcount release callback and does not know whether spinning is
allowed, it is not safe to invoke obj_cgroup_put() in this context.

This was caught by lockdep on PREEMPT_RT because acquiring
a sleeping lock (objcg_lock) violates lock nesting rules:

kernel: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
kernel: in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1267, name: systemd-resolve
preempt_count: 1, expected: 0
RCU nest depth: 3, expected: 3
6 locks held by systemd-resolve/1267:
#0: ffff888a8165fa20 ((&pcs->lock)){+.+.}-{3:3}, at: kmem_cache_alloc_noprof+0x185/0xa20
#1: ffffffff9658a4c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x74/0x2a0
#2: ffff888a81648598 ((lock)#4){+.+.}-{3:3}, at: trylock_stock+0x118/0x380
#3: ffffffff9658a4c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x74/0x2a0
#4: ffffffff9658a4c0 (rcu_read_lock){....}-{1:3}, at: percpu_ref_put_many.constprop.0+0x40/0x270
#5: ffffffff96af11d8 (objcg_lock){+.+.}-{3:3}, at: obj_cgroup_release+0x8a/0x410
[...]
Call Trace:
<TASK>
dump_stack_lvl+0x8a/0xe0
dump_stack+0x14/0x1c
__might_resched.cold+0x233/0x2bb
rt_spin_lock+0xd3/0x410
obj_cgroup_release+0x8a/0x410
percpu_ref_put_many.constprop.0+0x226/0x270
drain_obj_stock_slot+0x27e/0x8d0
__refill_obj_stock+0x409/0x6d0
__memcg_slab_post_alloc_hook+0xa45/0x1500
__kmalloc_nolock_noprof+0x988/0xc40
[...]

This is not specific to PREEMPT_RT, however: it is illegal on !RT
kernels too, because the objcg release callback acquires a spinlock
even when spinning is not allowed.

To fix this issue, fall back to the per-objcg atomic counter when the
cached objcg doesn't match and it is unsafe to drain the stock because
spinning is not allowed.

This is expected to affect the performance of kmalloc_nolock() and
kfree_nolock(), since they can no longer drain and refill the stock on
an objcg mismatch and instead fall back to a per-objcg atomic counter
(objcg->nr_charged_bytes).

Fixes: af92793e52c3 ("slab: Introduce kmalloc_nolock() and kfree_nolock().")
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: Harry Yoo (Oracle) <harry@xxxxxxxxxx>
---
mm/memcontrol.c | 34 +++++++++++++++++++++++-----------
mm/slab.h | 3 ++-
mm/slub.c | 29 +++++++++++++++++++----------
3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29390ba13baa..5bb5e75ef5b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3316,18 +3316,19 @@ static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
static void __refill_obj_stock(struct obj_cgroup *objcg,
struct obj_stock_pcp *stock,
unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge,
+ bool allow_spin)
{
unsigned int nr_pages = 0;

- if (!stock) {
- nr_pages = nr_bytes >> PAGE_SHIFT;
- nr_bytes = nr_bytes & (PAGE_SIZE - 1);
- atomic_add(nr_bytes, &objcg->nr_charged_bytes);
- goto out;
- }
+ if (!stock)
+ goto fallback;

if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
+ /* Not safe to drain since objcg release acquires spinlock */
+ if (unlikely(!allow_spin))
+ goto fallback;
+
drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3346,6 +3347,13 @@ static void __refill_obj_stock(struct obj_cgroup *objcg,
out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
+ return;
+
+fallback:
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
}

static void refill_obj_stock(struct obj_cgroup *objcg,
@@ -3353,7 +3361,8 @@ static void refill_obj_stock(struct obj_cgroup *objcg,
bool allow_uncharge)
{
struct obj_stock_pcp *stock = trylock_stock();
- __refill_obj_stock(objcg, stock, nr_bytes, allow_uncharge);
+ __refill_obj_stock(objcg, stock, nr_bytes, allow_uncharge,
+ /* allow_spin = */ true);
unlock_stock(stock);
}

@@ -3428,6 +3437,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
size_t size, void **p)
{
size_t obj_size = obj_full_size(s);
+ bool allow_spin = alloc_flags_allow_spinning(slab_alloc_flags);
struct obj_cgroup *objcg;
struct slab *slab;
unsigned long off;
@@ -3497,7 +3507,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
stock = trylock_stock();
if (remainder)
- __refill_obj_stock(objcg, stock, remainder, false);
+ __refill_obj_stock(objcg, stock, remainder, false,
+ allow_spin);
}
__account_obj_stock(objcg, stock, obj_size,
slab_pgdat(slab), cache_vmstat_idx(s));
@@ -3516,7 +3527,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
}

void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects, unsigned long obj_exts)
+ void **p, int objects, unsigned long obj_exts,
+ bool allow_spin)
{
size_t obj_size = obj_full_size(s);

@@ -3535,7 +3547,7 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
obj_ext->objcg = NULL;

stock = trylock_stock();
- __refill_obj_stock(objcg, stock, obj_size, true);
+ __refill_obj_stock(objcg, stock, obj_size, true, allow_spin);
__account_obj_stock(objcg, stock, -obj_size,
slab_pgdat(slab), cache_vmstat_idx(s));
unlock_stock(stock);
diff --git a/mm/slab.h b/mm/slab.h
index 281a65233795..a6b4ac298d08 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -660,7 +660,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, unsigned int slab_alloc_flags,
size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects, unsigned long obj_exts);
+ void **p, int objects, unsigned long obj_exts,
+ bool allow_spin);
#endif

void kvfree_rcu_cb(struct rcu_head *head);
diff --git a/mm/slub.c b/mm/slub.c
index 917635203f73..32672a92581b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2488,7 +2488,7 @@ bool memcg_slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,

static __fastpath_inline
void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
- int objects)
+ int objects, bool allow_spin)
{
unsigned long obj_exts;

@@ -2500,7 +2500,7 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
return;

get_slab_obj_exts(obj_exts);
- __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
+ __memcg_slab_free_hook(s, slab, p, objects, obj_exts, allow_spin);
put_slab_obj_exts(obj_exts);
}

@@ -2575,7 +2575,7 @@ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
}

static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects)
+ void **p, int objects, bool allow_spin)
{
}

@@ -2946,11 +2946,12 @@ static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
void **p = &sheaf->objects[0];
unsigned int i = 0;
bool pfmemalloc = false;
+ bool allow_spin = true;

while (i < sheaf->size) {
struct slab *slab = virt_to_slab(p[i]);

- memcg_slab_free_hook(s, slab, p + i, 1);
+ memcg_slab_free_hook(s, slab, p + i, 1, allow_spin);
alloc_tagging_slab_free_hook(s, slab, p + i, 1);

if (unlikely(!slab_free_hook(s, p[i], init, true))) {
@@ -6215,12 +6216,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
struct node_barn *barn;
void *remote_objects[PCS_BATCH_MAX];
unsigned int remote_nr = 0;
+ bool allow_spin = true;

next_remote_batch:
while (i < size) {
struct slab *slab = virt_to_slab(p[i]);

- memcg_slab_free_hook(s, slab, p + i, 1);
+ memcg_slab_free_hook(s, slab, p + i, 1, allow_spin);
alloc_tagging_slab_free_hook(s, slab, p + i, 1);

if (unlikely(!slab_free_hook(s, p[i], init, false))) {
@@ -6398,13 +6400,16 @@ static __fastpath_inline
void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
unsigned long addr)
{
- memcg_slab_free_hook(s, slab, &object, 1);
+ bool allow_spin = true;
+
+ memcg_slab_free_hook(s, slab, &object, 1, allow_spin);
alloc_tagging_slab_free_hook(s, slab, &object, 1);

if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
return;

- if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, object, true)))
+ if (likely(can_free_to_pcs(slab)) &&
+ likely(free_to_pcs(s, object, allow_spin)))
return;

__slab_free(s, slab, object, object, 1, addr);
@@ -6429,7 +6434,9 @@ static __fastpath_inline
void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
void *tail, void **p, int cnt, unsigned long addr)
{
- memcg_slab_free_hook(s, slab, p, cnt);
+ bool allow_spin = true;
+
+ memcg_slab_free_hook(s, slab, p, cnt, allow_spin);
alloc_tagging_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
@@ -6734,6 +6741,7 @@ void kfree_nolock(const void *object)
struct slab *slab;
struct kmem_cache *s;
void *x = (void *)object;
+ bool allow_spin = false;

if (unlikely(ZERO_OR_NULL_PTR(object)))
return;
@@ -6746,7 +6754,7 @@ void kfree_nolock(const void *object)

s = slab->slab_cache;

- memcg_slab_free_hook(s, slab, &x, 1);
+ memcg_slab_free_hook(s, slab, &x, 1, allow_spin);
alloc_tagging_slab_free_hook(s, slab, &x, 1);
/*
* Unlike slab_free() do NOT call the following:
@@ -6776,7 +6784,8 @@ void kfree_nolock(const void *object)
*/
kasan_slab_free(s, x, false, false, /* skip quarantine */true);

- if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, x, false)))
+ if (likely(can_free_to_pcs(slab)) &&
+ likely(free_to_pcs(s, x, allow_spin)))
return;

/*

--
2.53.0