[PATCH for-next v3 5/9] mm/slab: extend deferred free mechanism to handle rcu sheaves

From: Harry Yoo (Oracle)

Date: Mon Jun 15 2026 - 07:10:10 EST

__kfree_rcu_sheaf() cannot invoke call_rcu() when spinning is not
allowed and IRQs are disabled. To relax the limitation, extend the
deferred free fallback so that a full rcu sheaf can be submitted to
call_rcu() via the existing IRQ work.

Since the deferred mechanism now does more than defer the freeing of
objects, rename the struct to deferred_percpu_work and adjust names
accordingly.

When a sheaf is queued on the irq_work, it is already detached from
pcs->rcu_free, but call_rcu() is not invoked until the irq_work runs,
so flushing the rcu sheaves and then calling rcu_barrier() alone would
not wait for it. To keep the kvfree_rcu barrier's promise, call
irq_work_sync() on each CPU's irq_work before calling rcu_barrier().

While at it, remove the TODO about introducing a version of
__kvfree_rcu_barrier() that works on a specific slab cache, as
apparently there is no simple and effective way to achieve that.

Suggested-by: Alexei Starovoitov <ast@xxxxxxxxxx>
Signed-off-by: Harry Yoo (Oracle) <harry@xxxxxxxxxx>
---
mm/slab.h | 2 +-
mm/slab_common.c | 7 ++---
mm/slub.c | 79 ++++++++++++++++++++++++++++++++++----------------------
3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index b1bd33a16544..961581e35ec8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -744,7 +744,7 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);

-void defer_free_barrier(void);
+void deferred_work_barrier(void);

static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index bc1a8ec938d9..55546b8385ff 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -551,7 +551,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
}

/* Wait for deferred work from kmalloc/kfree_nolock() */
- defer_free_barrier();
+ deferred_work_barrier();

cpus_read_lock();
mutex_lock(&slab_mutex);
@@ -2113,13 +2113,10 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
cpus_read_lock();
flush_rcu_sheaves_on_cache(s);
cpus_read_unlock();
+ deferred_work_barrier();
rcu_barrier();
}

- /*
- * TODO: Introduce a version of __kvfree_rcu_barrier() that works
- * on a specific slab cache.
- */
__kvfree_rcu_barrier();
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
diff --git a/mm/slub.c b/mm/slub.c
index 6a3552b70683..ba593c1c53d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -418,6 +418,8 @@ struct slab_sheaf {
union {
struct rcu_head rcu_head;
struct list_head barn_list;
+ /* only used to defer call_rcu() in unknown context */
+ struct llist_node llnode;
/* only used for prefilled sheafs */
struct {
unsigned int capacity;
@@ -4071,6 +4073,20 @@ static void flush_all(struct kmem_cache *s)
cpus_read_unlock();
}

+struct deferred_percpu_work {
+ struct llist_head objects;
+ struct llist_head rcu_sheaves;
+ struct irq_work work;
+};
+
+static void deferred_percpu_work_fn(struct irq_work *work);
+
+static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
+ .objects = LLIST_HEAD_INIT(objects),
+ .rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
+ .work = IRQ_WORK_INIT(deferred_percpu_work_fn),
+};
+
static void flush_rcu_sheaf(struct work_struct *w)
{
struct slub_percpu_sheaves *pcs;
@@ -4142,6 +4158,7 @@ void flush_all_rcu_sheaves(void)
mutex_unlock(&slab_mutex);
cpus_read_unlock();

+ deferred_work_barrier();
rcu_barrier();
}

@@ -6158,12 +6175,6 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin)
if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
rcu_sheaf = NULL;
} else {
- /* call_rcu() disables IRQs to protect percpu data structures */
- if (unlikely(!allow_spin && irqs_disabled())) {
- rcu_sheaf->size--;
- local_unlock(&s->cpu_sheaves->lock);
- goto fail;
- }
pcs->rcu_free = NULL;
rcu_sheaf->node = numa_node_id();
}
@@ -6172,8 +6183,18 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin)
* we flush before local_unlock to make sure a racing
* flush_all_rcu_sheaves() doesn't miss this sheaf
*/
- if (rcu_sheaf)
- call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+ if (rcu_sheaf) {
+ /* call_rcu() disables IRQs to protect percpu data structures */
+ if (unlikely(!allow_spin && irqs_disabled())) {
+ struct deferred_percpu_work *dpw;
+
+ dpw = this_cpu_ptr(&deferred_percpu_work);
+ if (llist_add(&rcu_sheaf->llnode, &dpw->rcu_sheaves))
+ irq_work_queue(&dpw->work);
+ } else {
+ call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+ }
+ }

local_unlock(&s->cpu_sheaves->lock);

@@ -6360,31 +6381,20 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
}
}

-struct defer_free {
- struct llist_head objects;
- struct irq_work work;
-};
-
-static void free_deferred_objects(struct irq_work *work);
-
-static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
- .objects = LLIST_HEAD_INIT(objects),
- .work = IRQ_WORK_INIT(free_deferred_objects),
-};
-
/*
* In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
* to take sleeping spin_locks from __slab_free().
* In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
*/
-static void free_deferred_objects(struct irq_work *work)
+static void deferred_percpu_work_fn(struct irq_work *work)
{
- struct defer_free *df = container_of(work, struct defer_free, work);
- struct llist_head *objs = &df->objects;
+ struct deferred_percpu_work *dpw;
+ struct llist_head *objs, *rcu_sheaves;
struct llist_node *llnode, *pos, *t;

- if (llist_empty(objs))
- return;
+ dpw = container_of(work, struct deferred_percpu_work, work);
+ rcu_sheaves = &dpw->rcu_sheaves;
+ objs = &dpw->objects;

llnode = llist_del_all(objs);
llist_for_each_safe(pos, t, llnode) {
@@ -6408,27 +6418,34 @@ static void free_deferred_objects(struct irq_work *work)
__slab_free(s, slab, x, x, 1, _THIS_IP_);
stat(s, FREE_SLOWPATH);
}
+
+ llnode = llist_del_all(rcu_sheaves);
+ llist_for_each_safe(pos, t, llnode) {
+ struct slab_sheaf *rcu_sheaf = llist_entry(pos, struct slab_sheaf, llnode);
+
+ call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+ }
}

static void defer_free(struct kmem_cache *s, void *head)
{
- struct defer_free *df;
+ struct deferred_percpu_work *dpw;

guard(preempt)();

head = kasan_reset_tag(head);

- df = this_cpu_ptr(&defer_free_objects);
- if (llist_add(head + s->offset, &df->objects))
- irq_work_queue(&df->work);
+ dpw = this_cpu_ptr(&deferred_percpu_work);
+ if (llist_add(head + s->offset, &dpw->objects))
+ irq_work_queue(&dpw->work);
}

-void defer_free_barrier(void)
+void deferred_work_barrier(void)
{
int cpu;

for_each_possible_cpu(cpu)
- irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
+ irq_work_sync(&per_cpu_ptr(&deferred_percpu_work, cpu)->work);
}

static __fastpath_inline

--
2.53.0