Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()

From: hu.shengming

Date: Wed Jun 24 2026 - 05:25:15 EST

Harry wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
>
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
>
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
>
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
>
> Signed-off-by: Harry Yoo (Oracle) <harry@xxxxxxxxxx>

Hi Harry,

Thanks for the series. These patches fill a clear functional gap in the
existing free APIs by adding an RCU-deferred free interface for contexts
where kfree_rcu() cannot safely be used.

> ---
> include/linux/rcupdate.h | 12 ++++++++++++
> mm/slab.h | 1 +
> mm/slab_common.c | 22 ++++++++++++++++++++--
> mm/slub.c | 23 ++++++++++++++++++++++-
> 4 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 5e95acc33989..3025249bfcb5 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
> * In mm/slab_common.c, no suitable header to include here.
> */
> void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
>
> /*
> * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
> @@ -1122,6 +1123,17 @@ do { \
> kvfree_call_rcu(NULL, (void *) (___p)); \
> } while (0)
>
> +/* kfree_rcu_nolock() supports 2-arg variant only */
> +#define kfree_rcu_nolock(ptr, krhf) \
> +do { \
> + typeof (ptr) ___p = (ptr); \
> + \
> + if (___p) { \
> + BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096); \
> + kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
> + } \
> +} while (0)
> +
> /*
> * Place this after a lock-acquisition primitive to guarantee that
> * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies
> diff --git a/mm/slab.h b/mm/slab.h
> index 961581e35ec8..a493c5201e96 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
> const struct slab *slab, bool to_user);
>
> void deferred_work_barrier(void);
> +void defer_kfree_rcu(struct rcu_head *head);
>
> static inline bool slub_debug_orig_size(struct kmem_cache *s)
> {
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 807924a94fb0..5a39e6225160 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
> EXPORT_TRACEPOINT_SYMBOL(kfree);
> EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
> +{
> + struct slab *slab;
> + struct kmem_cache *s;
> +
> + VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> +
> + slab = virt_to_slab(ptr);
> + s = slab->slab_cache;
> +
> + if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
> + return;
> +

One consistency issue to address here: kfree_rcu_sheaf() only calls
__kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
avoids filling a CPU's per-CPU sheaves with objects from remote slabs.

kfree_call_rcu_nolock() currently skips that check and may therefore
place remote-node objects into the local CPU's RCU sheaf.

Could you add the same local-node check that kfree_rcu_sheaf() performs
(i.e. only use the sheaf when the object's slab belongs to the local NUMA
node) before calling __kfree_rcu_sheaf(), and send remote-node objects
straight to the defer_kfree_rcu() fallback path instead?

--
With Best Regards,
Shengming

> + defer_kfree_rcu(head);
> +}
> +EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
> +
> #ifndef CONFIG_KVFREE_RCU_BATCHED
>
> void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> @@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
> cpus_read_lock();
> flush_rcu_sheaves_on_cache(s);
> cpus_read_unlock();
> - deferred_work_barrier();
> - rcu_barrier();
> }
>
> + /* kfree_rcu_nolock() might have deferred frees even without sheaves */
> + deferred_work_barrier();
> + rcu_barrier();
> __kvfree_rcu_barrier();
> }
> EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
> diff --git a/mm/slub.c b/mm/slub.c
> index 4850629774b2..19018a979445 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
>
> struct deferred_percpu_work {
> struct llist_head objects;
> + struct llist_head objects_by_rcu;
> struct llist_head rcu_sheaves;
> struct irq_work work;
> };
> @@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
>
> static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
> .objects = LLIST_HEAD_INIT(objects),
> + .objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
> .rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
> .work = IRQ_WORK_INIT(deferred_percpu_work_fn),
> };
> @@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> static void deferred_percpu_work_fn(struct irq_work *work)
> {
> struct deferred_percpu_work *dpw;
> - struct llist_head *objs, *rcu_sheaves;
> + struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
> struct llist_node *llnode, *pos, *t;
>
> dpw = container_of(work, struct deferred_percpu_work, work);
> rcu_sheaves = &dpw->rcu_sheaves;
> objs = &dpw->objects;
> + objs_by_rcu = &dpw->objects_by_rcu;
>
> llnode = llist_del_all(objs);
> llist_for_each_safe(pos, t, llnode) {
> @@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
>
> call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
> }
> +
> + llnode = llist_del_all(objs_by_rcu);
> + llist_for_each_safe(pos, t, llnode) {
> + struct rcu_head *head = (struct rcu_head *)pos;
> +
> + call_rcu(head, kvfree_rcu_cb);
> + }
> }
>
> static void defer_free(struct kmem_cache *s, void *head)
> @@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
> irq_work_queue(&dpw->work);
> }
>
> +void defer_kfree_rcu(struct rcu_head *head)
> +{
> + struct deferred_percpu_work *dpw;
> +
> + guard(preempt)();
> +
> + dpw = this_cpu_ptr(&deferred_percpu_work);
> + if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
> + irq_work_queue(&dpw->work);
> +}
> +
> void deferred_work_barrier(void)
> {
> int cpu;
>
> --
> 2.53.0