Re: [PATCH v2] mm/slub: skip freelist construction for whole-slab bulk refill
From: Hao Li
Date: Wed Apr 01 2026 - 02:59:29 EST
On Wed, Apr 01, 2026 at 12:57:25PM +0800, hu.shengming@xxxxxxxxxx wrote:
> From: Shengming Hu <hu.shengming@xxxxxxxxxx>
>
> refill_objects() already notes that a whole-slab bulk refill could avoid
> building a freelist that would be drained immediately.
>
> When the remaining bulk allocation is large enough to consume an entire
> new slab, building the freelist is unnecessary overhead. Instead,
> allocate the slab without initializing its freelist and hand all objects
> directly to the caller.
>
> Handle CONFIG_SLAB_FREELIST_RANDOM=y as well by walking objects in the
> randomized allocation order and placing them directly into the caller's
> array, without constructing a temporary freelist.
>
> Also mark setup_object() inline. After this optimization, the compiler no
> longer consistently inlines this helper in the hot path, which can hurt
> performance. Explicitly marking it inline restores the expected code
> generation.
>
> This reduces per-object overhead in bulk allocation paths and improves
> allocation throughput significantly. In slub_bulk_bench, the time per
> object drops by about 54% to 74% with CONFIG_SLAB_FREELIST_RANDOM=n, and
> by about 62% to 74% with CONFIG_SLAB_FREELIST_RANDOM=y.
Thanks for the patch.
Here are some quick review comments:
>
> Benchmark results (slub_bulk_bench):
>
> Machine: qemu-system-x86 -m 1024M -smp 8 -enable-kvm -cpu host
> Kernel: Linux 7.0.0-rc6-next-20260330
> Config: x86_64_defconfig
> Cpu: 0
> Rounds: 20
> Total: 256MB
>
> - CONFIG_SLAB_FREELIST_RANDOM=n -
>
> obj_size=16, batch=256:
> before: 5.29 +- 0.73 ns/object
> after: 2.42 +- 0.05 ns/object
> delta: -54.4%
>
> obj_size=32, batch=128:
> before: 7.65 +- 1.89 ns/object
> after: 3.04 +- 0.03 ns/object
> delta: -60.2%
>
> obj_size=64, batch=64:
> before: 11.07 +- 0.08 ns/object
> after: 4.11 +- 0.04 ns/object
> delta: -62.9%
>
> obj_size=128, batch=32:
> before: 19.95 +- 0.30 ns/object
> after: 5.72 +- 0.05 ns/object
> delta: -71.3%
>
> obj_size=256, batch=32:
> before: 24.31 +- 0.25 ns/object
> after: 6.33 +- 0.14 ns/object
> delta: -74.0%
>
> obj_size=512, batch=32:
> before: 22.48 +- 0.14 ns/object
> after: 6.43 +- 0.10 ns/object
> delta: -71.4%
>
> - CONFIG_SLAB_FREELIST_RANDOM=y -
>
> obj_size=16, batch=256:
> before: 9.32 +- 1.26 ns/object
> after: 3.51 +- 0.02 ns/object
> delta: -62.4%
>
> obj_size=32, batch=128:
> before: 11.68 +- 0.15 ns/object
> after: 4.18 +- 0.22 ns/object
> delta: -64.2%
>
> obj_size=64, batch=64:
> before: 16.69 +- 1.36 ns/object
> after: 5.22 +- 0.06 ns/object
> delta: -68.7%
>
> obj_size=128, batch=32:
> before: 23.41 +- 0.23 ns/object
> after: 7.40 +- 0.07 ns/object
> delta: -68.4%
>
> obj_size=256, batch=32:
> before: 29.80 +- 0.44 ns/object
> after: 7.98 +- 0.09 ns/object
> delta: -73.2%
>
> obj_size=512, batch=32:
> before: 30.38 +- 0.36 ns/object
> after: 8.01 +- 0.06 ns/object
> delta: -73.6%
>
> Link: https://github.com/HSM6236/slub_bulk_test.git
> Signed-off-by: Shengming Hu <hu.shengming@xxxxxxxxxx>
> ---
> Changes in v2:
> - Handle CONFIG_SLAB_FREELIST_RANDOM=y and add benchmark results.
> - Update the QEMU benchmark setup to use -enable-kvm -cpu host so benchmark results better reflect native CPU performance.
> - Link to v1: https://lore.kernel.org/all/20260328125538341lvTGRpS62UNdRiAAz2gH3@xxxxxxxxxx/
>
> ---
> mm/slub.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 136 insertions(+), 19 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index fb2c5c57bc4e..52da4a716b1b 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2733,7 +2733,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
> return *head != NULL;
> }
>
> -static void *setup_object(struct kmem_cache *s, void *object)
> +static inline void *setup_object(struct kmem_cache *s, void *object)
> {
> setup_object_debug(s, object);
> object = kasan_init_slab_obj(s, object);
> @@ -3399,6 +3399,53 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
>
> return true;
> }
> +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> + void *obj);
> +
> +static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
> + struct slab *slab, void **p,
> + bool allow_spin,
> + unsigned int *allocatedp)
> +{
> + unsigned long pos, page_limit, freelist_count;
> + unsigned int allocated = 0;
> + void *next, *start;
> +
> + if (slab->objects < 2 || !s->random_seq)
> + return false;
> +
> + freelist_count = oo_objects(s->oo);
> +
> + if (allow_spin) {
> + pos = get_random_u32_below(freelist_count);
> + } else {
> + struct rnd_state *state;
> +
> + /*
> + * An interrupt or NMI handler might interrupt and change
> + * the state in the middle, but that's safe.
> + */
> + state = &get_cpu_var(slab_rnd_state);
> + pos = prandom_u32_state(state) % freelist_count;
> + put_cpu_var(slab_rnd_state);
> + }
> +
> + page_limit = slab->objects * s->size;
> + start = fixup_red_left(s, slab_address(slab));
> +
> + while (allocated < slab->objects) {
> + next = next_freelist_entry(s, &pos, start, page_limit,
> + freelist_count);
> + next = setup_object(s, next);
> + p[allocated] = next;
> + maybe_wipe_obj_freeptr(s, next);
> + allocated++;
> + }
> +
> + *allocatedp = allocated;
It seems we do not need to return the allocated count through allocatedp,
since the count should always be slab->objects.
> + return true;
> +}
> +
> #else
> static inline int init_cache_random_seq(struct kmem_cache *s)
> {
> @@ -3410,6 +3457,14 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
> {
> return false;
> }
> +
> +static inline bool alloc_whole_from_new_slab_random(struct kmem_cache *s,
> + struct slab *slab, void **p,
> + bool allow_spin,
> + unsigned int *allocatedp)
> +{
> + return false;
> +}
> #endif /* CONFIG_SLAB_FREELIST_RANDOM */
>
> static __always_inline void account_slab(struct slab *slab, int order,
> @@ -3438,7 +3493,8 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
> -(PAGE_SIZE << order));
> }
>
> -static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node,
> + bool build_freelist, bool *allow_spinp)
> {
> bool allow_spin = gfpflags_allow_spinning(flags);
> struct slab *slab;
> @@ -3446,7 +3502,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> gfp_t alloc_gfp;
> void *start, *p, *next;
> int idx;
> - bool shuffle;
> + bool shuffle = false;
> +
> + if (allow_spinp)
> + *allow_spinp = allow_spin;
It seems unnecessary for allocate_slab() to compute allow_spin and return it
via allow_spinp.
We could instead calculate it directly in refill_objects() based on gfp.
>
> flags &= gfp_allowed_mask;
>
> @@ -3483,6 +3542,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> slab->frozen = 0;
>
> slab->slab_cache = s;
> + slab->freelist = NULL;
>
> kasan_poison_slab(slab);
>
> @@ -3497,9 +3557,10 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> alloc_slab_obj_exts_early(s, slab);
> account_slab(slab, oo_order(oo), s, flags);
>
> - shuffle = shuffle_freelist(s, slab, allow_spin);
> + if (build_freelist)
> + shuffle = shuffle_freelist(s, slab, allow_spin);
>
> - if (!shuffle) {
> + if (build_freelist && !shuffle) {
> start = fixup_red_left(s, start);
> start = setup_object(s, start);
> slab->freelist = start;
> @@ -3515,7 +3576,8 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> return slab;
> }
>
> -static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node,
> + bool build_freelist, bool *allow_spinp)
> {
> if (unlikely(flags & GFP_SLAB_BUG_MASK))
> flags = kmalloc_fix_flags(flags);
> @@ -3523,7 +3585,8 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
>
> return allocate_slab(s,
> - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
> + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK),
> + node, build_freelist, allow_spinp);
> }
>
> static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin)
> @@ -4395,6 +4458,48 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
> return allocated;
> }
>
> +static unsigned int alloc_whole_from_new_slab(struct kmem_cache *s,
> + struct slab *slab, void **p, bool allow_spin)
> +{
> +
> + unsigned int allocated = 0;
> + void *object, *start;
> +
> + if (alloc_whole_from_new_slab_random(s, slab, p, allow_spin,
> + &allocated)) {
> + goto done;
> + }
> +
> + start = fixup_red_left(s, slab_address(slab));
> + object = setup_object(s, start);
> +
> + while (allocated < slab->objects - 1) {
> + p[allocated] = object;
> + maybe_wipe_obj_freeptr(s, object);
> +
> + allocated++;
> + object += s->size;
> + object = setup_object(s, object);
> + }
Also, I feel the current patch contains some duplicated code like this loop.
Would it make sense to split allocate_slab() into two functions?
For example, the first part could be called allocate_slab_meta_setup()
(just an example name), and the second part could be
allocate_slab_objects_setup(), with the core logic being the loop over
objects. Then allocate_slab_objects_setup() could support two modes: one
called BUILD_FREELIST, which builds the freelist, and another called
EMIT_OBJECTS, which skips building the freelist and directly places the
objects into the target array.
> +
> + p[allocated] = object;
> + maybe_wipe_obj_freeptr(s, object);
> + allocated++;
> +
> +done:
> + slab->freelist = NULL;
> + slab->inuse = slab->objects;
> + inc_slabs_node(s, slab_nid(slab), slab->objects);
> +
> + return allocated;
> +}
> +
> +static inline bool bulk_refill_consumes_whole_slab(struct kmem_cache *s,
> + unsigned int count)
> +{
> + return count >= oo_objects(s->oo);
It seems using s->oo here may be a bit too strict. In allocate_slab(), the
object count can fall back to s->min, so basing the check on the actual
per-slab object count (e.g. oo_objects(s->min), or the allocated slab's
slab->objects) might be more reasonable (if I understand correctly...).
> +}
> +
> /*
> * Slow path. We failed to allocate via percpu sheaves or they are not available
> * due to bootstrap or debugging enabled or SLUB_TINY.
> @@ -4441,7 +4546,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> if (object)
> goto success;
>
> - slab = new_slab(s, pc.flags, node);
> + slab = new_slab(s, pc.flags, node, true, NULL);
>
> if (unlikely(!slab)) {
> if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
> @@ -7244,18 +7349,30 @@ refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
>
> new_slab:
>
> - slab = new_slab(s, gfp, local_node);
> - if (!slab)
> - goto out;
> -
> - stat(s, ALLOC_SLAB);
> -
> /*
> - * TODO: possible optimization - if we know we will consume the whole
> - * slab we might skip creating the freelist?
> + * If the remaining bulk allocation is large enough to consume
> + * an entire slab, avoid building the freelist only to drain it
> + * immediately. Instead, allocate a slab without a freelist and
> + * hand out all objects directly.
> */
> - refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
> - /* allow_spin = */ true);
> + if (bulk_refill_consumes_whole_slab(s, max - refilled)) {
> + bool allow_spin;
> +
> + slab = new_slab(s, gfp, local_node, false, &allow_spin);
> + if (!slab)
> + goto out;
> + stat(s, ALLOC_SLAB);
> + refilled += alloc_whole_from_new_slab(s, slab, p + refilled,
> + allow_spin);
> + } else {
> + slab = new_slab(s, gfp, local_node, true, NULL);
> + if (!slab)
> + goto out;
> + stat(s, ALLOC_SLAB);
> + refilled += alloc_from_new_slab(s, slab, p + refilled,
> + max - refilled,
> + /* allow_spin = */ true);
> + }
>
> if (refilled < min)
> goto new_slab;
> @@ -7587,7 +7704,7 @@ static void early_kmem_cache_node_alloc(int node)
>
> BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
>
> - slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
> + slab = new_slab(kmem_cache_node, GFP_NOWAIT, node, true, NULL);
>
> BUG_ON(!slab);
> if (slab_nid(slab) != node) {
> --
> 2.25.1