[PATCH v4 4/4] slub: apply new pw_queue_on() interface

From: Leonardo Bras

Date: Mon May 18 2026 - 21:31:15 EST

Make use of the new pw_{un,}lock*() and pw_queue_on() interface to improve
performance & latency.

For functions that may be scheduled on a different CPU, replace
local_{un,}lock*() with pw_{un,}lock*(), and replace queue_work_on() with
pw_queue_on(). Likewise, flush_work() is replaced by pw_flush().

This change requires allocation of pw_structs instead of work_structs,
and adding a cpu parameter to a few functions.

This should bring no relevant performance impact on non-PWLOCKS kernels:
for functions that may be scheduled on a different CPU, the local_*lock's
this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()).

Signed-off-by: Leonardo Bras <leobras.c@xxxxxxxxx>
Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
---
mm/slub.c | 142 +++++++++++++++++++++++++++---------------------------
1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 8f9004536729..a154d20e78f7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -43,20 +43,21 @@
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <linux/prandom.h>
#include <kunit/test.h>
#include <kunit/test-bug.h>
#include <linux/sort.h>
#include <linux/irq_work.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
+#include <linux/pwlocks.h>
#include <trace/events/kmem.h>

#include "internal.h"

/*
* Lock order:
* 0. cpu_hotplug_lock
* 1. slab_mutex (Global Mutex)
* 2a. kmem_cache->cpu_sheaves->lock (Local trylock)
* 2b. barn->lock (Spinlock)
@@ -122,21 +123,21 @@
* (Note that the total number of slabs is an atomic value that may be
* modified without taking the list lock).
*
* The list_lock is a centralized lock and thus we avoid taking it as
* much as possible. As long as SLUB does not have to handle partial
* slabs, operations can continue without any centralized lock.
*
* For debug caches, all allocations are forced to go through a list_lock
* protected region to serialize against concurrent validation.
*
- * cpu_sheaves->lock (local_trylock)
+ * cpu_sheaves->lock (pw_trylock)
*
* This lock protects fastpath operations on the percpu sheaves. On !RT it
* only disables preemption and does no atomic operations. As long as the main
* or spare sheaf can handle the allocation or free, there is no other
* overhead.
*
* barn->lock (spinlock)
*
* This lock protects the operations on per-NUMA-node barn. It can quickly
* serve an empty or full sheaf if available, and avoid more expensive refill
@@ -150,21 +151,21 @@
* cmpxchg_double this is done by a lockless update of slab's freelist and
* counters, otherwise slab_lock is taken. This only needs to take the
* list_lock if it's a first free to a full slab, or when a slab becomes empty
* after the free.
*
* irq, preemption, migration considerations
*
* Interrupts are disabled as part of list_lock or barn lock operations, or
* around the slab_lock operation, in order to make the slab allocator safe
* to use in the context of an irq.
- * Preemption is disabled as part of local_trylock operations.
+ * Preemption is disabled as part of pw_trylock operations.
* kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
* their limitations.
*
* SLUB assigns two object arrays called sheaves for caching allocations and
* frees on each cpu, with a NUMA node shared barn for balancing between cpus.
* Allocations and frees are primarily served from these sheaves.
*
* Slabs with free elements are kept on a partial list and during regular
* operations no list for full slabs is used. If an object in a full slab is
* freed then the slab will show up again on the partial lists.
@@ -411,21 +412,21 @@ struct slab_sheaf {
bool pfmemalloc;
};
};
struct kmem_cache *cache;
unsigned int size;
int node; /* only used for rcu_sheaf */
void *objects[];
};

struct slub_percpu_sheaves {
- local_trylock_t lock;
+ pw_trylock_t lock;
struct slab_sheaf *main; /* never NULL when unlocked */
struct slab_sheaf *spare; /* empty or full, may be NULL */
struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
};

/*
* The slab lists for all objects.
*/
struct kmem_cache_node {
spinlock_t list_lock;
@@ -477,21 +478,21 @@ static nodemask_t slab_nodes;
* Corresponds to N_ONLINE nodes.
*/
static nodemask_t slab_barn_nodes;

/*
* Workqueue used for flushing cpu and kfree_rcu sheaves.
*/
static struct workqueue_struct *flushwq;

struct slub_flush_work {
- struct work_struct work;
+ struct pw_struct pw;
struct kmem_cache *s;
bool skip;
};

static DEFINE_MUTEX(flush_lock);
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);

/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -2838,74 +2839,74 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
* Free all objects from the main sheaf. In order to perform
* __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
* object pointers are moved to a on-stack array under the lock. To bound the
* stack usage, limit each batch to PCS_BATCH_MAX.
*
* Must be called with s->cpu_sheaves->lock locked, returns with the lock
* unlocked.
*
* Returns how many objects are remaining to be flushed
*/
-static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s)
+static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s, int cpu)
{
struct slub_percpu_sheaves *pcs;
unsigned int batch, remaining;
void *objects[PCS_BATCH_MAX];
struct slab_sheaf *sheaf;

- lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
-
- pcs = this_cpu_ptr(s->cpu_sheaves);
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
sheaf = pcs->main;

batch = min(PCS_BATCH_MAX, sheaf->size);

sheaf->size -= batch;
memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));

remaining = sheaf->size;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock(&s->cpu_sheaves->lock, cpu);

__kmem_cache_free_bulk(s, batch, &objects[0]);

stat_add(s, SHEAF_FLUSH, batch);

return remaining;
}

-static void sheaf_flush_main(struct kmem_cache *s)
+static void sheaf_flush_main(struct kmem_cache *s, int cpu)
{
unsigned int remaining;

do {
- local_lock(&s->cpu_sheaves->lock);
+ pw_lock(&s->cpu_sheaves->lock, cpu);

- remaining = __sheaf_flush_main_batch(s);
+ remaining = __sheaf_flush_main_batch(s, cpu);

} while (remaining);
}

/*
* Returns true if the main sheaf was at least partially flushed.
*/
static bool sheaf_try_flush_main(struct kmem_cache *s)
{
unsigned int remaining;
bool ret = false;

do {
- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
return ret;

ret = true;
- remaining = __sheaf_flush_main_batch(s);
+
+ pw_lockdep_assert_held(&s->cpu_sheaves->lock);
+ remaining = __sheaf_flush_main_batch(s, smp_processor_id());

} while (remaining);

return ret;
}

/*
* Free all objects from a sheaf that's unused, i.e. not linked to any
* cpu_sheaves, so we need no locking and batching. The locking is also not
* necessary when flushing cpu's sheaves (both spare and main) during cpu
@@ -2968,45 +2969,45 @@ static void rcu_free_sheaf_nobarn(struct rcu_head *head)

/*
* Caller needs to make sure migration is disabled in order to fully flush
* single cpu's sheaves
*
* must not be called from an irq
*
* flushing operations are rare so let's keep it simple and flush to slabs
* directly, skipping the barn
*/
-static void pcs_flush_all(struct kmem_cache *s)
+static void pcs_flush_all(struct kmem_cache *s, int cpu)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *spare, *rcu_free;

- local_lock(&s->cpu_sheaves->lock);
- pcs = this_cpu_ptr(s->cpu_sheaves);
+ pw_lock(&s->cpu_sheaves->lock, cpu);
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

spare = pcs->spare;
pcs->spare = NULL;

rcu_free = pcs->rcu_free;
pcs->rcu_free = NULL;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock(&s->cpu_sheaves->lock, cpu);

if (spare) {
sheaf_flush_unused(s, spare);
free_empty_sheaf(s, spare);
}

if (rcu_free)
call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);

- sheaf_flush_main(s);
+ sheaf_flush_main(s, cpu);
}

static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
{
struct slub_percpu_sheaves *pcs;

pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

/* The cpu is not executing anymore so we don't need pcs->lock */
sheaf_flush_unused(s, pcs->main);
@@ -3942,83 +3943,84 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)

/*
* Flush percpu sheaves
*
* Called from CPU work handler with migration disabled.
*/
static void flush_cpu_sheaves(struct work_struct *w)
{
struct kmem_cache *s;
struct slub_flush_work *sfw;
+ int cpu = pw_get_cpu(w);

- sfw = container_of(w, struct slub_flush_work, work);
-
+ sfw = &per_cpu(slub_flush, cpu);
s = sfw->s;

if (cache_has_sheaves(s))
- pcs_flush_all(s);
+ pcs_flush_all(s, cpu);
}

static void flush_all_cpus_locked(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
unsigned int cpu;

lockdep_assert_cpus_held();
mutex_lock(&flush_lock);

for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
if (!has_pcs_used(cpu, s)) {
sfw->skip = true;
continue;
}
- INIT_WORK(&sfw->work, flush_cpu_sheaves);
+ INIT_PW(&sfw->pw, flush_cpu_sheaves, cpu);
sfw->skip = false;
sfw->s = s;
- queue_work_on(cpu, flushwq, &sfw->work);
+ pw_queue_on(cpu, flushwq, &sfw->pw);
}

for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
if (sfw->skip)
continue;
- flush_work(&sfw->work);
+ pw_flush(&sfw->pw);
}

mutex_unlock(&flush_lock);
}

static void flush_all(struct kmem_cache *s)
{
cpus_read_lock();
flush_all_cpus_locked(s);
cpus_read_unlock();
}

static void flush_rcu_sheaf(struct work_struct *w)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *rcu_free;
struct slub_flush_work *sfw;
struct kmem_cache *s;
+ int cpu = pw_get_cpu(w);

- sfw = container_of(w, struct slub_flush_work, work);
+ sfw = &per_cpu(slub_flush, cpu);
s = sfw->s;

- local_lock(&s->cpu_sheaves->lock);
- pcs = this_cpu_ptr(s->cpu_sheaves);
+ pw_lock(&s->cpu_sheaves->lock, cpu);
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

rcu_free = pcs->rcu_free;
pcs->rcu_free = NULL;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock(&s->cpu_sheaves->lock, cpu);

if (rcu_free)
call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
}

/* needed for kvfree_rcu_barrier() */
void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
@@ -4029,28 +4031,28 @@ void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);

/*
* we don't check if rcu_free sheaf exists - racing
* __kfree_rcu_sheaf() might have just removed it.
* by executing flush_rcu_sheaf() on the cpu we make
* sure the __kfree_rcu_sheaf() finished its call_rcu()
*/

- INIT_WORK(&sfw->work, flush_rcu_sheaf);
+ INIT_PW(&sfw->pw, flush_rcu_sheaf, cpu);
sfw->s = s;
- queue_work_on(cpu, flushwq, &sfw->work);
+ pw_queue_on(cpu, flushwq, &sfw->pw);
}

for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
- flush_work(&sfw->work);
+ pw_flush(&sfw->pw);
}

mutex_unlock(&flush_lock);
}

void flush_all_rcu_sheaves(void)
{
struct kmem_cache *s;

cpus_read_lock();
@@ -4589,36 +4591,36 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
* unlocked.
*/
static struct slub_percpu_sheaves *
__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
{
struct slab_sheaf *empty = NULL;
struct slab_sheaf *full;
struct node_barn *barn;
bool allow_spin;

- lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+ pw_lockdep_assert_held(&s->cpu_sheaves->lock);

/* Bootstrap or debug cache, back off */
if (unlikely(!cache_has_sheaves(s))) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return NULL;
}

if (pcs->spare && pcs->spare->size > 0) {
swap(pcs->main, pcs->spare);
return pcs;
}

barn = get_barn(s);
if (!barn) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return NULL;
}

allow_spin = gfpflags_allow_spinning(gfp);

full = barn_replace_empty_sheaf(barn, pcs->main, allow_spin);

if (full) {
stat(s, BARN_GET);
pcs->main = full;
@@ -4629,21 +4631,21 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,

if (allow_spin) {
if (pcs->spare) {
empty = pcs->spare;
pcs->spare = NULL;
} else {
empty = barn_get_empty_sheaf(barn, true);
}
}

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
pcs = NULL;

if (!allow_spin)
return NULL;

if (!empty) {
empty = alloc_empty_sheaf(s, gfp);
if (!empty)
return NULL;
}
@@ -4655,21 +4657,21 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
*/
sheaf_flush_unused(s, empty);
free_empty_sheaf(s, empty);

return NULL;
}

full = empty;
empty = NULL;

- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
goto barn_put;
pcs = this_cpu_ptr(s->cpu_sheaves);

/*
* If we put any empty or full sheaf to the barn below, it's due to
* racing or being migrated to a different cpu. Breaching the barn's
* sheaf limits should be thus rare enough so just ignore them to
* simplify the recovery.
*/

@@ -4733,121 +4735,121 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)

/*
* We assume the percpu sheaves contain only local objects although it's
* not completely guaranteed, so we verify later.
*/
if (unlikely(node_requested && node != numa_mem_id())) {
stat(s, ALLOC_NODE_MISMATCH);
return NULL;
}

- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
return NULL;

pcs = this_cpu_ptr(s->cpu_sheaves);

if (unlikely(pcs->main->size == 0)) {
pcs = __pcs_replace_empty_main(s, pcs, gfp);
if (unlikely(!pcs))
return NULL;
}

object = pcs->main->objects[pcs->main->size - 1];

if (unlikely(node_requested)) {
/*
* Verify that the object was from the node we want. This could
* be false because of cpu migration during an unlocked part of
* the current allocation or previous freeing process.
*/
if (page_to_nid(virt_to_page(object)) != node) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
stat(s, ALLOC_NODE_MISMATCH);
return NULL;
}
}

pcs->main->size--;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

stat(s, ALLOC_FASTPATH);

return object;
}

static __fastpath_inline
unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size,
void **p)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *main;
unsigned int allocated = 0;
unsigned int batch;

next_batch:
- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
return allocated;

pcs = this_cpu_ptr(s->cpu_sheaves);

if (unlikely(pcs->main->size == 0)) {

struct slab_sheaf *full;
struct node_barn *barn;

if (unlikely(!cache_has_sheaves(s))) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return allocated;
}

if (pcs->spare && pcs->spare->size > 0) {
swap(pcs->main, pcs->spare);
goto do_alloc;
}

barn = get_barn(s);
if (!barn) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return allocated;
}

full = barn_replace_empty_sheaf(barn, pcs->main,
gfpflags_allow_spinning(gfp));

if (full) {
stat(s, BARN_GET);
pcs->main = full;
goto do_alloc;
}

stat(s, BARN_GET_FAIL);

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

/*
* Once full sheaves in barn are depleted, let the bulk
* allocation continue from slab pages, otherwise we would just
* be copying arrays of pointers twice.
*/
return allocated;
}

do_alloc:

main = pcs->main;
batch = min(size, main->size);

main->size -= batch;
memcpy(p, main->objects + main->size, batch * sizeof(void *));

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

stat_add(s, ALLOC_FASTPATH, batch);

allocated += batch;

if (batch < size) {
p += batch;
size -= batch;
goto next_batch;
}
@@ -5017,40 +5019,40 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
&sheaf->objects[0])) {
kfree(sheaf);
return NULL;
}

sheaf->size = size;

return sheaf;
}

- local_lock(&s->cpu_sheaves->lock);
+ pw_lock_local(&s->cpu_sheaves->lock);
pcs = this_cpu_ptr(s->cpu_sheaves);

if (pcs->spare) {
sheaf = pcs->spare;
pcs->spare = NULL;
stat(s, SHEAF_PREFILL_FAST);
} else {
barn = get_barn(s);

stat(s, SHEAF_PREFILL_SLOW);
if (barn)
sheaf = barn_get_full_or_empty_sheaf(barn);
if (sheaf && sheaf->size)
stat(s, BARN_GET);
else
stat(s, BARN_GET_FAIL);
}

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

if (!sheaf)
sheaf = alloc_empty_sheaf(s, gfp);

if (sheaf) {
sheaf->capacity = s->sheaf_capacity;
sheaf->pfmemalloc = false;

if (sheaf->size < size &&
@@ -5080,31 +5082,31 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
struct slub_percpu_sheaves *pcs;
struct node_barn *barn;

if (unlikely((sheaf->capacity != s->sheaf_capacity)
|| sheaf->pfmemalloc)) {
sheaf_flush_unused(s, sheaf);
kfree(sheaf);
return;
}

- local_lock(&s->cpu_sheaves->lock);
+ pw_lock_local(&s->cpu_sheaves->lock);
pcs = this_cpu_ptr(s->cpu_sheaves);
barn = get_barn(s);

if (!pcs->spare) {
pcs->spare = sheaf;
sheaf = NULL;
stat(s, SHEAF_RETURN_FAST);
}

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

if (!sheaf)
return;

stat(s, SHEAF_RETURN_SLOW);

/*
* If the barn has too many full sheaves or we fail to refill the sheaf,
* simply flush and free it.
*/
@@ -5627,21 +5629,21 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* An alternative scenario that gets us here is when we fail
* barn_replace_full_sheaf(), because there's no empty sheaf available in the
* barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
* limit on full sheaves was not exceeded, we assume it didn't change and just
* put the full sheaf there.
*/
static void __pcs_install_empty_sheaf(struct kmem_cache *s,
struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
struct node_barn *barn)
{
- lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+ pw_lockdep_assert_held(&s->cpu_sheaves->lock);

/* This is what we expect to find if nobody interrupted us. */
if (likely(!pcs->spare)) {
pcs->spare = pcs->main;
pcs->main = empty;
return;
}

/*
* Unlikely because if the main sheaf had space, we would have just
@@ -5678,31 +5680,31 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
*/
static struct slub_percpu_sheaves *
__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
bool allow_spin)
{
struct slab_sheaf *empty;
struct node_barn *barn;
bool put_fail;

restart:
- lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+ pw_lockdep_assert_held(&s->cpu_sheaves->lock);

/* Bootstrap or debug cache, back off */
if (unlikely(!cache_has_sheaves(s))) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return NULL;
}

barn = get_barn(s);
if (!barn) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
return NULL;
}

put_fail = false;

if (!pcs->spare) {
empty = barn_get_empty_sheaf(barn, allow_spin);
if (empty) {
pcs->spare = pcs->main;
pcs->main = empty;
@@ -5725,107 +5727,107 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
}

/* sheaf_flush_unused() doesn't support !allow_spin */
if (PTR_ERR(empty) == -E2BIG && allow_spin) {
/* Since we got here, spare exists and is full */
struct slab_sheaf *to_flush = pcs->spare;

stat(s, BARN_PUT_FAIL);

pcs->spare = NULL;
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

sheaf_flush_unused(s, to_flush);
empty = to_flush;
goto got_empty;
}

/*
* We could not replace full sheaf because barn had no empty
* sheaves. We can still allocate it and put the full sheaf in
* __pcs_install_empty_sheaf(), but if we fail to allocate it,
* make sure to count the fail.
*/
put_fail = true;

alloc_empty:
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

/*
* alloc_empty_sheaf() doesn't support !allow_spin and it's
* easier to fall back to freeing directly without sheaves
* than add the support (and to sheaf_flush_unused() above)
*/
if (!allow_spin)
return NULL;

empty = alloc_empty_sheaf(s, GFP_NOWAIT);
if (empty)
goto got_empty;

if (put_fail)
stat(s, BARN_PUT_FAIL);

if (!sheaf_try_flush_main(s))
return NULL;

- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
return NULL;

pcs = this_cpu_ptr(s->cpu_sheaves);

/*
* we flushed the main sheaf so it should be empty now,
* but in case we got preempted or migrated, we need to
* check again
*/
if (pcs->main->size == s->sheaf_capacity)
goto restart;

return pcs;

got_empty:
- if (!local_trylock(&s->cpu_sheaves->lock)) {
+ if (!pw_trylock_local(&s->cpu_sheaves->lock)) {
barn_put_empty_sheaf(barn, empty);
return NULL;
}

pcs = this_cpu_ptr(s->cpu_sheaves);
__pcs_install_empty_sheaf(s, pcs, empty, barn);

return pcs;
}

/*
* Free an object to the percpu sheaves.
* The object is expected to have passed slab_free_hook() already.
*/
static __fastpath_inline
bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
{
struct slub_percpu_sheaves *pcs;

- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
return false;

pcs = this_cpu_ptr(s->cpu_sheaves);

if (unlikely(pcs->main->size == s->sheaf_capacity)) {

pcs = __pcs_replace_full_main(s, pcs, allow_spin);
if (unlikely(!pcs))
return false;
}

pcs->main->objects[pcs->main->size++] = object;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

stat(s, FREE_FASTPATH);

return true;
}

static void rcu_free_sheaf(struct rcu_head *head)
{
struct slab_sheaf *sheaf;
struct node_barn *barn = NULL;
@@ -5898,63 +5900,63 @@ static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG);
bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *rcu_sheaf;

if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return false;

lock_map_acquire_try(&kfree_rcu_sheaf_map);

- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
goto fail;

pcs = this_cpu_ptr(s->cpu_sheaves);

if (unlikely(!pcs->rcu_free)) {

struct slab_sheaf *empty;
struct node_barn *barn;

/* Bootstrap or debug cache, fall back */
if (unlikely(!cache_has_sheaves(s))) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
goto fail;
}

if (pcs->spare && pcs->spare->size == 0) {
pcs->rcu_free = pcs->spare;
pcs->spare = NULL;
goto do_free;
}

barn = get_barn(s);
if (!barn) {
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);
goto fail;
}

empty = barn_get_empty_sheaf(barn, true);

if (empty) {
pcs->rcu_free = empty;
goto do_free;
}

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

empty = alloc_empty_sheaf(s, GFP_NOWAIT);

if (!empty)
goto fail;

- if (!local_trylock(&s->cpu_sheaves->lock)) {
+ if (!pw_trylock_local(&s->cpu_sheaves->lock)) {
barn_put_empty_sheaf(barn, empty);
goto fail;
}

pcs = this_cpu_ptr(s->cpu_sheaves);

if (unlikely(pcs->rcu_free))
barn_put_empty_sheaf(barn, empty);
else
pcs->rcu_free = empty;
@@ -5971,27 +5973,27 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
rcu_sheaf->objects[rcu_sheaf->size++] = obj;

if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
rcu_sheaf = NULL;
} else {
pcs->rcu_free = NULL;
rcu_sheaf->node = numa_node_id();
}

/*
- * we flush before local_unlock to make sure a racing
+ * we flush before pw_unlock_local to make sure a racing
* flush_all_rcu_sheaves() doesn't miss this sheaf
*/
if (rcu_sheaf)
call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

stat(s, FREE_RCU_SHEAF);
lock_map_release(&kfree_rcu_sheaf_map);
return true;

fail:
stat(s, FREE_RCU_SHEAF_FAIL);
lock_map_release(&kfree_rcu_sheaf_map);
return false;
}
@@ -6082,21 +6084,21 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
continue;
}

i++;
}

if (!size)
goto flush_remote;

next_batch:
- if (!local_trylock(&s->cpu_sheaves->lock))
+ if (!pw_trylock_local(&s->cpu_sheaves->lock))
goto fallback;

pcs = this_cpu_ptr(s->cpu_sheaves);

if (likely(pcs->main->size < s->sheaf_capacity))
goto do_free;

barn = get_barn(s);
if (!barn)
goto no_empty;
@@ -6125,37 +6127,37 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
stat(s, BARN_PUT);
pcs->main = empty;

do_free:
main = pcs->main;
batch = min(size, s->sheaf_capacity - main->size);

memcpy(main->objects + main->size, p, batch * sizeof(void *));
main->size += batch;

- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

stat_add(s, FREE_FASTPATH, batch);

if (batch < size) {
p += batch;
size -= batch;
goto next_batch;
}

if (remote_nr)
goto flush_remote;

return;

no_empty:
- local_unlock(&s->cpu_sheaves->lock);
+ pw_unlock_local(&s->cpu_sheaves->lock);

/*
* if we depleted all empty sheaves in the barn or there are too
* many full sheaves, free the rest to slab pages
*/
fallback:
__kmem_cache_free_bulk(s, size, p);
stat_add(s, FREE_SLOWPATH, size);

flush_remote:
@@ -7554,21 +7556,21 @@ static inline int alloc_kmem_cache_stats(struct kmem_cache *s)
static int init_percpu_sheaves(struct kmem_cache *s)
{
static struct slab_sheaf bootstrap_sheaf = {};
int cpu;

for_each_possible_cpu(cpu) {
struct slub_percpu_sheaves *pcs;

pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

- local_trylock_init(&pcs->lock);
+ pw_trylock_init(&pcs->lock);

/*
* Bootstrap sheaf has zero size so fast-path allocation fails.
* It has also size == s->sheaf_capacity, so fast-path free
* fails. In the slow paths we recognize the situation by
* checking s->sheaf_capacity. This allows fast paths to assume
* s->cpu_sheaves and pcs->main always exists and are valid.
* It's also safe to share the single static bootstrap_sheaf
* with zero-sized objects array as it's never modified.
*
--
2.54.0