Re: [PATCH RFC] futex: avoid false sharing between hb->chain and the bucket lock

From: Peter Zijlstra

Date: Wed Jun 10 2026 - 09:57:16 EST


On Wed, Jun 10, 2026 at 01:25:46PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 10, 2026 at 01:22:34PM +0200, Thomas Gleixner wrote:
> > On Tue, Jun 09 2026 at 22:18, Peter Zijlstra wrote:
> > > On Tue, Jun 09, 2026 at 10:11:17PM +0200, Peter Zijlstra wrote:
> > >> Anyway, how does something like the below work for you? It's a total
> > >> hack job, but it (sorta) builds and runs.
> > >>
> > >
> > > Please use this one, I spotted a silly bug.
> >
> > So I ran this on two machines.
> >
> > SKL dual socket 112 threads:
> >
> > Baseline Patched
> >
> > shared (16k) 1571857 1641435 + 4.4%
> > autosize (512) 646390 903371 +39.7%
> > -b 256 464395 587014 +26.4%
> > -b 512 715687 995943 +39.2%
> > -b 1024 995085 1396328 +40.3%
> > -b 2048 1293114 1668395 +29.0%
> > -b 4096 2124438 2240228 + 5.5%
> >
> > Zen3 dual socket 256 threads:
> >
> > Baseline Patched
> >
> > shared (16k) 1275840 1381279 + 8.2%
> > autosize (512) 1252745 1482179 +18.3%
> > -b 256 856274 955455 +11.5%
> > -b 512 1267490 1544010 +21.8%
> > -b 1024 1424013 1625424 +14.1%
> > -b 2048 1505181 1669342 +10.9%
> > -b 4096 1465993 1688932 +15.2%
>
> I suppose that means I'd better go make it prettier and survive
> randconfig :-)

I ended up with the below. I'll invent a Changelog and throw it to the
robots later tonight.

---
kernel/futex/core.c | 108 +++++++++++++-----------------------------------
kernel/futex/futex.h | 37 ++++++++++-------
kernel/futex/pi.c | 21 +++++-----
kernel/futex/requeue.c | 20 ++++-----
kernel/futex/waitwake.c | 17 +++++---
5 files changed, 84 insertions(+), 119 deletions(-)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ff2a4fb2993f..844fe893646f 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -124,7 +124,7 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */

static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);

#ifdef CONFIG_FUTEX_PRIVATE_HASH
static bool futex_ref_get(struct futex_private_hash *fph);
@@ -133,15 +133,6 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph);

enum { FR_PERCPU = 0, FR_ATOMIC };

-static inline bool futex_key_is_private(union futex_key *key)
-{
- /*
- * Relies on get_futex_key() to set either bit for shared
- * futexes -- see comment with union futex_key.
- */
- return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
-}
-
static bool futex_private_hash_get(struct futex_private_hash *fph)
{
return futex_ref_get(fph);
@@ -149,48 +140,15 @@ static bool futex_private_hash_get(struct futex_private_hash *fph)

void futex_private_hash_put(struct futex_private_hash *fph)
{
- if (futex_ref_put(fph))
+ if (fph && futex_ref_put(fph))
wake_up_var(fph->mm);
}

-/**
- * futex_hash_get - Get an additional reference for the local hash.
- * @hb: ptr to the private local hash.
- *
- * Obtain an additional reference for the already obtained hash bucket. The
- * caller must already own an reference.
- */
-void futex_hash_get(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- WARN_ON_ONCE(!futex_private_hash_get(fph));
-}
-
-void futex_hash_put(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- futex_private_hash_put(fph);
-}
-
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
u32 hash;

- if (!futex_key_is_private(key))
- return NULL;
-
- if (!fph)
- fph = rcu_dereference(key->private.mm->futex_phash);
- if (!fph || !fph->hash_mask)
- return NULL;
-
hash = jhash2((void *)&key->private.address,
sizeof(key->private.address) / 4,
key->both.offset);
@@ -211,13 +169,12 @@ static void futex_rehash_private(struct futex_private_hash *old,

spin_lock(&hb_old->lock);
plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
-
plist_del(&this->list, &hb_old->chain);
futex_hb_waiters_dec(hb_old);

WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);

- hb_new = __futex_hash(&this->key, new);
+ hb_new = __futex_hash(&this->key, new, NULL);
futex_hb_waiters_inc(hb_new);
/*
* The new pointer isn't published yet but an already
@@ -271,9 +228,8 @@ static void futex_pivot_hash(struct mm_struct *mm)
}
}

-struct futex_private_hash *futex_private_hash(void)
+struct futex_private_hash *futex_private_hash(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
/*
* Ideally we don't loop. If there is a replacement in progress
* then a new private hash is already prepared and a reference can't be
@@ -299,18 +255,17 @@ struct futex_private_hash *futex_private_hash(void)
goto again;
}

-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
{
- struct futex_private_hash *fph;
- struct futex_hash_bucket *hb;
-
again:
scoped_guard(rcu) {
- hb = __futex_hash(key, NULL);
- fph = hb->priv;
+ struct futex_private_hash *fph = NULL;
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash(key, NULL, &fph);

if (!fph || futex_private_hash_get(fph))
- return hb;
+ return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
}
futex_pivot_hash(key->private.mm);
goto again;
@@ -318,15 +273,9 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)

#else /* !CONFIG_FUTEX_PRIVATE_HASH */

-static struct futex_hash_bucket *
-__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+struct futex_bucket_ref futex_hash(union futex_key *key)
{
- return NULL;
-}
-
-struct futex_hash_bucket *futex_hash(union futex_key *key)
-{
- return __futex_hash(key, NULL);
+ return (struct futex_bucket_ref){ .hb = __futex_hash(key, NULL, NULL), .fph = NULL };
}

#endif /* CONFIG_FUTEX_PRIVATE_HASH */
@@ -412,18 +361,22 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
* global hash is returned.
*/
static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
{
int node = key->both.node;
u32 hash;

- if (node == FUTEX_NO_NODE) {
- struct futex_hash_bucket *hb;
-
- hb = __futex_hash_private(key, fph);
- if (hb)
- return hb;
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (fph && fph->hash_mask) {
+ if (fph_p)
+ *fph_p = fph;
+ return __futex_hash_private(key, fph);
+ }
}
+#endif

hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / sizeof(u32),
@@ -1336,7 +1289,7 @@ static void exit_pi_state_list(struct task_struct *curr)
* on the mutex.
*/
WARN_ON(curr != current);
- guard(private_hash)();
+ guard(private_hash)(current->mm);
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1348,7 +1301,8 @@ static void exit_pi_state_list(struct task_struct *curr)
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
if (1) {
- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;

/*
* We can race against put_pi_state() removing itself from the
@@ -1516,12 +1470,8 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
- struct futex_private_hash *fph)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
{
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
- fhb->priv = fph;
-#endif
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
@@ -1822,7 +1772,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
fph->mm = mm;

for (i = 0; i < hash_slots; i++)
- futex_hash_bucket_init(&fph->queues[i], fph);
+ futex_hash_bucket_init(&fph->queues[i]);

if (custom) {
/*
@@ -2001,7 +1951,7 @@ static int __init futex_init(void)
BUG_ON(!table);

for (i = 0; i < hashsize; i++)
- futex_hash_bucket_init(&table[i], NULL);
+ futex_hash_bucket_init(&table[i]);

futex_queues[n] = table;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 9f6bf6f585fc..7aec5e85e039 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -126,6 +126,15 @@ static inline bool should_fail_futex(bool fshared)
}
#endif

+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
@@ -135,7 +144,6 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
- struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;

/*
@@ -175,7 +183,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi()
- * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
+ * @drop_fph: Waiter should drop the extra private hash reference when set
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -202,7 +210,7 @@ struct futex_q {
union futex_key *requeue_pi_key;
u32 bitset;
atomic_t requeue_state;
- bool drop_hb_ref;
+ struct futex_private_hash *drop_fph;
#ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait;
#endif
@@ -222,28 +230,29 @@ extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);

-extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-#ifdef CONFIG_FUTEX_PRIVATE_HASH
-extern void futex_hash_get(struct futex_hash_bucket *hb);
-extern void futex_hash_put(struct futex_hash_bucket *hb);
+struct futex_bucket_ref {
+ struct futex_hash_bucket *hb;
+ struct futex_private_hash *fph;
+};

-extern struct futex_private_hash *futex_private_hash(void);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern struct futex_private_hash *futex_private_hash(struct mm_struct *mm);
extern void futex_private_hash_put(struct futex_private_hash *fph);

#else /* !CONFIG_FUTEX_PRIVATE_HASH */
-static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
-static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
-static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline struct futex_private_hash *futex_private_hash(struct mm_struct *mm) { return NULL; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif

-DEFINE_CLASS(hb, struct futex_hash_bucket *,
- if (_T) futex_hash_put(_T),
+extern struct futex_bucket_ref futex_hash(union futex_key *key);
+
+DEFINE_CLASS(hbr, struct futex_bucket_ref,
+ if (_T.fph) futex_private_hash_put(_T.fph),
futex_hash(key), union futex_key *key);

DEFINE_CLASS(private_hash, struct futex_private_hash *,
if (_T) futex_private_hash_put(_T),
- futex_private_hash(), void);
+ futex_private_hash(mm), struct mm_struct *mm);

/**
* futex_match - Check whether two futex keys are equal
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 643199fdbe62..acc8f715c7da 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -945,7 +945,8 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl

retry_private:
if (1) {
- CLASS(hb, hb)(&q.key);
+ CLASS(hbr, hbr)(&q.key);
+ auto hb = hbr.hb;

futex_q_lock(&q, hb);

@@ -1009,7 +1010,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
* the thread, performing resize, will block on hb->lock during
* the requeue.
*/
- futex_hash_put(no_free_ptr(hb));
+ futex_private_hash_put(no_free_ptr(hbr.fph));
/*
* Must be done before we enqueue the waiter, here is unfortunately
* under the hb lock, but that *should* work because it does nothing.
@@ -1100,11 +1101,9 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
__release(&hb->lock);
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
- if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
- /* Additional reference from futex_unlock_pi() */
- futex_hash_put(hb);
- }
+
+ /* Additional reference from futex_unlock_pi() */
+ futex_private_hash_put(q.drop_fph);
goto out;

out_unlock_put_key:
@@ -1162,7 +1161,8 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if (ret)
return ret;

- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;
spin_lock(&hb->lock);
retry_hb:

@@ -1219,8 +1219,9 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* Acquire a reference for the leaving waiter to ensure
* valid futex_q::lock_ptr.
*/
- futex_hash_get(hb);
- top_waiter->drop_hb_ref = true;
+ if (futex_key_is_private(&key))
+ top_waiter->drop_fph = futex_private_hash(key.private.mm);
+
__futex_unqueue(top_waiter);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto retry_hb;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 1d99a84dc9ad..7384672916fb 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -241,8 +241,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* Acquire a reference for the waiter to ensure valid
* futex_q::lock_ptr.
*/
- futex_hash_get(hb);
- q->drop_hb_ref = true;
+ if (futex_key_is_private(key))
+ q->drop_fph = futex_private_hash(key->private.mm);
q->lock_ptr = &hb->lock;
task = READ_ONCE(q->task);

@@ -459,8 +459,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,

retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hbr, hbr1)(&key1);
+ CLASS(hbr, hbr2)(&key2);
+ auto hb1 = hbr1.hb;
+ auto hb2 = hbr2.hb;

futex_hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
@@ -838,7 +840,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
{
- CLASS(hb, hb)(&q.key);
+ CLASS(hbr, hbr)(&q.key);
+ auto hb = hbr.hb;
/* The waiter is still on uaddr1 */
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, to);
@@ -908,11 +911,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default:
BUG();
}
- if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
- /* Additional reference from requeue_pi_wake_futex() */
- futex_hash_put(hb);
- }
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_private_hash_put(q.drop_fph);

out:
if (to) {
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceed9d879059..1f15fff4ada5 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -169,7 +169,8 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;

- CLASS(hb, hb)(&key);
+ CLASS(hbr, hbr)(&key);
+ auto hb = hbr.hb;

/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -266,8 +267,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,

retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hbr, hbr1)(&key1);
+ CLASS(hbr, hbr2)(&key2);
+ auto hb1 = hbr1.hb;
+ auto hb2 = hbr2.hb;

double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
@@ -409,7 +412,7 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
* Make sure to have a reference on the private_hash such that we
* don't block on rehash after changing the task state below.
*/
- guard(private_hash)();
+ guard(private_hash)(current->mm);

/*
* Enqueuing multiple futexes is tricky, because we need to enqueue
@@ -446,7 +449,8 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
u32 val = vs[i].w.val;

if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hbr, hbr)(&q->key);
+ auto hb = hbr.hb;

futex_q_lock(q, hb);
ret = futex_get_value_locked(&uval, uaddr);
@@ -621,7 +625,8 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,

retry_private:
if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hbr, hbr)(&q->key);
+ auto hb = hbr.hb;

futex_q_lock(q, hb);