Re: [PATCH RFC] futex: avoid false sharing between hb->chain and the bucket lock

From: Peter Zijlstra

Date: Tue Jun 09 2026 - 16:13:28 EST


On Tue, Jun 09, 2026 at 08:28:16AM -0700, Breno Leitao wrote:

> > I'm really not sure, it *doubles* the futex memory cost.
>
> I think it's worth the trade. The global hash scales linearly with
> num_possible_cpus(), so the extra bytes track the same curve as the machines
> that actually need the fix
>
> in simpler words, a box big enough to feel this contention has plenty of RAM
> headroom to absorb it.

You might not have heard, but RAM has gotten ludicrously expensive.

Anyway, how does something like the below work for you? It's a total
hack job, but it (sorta) builds and runs.


---
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ff2a4fb2993f..8555c76077af 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -124,7 +124,7 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */

static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);

#ifdef CONFIG_FUTEX_PRIVATE_HASH
static bool futex_ref_get(struct futex_private_hash *fph);
@@ -183,14 +183,6 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
u32 hash;

- if (!futex_key_is_private(key))
- return NULL;
-
- if (!fph)
- fph = rcu_dereference(key->private.mm->futex_phash);
- if (!fph || !fph->hash_mask)
- return NULL;
-
hash = jhash2((void *)&key->private.address,
sizeof(key->private.address) / 4,
key->both.offset);
@@ -211,13 +203,12 @@ static void futex_rehash_private(struct futex_private_hash *old,

spin_lock(&hb_old->lock);
plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
-
plist_del(&this->list, &hb_old->chain);
futex_hb_waiters_dec(hb_old);

WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);

- hb_new = __futex_hash(&this->key, new);
+ hb_new = __futex_hash(&this->key, new, NULL);
futex_hb_waiters_inc(hb_new);
/*
* The new pointer isn't published yet but an already
@@ -299,18 +290,17 @@ struct futex_private_hash *futex_private_hash(void)
goto again;
}

-struct futex_hash_bucket *futex_hash(union futex_key *key)
+struct futex_bucket_ref futex_hash(union futex_key *key)
{
- struct futex_private_hash *fph;
+ struct futex_private_hash *fph = NULL;
struct futex_hash_bucket *hb;

again:
scoped_guard(rcu) {
- hb = __futex_hash(key, NULL);
- fph = hb->priv;
+ hb = __futex_hash(key, NULL, &fph);

if (!fph || futex_private_hash_get(fph))
- return hb;
+ return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
}
futex_pivot_hash(key->private.mm);
goto again;
@@ -412,17 +402,19 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
* global hash is returned.
*/
static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
{
int node = key->both.node;
u32 hash;

- if (node == FUTEX_NO_NODE) {
- struct futex_hash_bucket *hb;
-
- hb = __futex_hash_private(key, fph);
- if (hb)
- return hb;
+ if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (fph && fph->hash_mask) {
+ if (fph_p)
+ *fph_p = fph;
+ return __futex_hash_private(key, fph);
+ }
}

hash = jhash2((u32 *)key,
@@ -1348,7 +1340,8 @@ static void exit_pi_state_list(struct task_struct *curr)
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
if (1) {
- CLASS(hb, hb)(&key);
+ CLASS(hb, hbr)(&key);
+ struct futex_hash_bucket *hb = hbr.hb;

/*
* We can race against put_pi_state() removing itself from the
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 9f6bf6f585fc..37fc944edeb9 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -2,6 +2,7 @@
#ifndef _FUTEX_H
#define _FUTEX_H

+#include "linux/mm_types.h"
#include <linux/futex.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
@@ -222,7 +223,6 @@ extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);

-extern struct futex_hash_bucket *futex_hash(union futex_key *key);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
@@ -237,8 +237,15 @@ static inline struct futex_private_hash *futex_private_hash(void) { return NULL;
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif

-DEFINE_CLASS(hb, struct futex_hash_bucket *,
- if (_T) futex_hash_put(_T),
+struct futex_bucket_ref {
+ struct futex_hash_bucket *hb;
+ struct futex_private_hash *fph;
+};
+
+extern struct futex_bucket_ref futex_hash(union futex_key *key);
+
+DEFINE_CLASS(hb, struct futex_bucket_ref,
+ if (_T.fph) futex_private_hash_put(_T.fph),
futex_hash(key), union futex_key *key);

DEFINE_CLASS(private_hash, struct futex_private_hash *,
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 643199fdbe62..5c227a4d963d 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -945,7 +945,8 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl

retry_private:
if (1) {
- CLASS(hb, hb)(&q.key);
+ CLASS(hb, hbr)(&q.key);
+ struct futex_hash_bucket *hb = hbr.hb;

futex_q_lock(&q, hb);

@@ -1101,9 +1102,9 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
+ CLASS(hb, hbr)(&q.key);
/* Additional reference from futex_unlock_pi() */
- futex_hash_put(hb);
+ futex_hash_put(hbr.hb);
}
goto out;

@@ -1162,7 +1163,8 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if (ret)
return ret;

- CLASS(hb, hb)(&key);
+ CLASS(hb, hbr)(&key);
+ struct futex_hash_bucket *hb = hbr.hb;
spin_lock(&hb->lock);
retry_hb:

diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 1d99a84dc9ad..8ae99b7cb873 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -459,8 +459,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,

retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hb, hbr1)(&key1);
+ CLASS(hb, hbr2)(&key2);
+ struct futex_hash_bucket *hb1 = hbr1.hb;
+ struct futex_hash_bucket *hb2 = hbr2.hb;

futex_hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
@@ -838,7 +840,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
{
- CLASS(hb, hb)(&q.key);
+ CLASS(hb, hbr)(&q.key);
+ struct futex_hash_bucket *hb = hbr.hb;
/* The waiter is still on uaddr1 */
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, to);
@@ -909,9 +912,9 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
BUG();
}
if (q.drop_hb_ref) {
- CLASS(hb, hb)(&q.key);
+ CLASS(hb, hbr)(&q.key);
/* Additional reference from requeue_pi_wake_futex() */
- futex_hash_put(hb);
+ futex_hash_put(hbr.hb);
}

out:
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceed9d879059..8c8e3ae899cb 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -169,7 +169,8 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if ((flags & FLAGS_STRICT) && !nr_wake)
return 0;

- CLASS(hb, hb)(&key);
+ CLASS(hb, hbr)(&key);
+ struct futex_hash_bucket *hb = hbr.hb;

/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -266,8 +267,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,

retry_private:
if (1) {
- CLASS(hb, hb1)(&key1);
- CLASS(hb, hb2)(&key2);
+ CLASS(hb, hbr1)(&key1);
+ CLASS(hb, hbr2)(&key2);
+ struct futex_hash_bucket *hb1 = hbr1.hb;
+ struct futex_hash_bucket *hb2 = hbr2.hb;

double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
@@ -446,7 +449,8 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
u32 val = vs[i].w.val;

if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hb, hbr)(&q->key);
+ struct futex_hash_bucket *hb = hbr.hb;

futex_q_lock(q, hb);
ret = futex_get_value_locked(&uval, uaddr);
@@ -621,7 +625,8 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,

retry_private:
if (1) {
- CLASS(hb, hb)(&q->key);
+ CLASS(hb, hbr)(&q->key);
+ struct futex_hash_bucket *hb = hbr.hb;

futex_q_lock(q, hb);