Re: [RFC PATCH] futex: Dynamically allocate futex_queues depending on nr_node_ids

From: Peter Zijlstra

Date: Fri Feb 27 2026 - 11:23:51 EST


On Fri, Feb 27, 2026 at 08:29:03PM +0530, K Prateek Nayak wrote:

> > Both will result in at least one extra deref/cacheline for each futex
> > op, no?
>
> Ack but I was wondering if that penalty can be offset by the fact that
> we no longer need to look at "nr_node_ids" in a separate cacheline?
>
> I ran futex bench enough times before posting to come to the conclusion
> that there isn't any noticeable regression - the numbers swung either
> way and I just took one set for comparison.
>
> Sebastian and I have been having a more philosophical discussion on that
> CONFIG_NODES_SHIFT default but I guess as far as this patch is concerned,
> the conclusion is we want to avoid an extra dereference in the fast-path
> at the cost of a little bit extra space?

Ooh, I just remembered, I've always wanted to apply Linus' runtime-const
stuff to the futex thing.

Something like the below. But I'm not sure if it actually makes a
difference these days :/

But that can surely fix up the extra deref.

---
diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h
index e5a13dc8816e..b356b3ae8d3b 100644
--- a/arch/x86/include/asm/runtime-const.h
+++ b/arch/x86/include/asm/runtime-const.h
@@ -41,6 +41,15 @@
:"+r" (__ret)); \
__ret; })

+#define runtime_const_mask_32(val, sym) ({ \
+ typeof(0u+(val)) __ret = (val); \
+ asm_inline("and $0x12345678, %k0\n1:\n" \
+ ".pushsection runtime_mask_" #sym ",\"a\"\n\t"\
+ ".long 1b - 4 - .\n" \
+ ".popsection" \
+ : "+r" (__ret)); \
+ __ret; })
+
#define runtime_const_init(type, sym) do { \
extern s32 __start_runtime_##type##_##sym[]; \
extern s32 __stop_runtime_##type##_##sym[]; \
@@ -65,6 +74,11 @@ static inline void __runtime_fixup_shift(void *where, unsigned long val)
*(unsigned char *)where = val;
}

+static inline void __runtime_fixup_mask(void *where, unsigned long val)
+{
+ *(unsigned int *)where = val;
+}
+
static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
unsigned long val, s32 *start, s32 *end)
{
diff --git a/include/asm-generic/runtime-const.h b/include/asm-generic/runtime-const.h
index 670499459514..03e6e3e02401 100644
--- a/include/asm-generic/runtime-const.h
+++ b/include/asm-generic/runtime-const.h
@@ -10,6 +10,7 @@
*/
#define runtime_const_ptr(sym) (sym)
#define runtime_const_shift_right_32(val, sym) ((u32)(val)>>(sym))
+#define runtime_const_mask_32(val, sym) ((u32)(val)&(sym))
#define runtime_const_init(type,sym) do { } while (0)

#endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index eeb070f330bd..c3acaa94b970 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -973,7 +973,10 @@
RUNTIME_CONST(shift, d_hash_shift) \
RUNTIME_CONST(ptr, dentry_hashtable) \
RUNTIME_CONST(ptr, __dentry_cache) \
- RUNTIME_CONST(ptr, __names_cache)
+ RUNTIME_CONST(ptr, __names_cache) \
+ RUNTIME_CONST(shift, __futex_shift) \
+ RUNTIME_CONST(mask, __futex_mask) \
+ RUNTIME_CONST(ptr, __futex_queues)

/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
#define KUNIT_TABLE() \
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cf7e610eac42..8b58d9035e3a 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -54,14 +54,17 @@
* reside in the same cacheline.
*/
static struct {
- unsigned long hashmask;
- unsigned int hashshift;
struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));

-#define futex_hashmask (__futex_data.hashmask)
-#define futex_hashshift (__futex_data.hashshift)
-#define futex_queues (__futex_data.queues)
+static u32 __futex_mask;
+static u32 __futex_shift;
+static struct futex_hash_bucket **__futex_queues;
+
+static __always_inline struct futex_hash_bucket **futex_queues(void)
+{
+ return runtime_const_ptr(__futex_queues);
+}

struct futex_private_hash {
int state;
@@ -439,14 +442,14 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
* NOTE: this isn't perfectly uniform, but it is fast and
* handles sparse node masks.
*/
- node = (hash >> futex_hashshift) % nr_node_ids;
+ node = runtime_const_shift_right_32(hash, __futex_shift) % nr_node_ids;
if (!node_possible(node)) {
node = find_next_bit_wrap(node_possible_map.bits,
nr_node_ids, node);
}
}

- return &futex_queues[node][hash & futex_hashmask];
+ return &futex_queues()[node][runtime_const_mask_32(hash, __futex_mask)];
}

/**
@@ -1913,7 +1916,7 @@ int futex_hash_allocate_default(void)
* 16 <= threads * 4 <= global hash size
*/
buckets = roundup_pow_of_two(4 * threads);
- buckets = clamp(buckets, 16, futex_hashmask + 1);
+ buckets = clamp(buckets, 16, __futex_mask + 1);

if (current_buckets >= buckets)
return 0;
@@ -1983,10 +1986,17 @@ static int __init futex_init(void)
hashsize = max(4, hashsize);
hashsize = roundup_pow_of_two(hashsize);
#endif
- futex_hashshift = ilog2(hashsize);
+ __futex_mask = hashsize - 1;
+ __futex_shift = ilog2(hashsize);
size = sizeof(struct futex_hash_bucket) * hashsize;
order = get_order(size);

+ void *__futex_queues = &__futex_data.queues;
+
+ runtime_const_init(shift, __futex_shift);
+ runtime_const_init(mask, __futex_mask);
+ runtime_const_init(ptr, __futex_queues);
+
for_each_node(n) {
struct futex_hash_bucket *table;

@@ -2000,10 +2010,9 @@ static int __init futex_init(void)
for (i = 0; i < hashsize; i++)
futex_hash_bucket_init(&table[i], NULL);

- futex_queues[n] = table;
+ futex_queues()[n] = table;
}

- futex_hashmask = hashsize - 1;
pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
order > MAX_PAGE_ORDER ? "vmalloc" : "linear");