Re: [PATCH v11 00/19] futex: Add support for task local hash maps, FUTEX2_NUMA and FUTEX2_MPOL

From: Sebastian Andrzej Siewior
Date: Mon Apr 07 2025 - 12:12:39 EST


On 2025-04-07 17:57:23 [+0200], To linux-kernel@xxxxxxxxxxxxxxx wrote:
> This is the local hash map series with PeterZ's FUTEX2_NUMA and
> FUTEX2_MPOL plus a few fixes on top.

> v10…v11: https://lore.kernel.org/all/20250312151634.2183278-1-bigeasy@xxxxxxxxxxxxx
> - PeterZ's fixups: changes to the local hash series have been folded
> into the earlier patches, so things are no longer first added and
> then renamed or functionally changed later in the series.

For easier comparison, here is a diff against v10, excluding patches 17+.
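
(For anyone who wants to poke at it, the user-facing side is the prctl()
whose number moves in the uapi hunk below; a minimal userspace sketch,
with slot counts being powers of two per the series and unused prctl()
arguments passed as 0:)

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_FUTEX_HASH
  #define PR_FUTEX_HASH                 78
  # define PR_FUTEX_HASH_SET_SLOTS      1
  # define PR_FUTEX_HASH_GET_SLOTS      2
  #endif

  int main(void)
  {
          /* request a private futex hash with 16 slots */
          if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0) < 0)
                  perror("PR_FUTEX_HASH_SET_SLOTS");

          /* returns the slot count, 0 while the global hash is used */
          printf("slots: %d\n",
                 prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0));
          return 0;
  }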

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 19c37afa0432a..ee48dcfbfe59d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,11 +4,11 @@

#include <linux/sched.h>
#include <linux/ktime.h>
+#include <linux/mm_types.h>

#include <uapi/linux/futex.h>

struct inode;
-struct mm_struct;
struct task_struct;

/*
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 09c3e3e33f1f8..de95794777ad6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -168,12 +168,14 @@ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_m
int node, const void *caller) __alloc_size(1);
#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__))

-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
-#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__))
-
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1);
#define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__))

+static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+{
+ return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE);
+}
+
extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 55b843644c51a..2344c1feaa4e3 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -354,7 +354,7 @@ struct prctl_mm_map {
#define PR_LOCK_SHADOW_STACK_STATUS 76

/* FUTEX hash management */
-#define PR_FUTEX_HASH 77
+#define PR_FUTEX_HASH 78
# define PR_FUTEX_HASH_SET_SLOTS 1
# define PR_FUTEX_HASH_GET_SLOTS 2

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 65523f3cfe32e..2f2a92c791def 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -124,6 +124,10 @@ late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAIL_FUTEX */

+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
static inline bool futex_key_is_private(union futex_key *key)
{
/*
@@ -133,10 +137,43 @@ static inline bool futex_key_is_private(union futex_key *key)
return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}

-static struct futex_hash_bucket *
-__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+bool futex_private_hash_get(struct futex_private_hash *fph)
+{
+ return rcuref_get(&fph->users);
+}
+
+void futex_private_hash_put(struct futex_private_hash *fph)
+{
+ /* Ignore return value, last put is verified via rcuref_is_dead() */
+ if (rcuref_put(&fph->users))
+ wake_up_var(fph->mm);
+}
+
+/**
+ * futex_hash_get - Get an additional reference for the local hash.
+ * @hb: ptr to the private local hash.
+ *
+ * Obtain an additional reference for the already obtained hash bucket. The
+ * caller must already own a reference.
+ */
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ futex_private_hash_put(fph);
+}

-#ifdef CONFIG_FUTEX_PRIVATE_HASH
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
@@ -210,13 +247,12 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
}
rcu_assign_pointer(mm->futex_phash, new);
kvfree_rcu(fph, rcu);
- wake_up_var(mm);
return true;
}

static void futex_pivot_hash(struct mm_struct *mm)
{
- scoped_guard (mutex, &mm->futex_hash_lock) {
+ scoped_guard(mutex, &mm->futex_hash_lock) {
struct futex_private_hash *fph;

fph = mm->futex_phash_new;
@@ -255,28 +291,13 @@ struct futex_private_hash *futex_private_hash(void)
goto again;
}

-bool futex_private_hash_get(struct futex_private_hash *fph)
-{
- return rcuref_get(&fph->users);
-}
-
-void futex_private_hash_put(struct futex_private_hash *fph)
-{
- /*
- * Ignore the result; the DEAD state is picked up
- * when rcuref_get() starts failing via rcuref_is_dead().
- */
- if (rcuref_put(&fph->users))
- wake_up_var(fph->mm);
-}
-
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
struct futex_private_hash *fph;
struct futex_hash_bucket *hb;

again:
- scoped_guard (rcu) {
+ scoped_guard(rcu) {
hb = __futex_hash(key, NULL);
fph = hb->priv;

@@ -287,27 +308,9 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
goto again;
}

-void futex_hash_get(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- WARN_ON_ONCE(!futex_private_hash_get(fph));
-}
-
-void futex_hash_put(struct futex_hash_bucket *hb)
-{
- struct futex_private_hash *fph = hb->priv;
-
- if (!fph)
- return;
- futex_private_hash_put(fph);
-}
-
#else /* !CONFIG_FUTEX_PRIVATE_HASH */

-static inline struct futex_hash_bucket *
+static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
return NULL;
@@ -388,12 +391,15 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_FUTEX_MPOL */

/**
- * futex_hash - Return the hash bucket in the global hash
+ * __futex_hash - Return the hash bucket
* @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
*
* We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash. If the FUTEX is private and
- * a local hash table is privated then this one is used.
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if one exists. Otherwise a hash bucket from the
+ * global hash is returned.
*/
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
@@ -1522,10 +1528,10 @@ static bool futex_pivot_pending(struct mm_struct *mm)
guard(rcu)();

if (!mm->futex_phash_new)
- return false;
+ return true;

fph = rcu_dereference(mm->futex_phash);
- return !rcuref_read(&fph->users);
+ return rcuref_is_dead(&fph->users);
}

static bool futex_hash_less(struct futex_private_hash *a,
@@ -1564,7 +1570,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
/*
* Once we've disabled the global hash there is no way back.
*/
- scoped_guard (rcu) {
+ scoped_guard(rcu) {
fph = rcu_dereference(mm->futex_phash);
if (fph && !fph->hash_mask) {
if (custom)
@@ -1631,7 +1637,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
if (new) {
/*
* Will set mm->futex_phash_new on failure;
- * futex_get_private_hash() will try again.
+ * futex_private_hash_get() will try again.
*/
if (!__futex_pivot_hash(mm, new) && custom)
goto again;
@@ -1681,7 +1687,7 @@ static int futex_hash_get_slots(void)

guard(rcu)();
fph = rcu_dereference(current->mm->futex_phash);
- if (fph)
+ if (fph && fph->hash_mask)
return fph->hash_mask + 1;
return 0;
}
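
(Not part of the diff: the net effect of the core.c fold is that both
ends of the resize handshake now sit next to each other under
CONFIG_FUTEX_PRIVATE_HASH. The pattern, sketched; the wait_var_event()
side is assumed from the rest of the series:)

  /* lookup side: pin the current private hash under RCU; rcuref_get()
   * fails once the final reference is gone (rcuref_is_dead()), which
   * is the cue to pivot to mm->futex_phash_new and retry: */
  if (!futex_private_hash_get(fph))
          futex_pivot_hash(mm);

  /* unpin: only the final put wakes the resizer, now done in
   * futex_private_hash_put() instead of __futex_pivot_hash(): */
  if (rcuref_put(&fph->users))
          wake_up_var(fph->mm);

  /* resize side: sleep until either nothing is queued or the old hash
   * is dead and can be replaced: */
  wait_var_event(mm, futex_pivot_pending(mm));
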
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 52e9c0c4b6c87..004e4dbee4f93 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -222,7 +222,6 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);

extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-
#ifdef CONFIG_FUTEX_PRIVATE_HASH
extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
@@ -234,15 +233,8 @@ extern void futex_private_hash_put(struct futex_private_hash *fph);
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
-
-static inline struct futex_private_hash *futex_private_hash(void)
-{
- return NULL;
-}
-static inline bool futex_private_hash_get(struct futex_private_hash *fph)
-{
- return false;
-}
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline bool futex_private_hash_get(void) { return false; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif

diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 51c69e8808152..356e52c17d3c5 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1042,7 +1042,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
cleanup:
/*
* If we failed to acquire the lock (deadlock/signal/timeout), we must
- * must unwind the above, however we canont lock hb->lock because
+ * unwind the above, however we cannot lock hb->lock because
* rt_mutex already has a waiter enqueued and hb->lock can itself try
* and enqueue an rt_waiter through rtlock.
*
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 74647f6bf75de..bd8fef0f8d180 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -297,7 +297,7 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
}

plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (futex_match (&this->key, &key1)) {
+ if (futex_match(&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
@@ -311,7 +311,7 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
if (op_ret > 0) {
op_ret = 0;
plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (futex_match (&this->key, &key2)) {
+ if (futex_match(&this->key, &key2)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
@@ -385,7 +385,7 @@ int futex_unqueue_multiple(struct futex_vector *v, int count)
}

/**
- * __futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
* @vs: The futex list to wait on
* @count: The size of the list
* @woken: Index of the last woken futex, if any. Used to notify the
diff --git a/mm/nommu.c b/mm/nommu.c
index d04e601a8f4d7..aed58ea7398db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -207,11 +207,22 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);

-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
-
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and the HUGE page attribute
+ * are ignored.
+ */
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
- return vmalloc_huge_noprof(size, gfp_mask);
+ return __vmalloc_noprof(size, gfp_mask);
}

/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69247b46413ca..0e2c49aaf84f1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3947,9 +3947,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);

/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3958,20 +3959,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
-{
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
-
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);

/**
* vzalloc - allocate virtually contiguous memory with zero fill
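
(Also not part of the diff, just to spell out the vmalloc API shuffle:
vmalloc_huge() survives as a static inline wrapper around the node-aware
variant, so existing callers are unchanged while node-local,
huge-page-backed allocations become possible. Illustrative sketch, not a
call site from the series:)

  #include <linux/vmalloc.h>

  static void *table_alloc(size_t size, int node)
  {
          /* for node == NUMA_NO_NODE this is exactly what the new
           * inline vmalloc_huge(size, GFP_KERNEL) expands to */
          return vmalloc_huge_node(size, GFP_KERNEL, node);
  }

Freeing stays plain vfree(), as before.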