[PATCH v4 02/11] futex: Add basic infrastructure for local task local hash.
From: Sebastian Andrzej Siewior
Date: Tue Dec 03 2024 - 11:44:06 EST
The futex hashmap is system wide and shared by random tasks. Each slot
is hashed based on its address and VMA. Due to randomized VMAs (and
memory allocations) the same logical lock (pointer) can end up in a
different hash bucket on each invocation of the application. This in
turn means that different applications may share a hash bucket on the
first invocation but not on the second, and it is not always clear which
applications will be involved. This can result in high latencies when
acquiring the futex_hash_bucket::lock, especially if the lock owner is
limited to a CPU and cannot be effectively PI boosted.
Introduce a task local hash map. The hashmap can be allocated via
prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 0)
The `0' argument allocates a default number of 16 slots, a higher number
can be specified if desired. The current upper limit is 131072.
The allocated hashmap is used by all threads within a process.
A thread can check if the private map has been allocated via
prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
which returns the current number of slots, or 0 if no private map has been allocated.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
include/linux/futex.h | 22 ++++++++
include/linux/mm_types.h | 3 ++
include/uapi/linux/prctl.h | 5 ++
kernel/fork.c | 2 +
kernel/futex/core.c | 100 +++++++++++++++++++++++++++++++++++--
kernel/sys.c | 4 ++
6 files changed, 133 insertions(+), 3 deletions(-)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c..61e81b866d34e 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -77,6 +77,16 @@ void futex_exec_release(struct task_struct *tsk);
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+int futex_hash_allocate_default(void);
+void futex_hash_free(struct mm_struct *mm);
+
+/* Start a freshly initialised mm without a private futex hash (called from mm_init()). */
+static inline void futex_mm_init(struct mm_struct *mm)
+{
+ mm->futex_hash_bucket = NULL;
+}
+
#else
static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { }
@@ -88,6 +98,18 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
{
return -EINVAL;
}
+/*
+ * Stubs for the #else branch above: the prctl is rejected with -EINVAL,
+ * the default allocation reports success without doing anything, and
+ * freeing/initialising the per-mm hash are no-ops.
+ */
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
+static inline int futex_hash_allocate_default(void)
+{
+ return 0;
+}
+static inline void futex_hash_free(struct mm_struct *mm) { }
+static inline void futex_mm_init(struct mm_struct *mm) { }
+
#endif
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7361a8f3ab68e..b16b97ab8fb2a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -30,6 +30,7 @@
#define INIT_PASID 0
struct address_space;
+struct futex_hash_bucket;
struct mem_cgroup;
/*
@@ -902,6 +903,8 @@ struct mm_struct {
int mm_lock_seq;
#endif
+ unsigned int futex_hash_mask;
+ struct futex_hash_bucket *futex_hash_bucket;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 5c6080680cb27..55b843644c51a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,9 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
+/* FUTEX hash management */
+#define PR_FUTEX_HASH 77
+# define PR_FUTEX_HASH_SET_SLOTS 1
+# define PR_FUTEX_HASH_GET_SLOTS 2
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1450b461d196a..cda8886f3a1d7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1284,6 +1284,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
+ futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
@@ -1361,6 +1362,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d1d3c7b358b23..fbfe1f1e94505 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -39,6 +39,7 @@
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -107,18 +108,40 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
+/* Tell whether @key describes a private (non-shared) futex. */
+static inline bool futex_key_is_private(union futex_key *key)
+{
+	/*
+	 * get_futex_key() is relied upon to set one of the two bits below
+	 * for every shared futex -- see comment with union futex_key. A key
+	 * with neither bit set is therefore private.
+	 */
+	return (key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)) == 0;
+}
+
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
*
* We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
+ * corresponding hash bucket in the global hash. If the FUTEX is private and
+ * a local hash table is privated then this one is used.
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
+ struct futex_hash_bucket *fhb;
+ u32 hash;
+ fhb = current->mm->futex_hash_bucket;
+ if (fhb && futex_key_is_private(key)) {
+ u32 hash_mask = current->mm->futex_hash_mask;
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
+ return &fhb[hash & hash_mask];
+ }
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
return &futex_queues[hash & (futex_hashsize - 1)];
}
@@ -1131,6 +1154,77 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
spin_lock_init(&fhb->lock);
}
+/*
+ * Release the private futex hash of @mm. Called from __mmput(); the
+ * pointer is still NULL (see futex_mm_init()) if no private hash was ever
+ * allocated -- presumably kvfree() tolerates that, as kfree()/vfree() do.
+ */
+void futex_hash_free(struct mm_struct *mm)
+{
+ kvfree(mm->futex_hash_bucket);
+}
+
+static int futex_hash_allocate(unsigned int hash_slots)
+{
+ struct futex_hash_bucket *fhb;
+ int i;
+
+ if (current->mm->futex_hash_bucket)
+ return -EALREADY;
+
+ if (!thread_group_leader(current))
+ return -EINVAL;
+
+ if (hash_slots == 0)
+ hash_slots = 16;
+ if (hash_slots < 2)
+ hash_slots = 2;
+ if (hash_slots > 131072)
+ hash_slots = 131072;
+ if (!is_power_of_2(hash_slots))
+ hash_slots = rounddown_pow_of_two(hash_slots);
+
+ fhb = kvmalloc_array(hash_slots, sizeof(struct futex_hash_bucket), GFP_KERNEL_ACCOUNT);
+ if (!fhb)
+ return -ENOMEM;
+
+ current->mm->futex_hash_mask = hash_slots - 1;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fhb[i]);
+
+ current->mm->futex_hash_bucket = fhb;
+ return 0;
+}
+
+/*
+ * Allocate the private futex hash with the default number of slots: passes
+ * 0, which futex_hash_allocate() replaces with its default size. Returns
+ * whatever futex_hash_allocate() returns.
+ */
+int futex_hash_allocate_default(void)
+{
+ return futex_hash_allocate(0);
+}
+
+/*
+ * Report the number of slots in the private futex hash of current->mm,
+ * or 0 when no private hash has been allocated.
+ */
+static int futex_hash_get_slots(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	/* The mask is "slots - 1", so add one to get the slot count back. */
+	return mm->futex_hash_bucket ? mm->futex_hash_mask + 1 : 0;
+}
+
+/*
+ * futex_hash_prctl - handle the PR_FUTEX_HASH prctl() sub-commands
+ * @arg2: sub-command, PR_FUTEX_HASH_SET_SLOTS or PR_FUTEX_HASH_GET_SLOTS
+ * @arg3: requested number of slots for PR_FUTEX_HASH_SET_SLOTS
+ * @arg4: currently not looked at
+ * @arg5: currently not looked at
+ *
+ * Return: the result of futex_hash_allocate() for SET_SLOTS, the current
+ * number of slots for GET_SLOTS, -EINVAL for an unknown sub-command.
+ */
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+		     unsigned long arg4, unsigned long arg5)
+{
+	unsigned int slots;
+
+	switch (arg2) {
+	case PR_FUTEX_HASH_SET_SLOTS:
+		/*
+		 * futex_hash_allocate() takes an unsigned int. Saturate
+		 * instead of letting the implicit conversion drop the upper
+		 * bits: otherwise a request such as 1UL << 32 would turn
+		 * into 0 and silently get the default size rather than being
+		 * capped like every other oversized request.
+		 */
+		slots = arg3;
+		if (slots != arg3)
+			slots = ~0U;
+		return futex_hash_allocate(slots);
+
+	case PR_FUTEX_HASH_GET_SLOTS:
+		return futex_hash_get_slots();
+
+	default:
+		return -EINVAL;
+	}
+}
+
static int __init futex_init(void)
{
unsigned int futex_shift;
diff --git a/kernel/sys.c b/kernel/sys.c
index c4c701c6f0b4d..dfa8b1b344edb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2809,6 +2810,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
--
2.45.2