[PATCH v4 05/11] futex: Track the futex hash bucket.
From: Sebastian Andrzej Siewior
Date: Tue Dec 03 2024 - 11:44:59 EST
Add futex_hash_get/put() to keep the assigned hash bucket around while a
futex operation is performed. Give futex_hash_bucket_private an RCU
lifetime guarantee.
The gets and puts should now be balanced so that the private hash bucket
is released on exit. This is preparatory work to allow changing the hash
bucket at runtime.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
include/linux/futex.h | 2 +-
include/linux/mm_types.h | 5 +-
kernel/futex/core.c | 104 +++++++++++++++++++++++++++++++++------
kernel/futex/futex.h | 8 +++
kernel/futex/pi.c | 7 +++
kernel/futex/requeue.c | 16 ++++++
kernel/futex/waitwake.c | 15 +++++-
7 files changed, 136 insertions(+), 21 deletions(-)
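Not part of the patch, just a minimal sketch of the caller pattern this
introduces (assuming a private key and an assigned private hash):
futex_hash() now takes a reference on the returned hash bucket, which the
caller drops with futex_hash_put() once the operation is done.

    struct futex_hash_bucket *hb;

    hb = futex_hash(&key);      /* takes a reference on the private hash */
    spin_lock(&hb->lock);
    /* ... queue / wake work under hb->lock ... */
    spin_unlock(&hb->lock);
    futex_hash_put(hb);         /* last put frees it via kvfree_rcu() */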
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 61e81b866d34e..359fc24eb37ff 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -84,7 +84,7 @@ void futex_hash_free(struct mm_struct *mm);
static inline void futex_mm_init(struct mm_struct *mm)
{
- mm->futex_hash_bucket = NULL;
+ rcu_assign_pointer(mm->futex_hash_bucket, NULL);
}
#else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b16b97ab8fb2a..4f39928631042 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -30,7 +30,7 @@
#define INIT_PASID 0
struct address_space;
-struct futex_hash_bucket;
+struct futex_hash_bucket_private;
struct mem_cgroup;
/*
@@ -903,8 +903,7 @@ struct mm_struct {
int mm_lock_seq;
#endif
- unsigned int futex_hash_mask;
- struct futex_hash_bucket *futex_hash_bucket;
+ struct futex_hash_bucket_private __rcu *futex_hash_bucket;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 14251bbafaffb..464918d85395e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -40,6 +40,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
+#include <linux/rcuref.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -56,6 +57,12 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
+struct futex_hash_bucket_private {
+ rcuref_t users;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -127,17 +134,24 @@ static inline bool futex_key_is_private(union futex_key *key)
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- struct futex_hash_bucket *fhb;
+ struct futex_hash_bucket_private *hb_p = NULL;
u32 hash;
- fhb = current->mm->futex_hash_bucket;
- if (fhb && futex_key_is_private(key)) {
- u32 hash_mask = current->mm->futex_hash_mask;
+ if (futex_key_is_private(key)) {
+ guard(rcu)();
+
+ do {
+ hb_p = rcu_dereference(current->mm->futex_hash_bucket);
+ } while (hb_p && !rcuref_get(&hb_p->users));
+ }
+
+ if (hb_p) {
+ u32 hash_mask = hb_p->hash_mask;
hash = jhash2((void *)&key->private.address,
sizeof(key->private.address) / 4,
key->both.offset);
- return &fhb[hash & hash_mask];
+ return &hb_p->queues[hash & hash_mask];
}
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / 4,
@@ -145,6 +159,35 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
return &futex_queues[hash & (futex_hashsize - 1)];
}
+static void futex_hash_priv_put(struct futex_hash_bucket_private *hb_p)
+{
+ if (rcuref_put(&hb_p->users))
+ kvfree_rcu(hb_p, rcu);
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_hash_bucket_private *hb_p;
+
+ if (hb->hb_slot == 0)
+ return;
+ hb_p = container_of(hb, struct futex_hash_bucket_private,
+ queues[hb->hb_slot - 1]);
+ futex_hash_priv_put(hb_p);
+}
+
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_hash_bucket_private *hb_p;
+
+ if (hb->hb_slot == 0)
+ return;
+
+ hb_p = container_of(hb, struct futex_hash_bucket_private,
+ queues[hb->hb_slot - 1]);
+ /* The ref needs to be owned by the caller so this can't fail */
+ WARN_ON_ONCE(!rcuref_get(&hb_p->users));
+}
/**
* futex_setup_timer - set up the sleeping hrtimer.
@@ -599,7 +642,10 @@ int futex_unqueue(struct futex_q *q)
*/
lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
+ struct futex_hash_bucket *hb;
+
spin_lock(lock_ptr);
+ hb = futex_hb_from_futex_q(q);
/*
* q->lock_ptr can change between reading it and
* spin_lock(), causing us to take the wrong lock. This
@@ -622,6 +668,7 @@ int futex_unqueue(struct futex_q *q)
BUG_ON(q->pi_state);
spin_unlock(lock_ptr);
+ futex_hash_put(hb);
ret = 1;
}
@@ -999,6 +1046,7 @@ static void exit_pi_state_list(struct task_struct *curr)
if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
+ futex_hash_put(hb);
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
@@ -1015,6 +1063,7 @@ static void exit_pi_state_list(struct task_struct *curr)
/* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
put_pi_state(pi_state);
continue;
}
@@ -1027,6 +1076,7 @@ static void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1147,8 +1197,9 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, unsigned int slot)
{
+ fhb->hb_slot = slot;
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
@@ -1156,12 +1207,20 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
void futex_hash_free(struct mm_struct *mm)
{
- kvfree(mm->futex_hash_bucket);
+ struct futex_hash_bucket_private *hb_p;
+
+ /* own a reference */
+ hb_p = rcu_dereference_check(mm->futex_hash_bucket, true);
+ if (!hb_p)
+ return;
+ WARN_ON(rcuref_read(&hb_p->users) != 1);
+ futex_hash_priv_put(hb_p);
}
static int futex_hash_allocate(unsigned int hash_slots)
{
- struct futex_hash_bucket *fhb;
+ struct futex_hash_bucket_private *hb_p;
+ size_t alloc_size;
int i;
if (current->mm->futex_hash_bucket)
@@ -1179,16 +1238,25 @@ static int futex_hash_allocate(unsigned int hash_slots)
if (!is_power_of_2(hash_slots))
hash_slots = rounddown_pow_of_two(hash_slots);
- fhb = kvmalloc_array(hash_slots, sizeof(struct futex_hash_bucket), GFP_KERNEL_ACCOUNT);
- if (!fhb)
+ if (unlikely(check_mul_overflow(hash_slots, sizeof(struct futex_hash_bucket),
+ &alloc_size)))
return -ENOMEM;
- current->mm->futex_hash_mask = hash_slots - 1;
+ if (unlikely(check_add_overflow(alloc_size, sizeof(struct futex_hash_bucket_private),
+ &alloc_size)))
+ return -ENOMEM;
+
+ hb_p = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!hb_p)
+ return -ENOMEM;
+
+ rcuref_init(&hb_p->users, 1);
+ hb_p->hash_mask = hash_slots - 1;
for (i = 0; i < hash_slots; i++)
- futex_hash_bucket_init(&fhb[i]);
+ futex_hash_bucket_init(&hb_p->queues[i], i + 1);
- current->mm->futex_hash_bucket = fhb;
+ rcu_assign_pointer(current->mm->futex_hash_bucket, hb_p);
return 0;
}
@@ -1199,8 +1267,12 @@ int futex_hash_allocate_default(void)
static int futex_hash_get_slots(void)
{
- if (current->mm->futex_hash_bucket)
- return current->mm->futex_hash_mask + 1;
+ struct futex_hash_bucket_private *hb_p;
+
+ guard(rcu)();
+ hb_p = rcu_dereference(current->mm->futex_hash_bucket);
+ if (hb_p)
+ return hb_p->hash_mask + 1;
return 0;
}
@@ -1243,7 +1315,7 @@ static int __init futex_init(void)
futex_hashsize = 1UL << futex_shift;
for (i = 0; i < futex_hashsize; i++)
- futex_hash_bucket_init(&futex_queues[i]);
+ futex_hash_bucket_init(&futex_queues[i], 0);
return 0;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 618ce1fe870e9..ceea260ad9e80 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -115,6 +115,7 @@ static inline bool should_fail_futex(bool fshared)
*/
struct futex_hash_bucket {
atomic_t waiters;
+ unsigned int hb_slot;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
@@ -202,6 +203,13 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+extern void futex_hash_put(struct futex_hash_bucket *hb);
+extern void futex_hash_get(struct futex_hash_bucket *hb);
+
+static inline struct futex_hash_bucket *futex_hb_from_futex_q(struct futex_q *q)
+{
+ return container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+}
/**
* futex_match - Check whether two futex keys are equal
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index d62cca5ed8f4c..60a62ab250b08 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -964,6 +964,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
* - EAGAIN: The user space value changed.
*/
futex_q_unlock(hb);
+ futex_hash_put(hb);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
@@ -1083,10 +1084,12 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
+ futex_hash_put(hb);
goto out;
out_unlock_put_key:
futex_q_unlock(hb);
+ futex_hash_put(hb);
out:
if (to) {
@@ -1097,6 +1100,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
uaddr_faulted:
futex_q_unlock(hb);
+ futex_hash_put(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
@@ -1197,6 +1201,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
/* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
@@ -1236,6 +1241,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
*/
if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
switch (ret) {
case -EFAULT:
goto pi_faulted;
@@ -1256,6 +1262,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
out_unlock:
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
return ret;
pi_retry:
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index b47bb764b3520..39e96f1bef8ce 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ futex_hash_put(hb1);
+ futex_hash_get(hb2);
}
q->key = *key2;
}
@@ -231,8 +233,10 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
+ futex_hash_put(futex_hb_from_futex_q(q));
q->lock_ptr = &hb->lock;
+ futex_hash_get(hb);
/* Signal locked state to the waiter */
futex_requeue_pi_complete(q, 1);
@@ -458,6 +462,8 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -544,6 +550,8 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
case -EFAULT:
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
@@ -558,6 +566,8 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
*/
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
@@ -677,6 +687,8 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
futex_hb_waiters_dec(hb2);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
return ret ? ret : task_count;
}
@@ -815,6 +827,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
if (futex_match(&q.key, &key2)) {
futex_q_unlock(hb);
+ futex_hash_put(hb);
ret = -EINVAL;
goto out;
}
@@ -828,6 +841,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, to);
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
break;
case Q_REQUEUE_PI_LOCKED:
@@ -847,6 +861,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
ret = ret < 0 ? ret : 0;
}
+ futex_hash_put(futex_hb_from_futex_q(&q));
break;
case Q_REQUEUE_PI_DONE:
@@ -876,6 +891,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr);
+ futex_hash_put(futex_hb_from_futex_q(&q));
if (ret == -EINTR) {
/*
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 3a10375d95218..1f2d11eb7f89f 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -113,6 +113,8 @@ bool __futex_wake_mark(struct futex_q *q)
return false;
__futex_unqueue(q);
+ /* Waiters reference */
+ futex_hash_put(futex_hb_from_futex_q(q));
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
* is written, without taking any locks. This is possible in the event
@@ -173,8 +175,10 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
hb = futex_hash(&key);
/* Make sure we really have tasks to wakeup */
- if (!futex_hb_waiters_pending(hb))
+ if (!futex_hb_waiters_pending(hb)) {
+ futex_hash_put(hb);
return ret;
+ }
spin_lock(&hb->lock);
@@ -196,6 +200,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
+ futex_hash_put(hb);
wake_up_q(&wake_q);
return ret;
}
@@ -275,6 +280,8 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
double_unlock_hb(hb1, hb2);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
if (!IS_ENABLED(CONFIG_MMU) ||
unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
@@ -329,6 +336,8 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
out_unlock:
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
+ futex_hash_put(hb1);
+ futex_hash_put(hb2);
return ret;
}
@@ -466,6 +475,8 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
}
futex_q_unlock(hb);
+ futex_hash_put(hb);
+
__set_current_state(TASK_RUNNING);
/*
@@ -625,6 +636,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
if (ret) {
futex_q_unlock(*hb);
+ futex_hash_put(*hb);
ret = get_user(uval, uaddr);
if (ret)
@@ -638,6 +650,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
if (uval != val) {
futex_q_unlock(*hb);
+ futex_hash_put(*hb);
ret = -EWOULDBLOCK;
}
--
2.45.2