[PATCH v10 04/21] futex: Create hb scopes

From: Sebastian Andrzej Siewior
Date: Wed Mar 12 2025 - 11:17:44 EST

Next message: Sebastian Andrzej Siewior: "[PATCH v10 09/21] futex: Allow automatic allocation of process wide futex hash."
Previous message: Sebastian Andrzej Siewior: "[PATCH v10 06/21] futex: Create helper function to initialize a hash slot."
In reply to: Sebastian Andrzej Siewior: "[PATCH v10 06/21] futex: Create helper function to initialize a hash slot."
Next in thread: Sebastian Andrzej Siewior: "[PATCH v10 09/21] futex: Allow automatic allocation of process wide futex hash."
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

Create explicit scopes for hb variables; almost pure re-indent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
kernel/futex/core.c | 81 ++++----
kernel/futex/pi.c | 282 +++++++++++++-------------
kernel/futex/requeue.c | 433 ++++++++++++++++++++--------------------
kernel/futex/waitwake.c | 193 +++++++++---------
4 files changed, 504 insertions(+), 485 deletions(-)

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 7adc914878933..e4cb5ce9785b1 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -944,7 +944,6 @@ static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;

/*
@@ -957,50 +956,54 @@ static void exit_pi_state_list(struct task_struct *curr)
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
- hb = futex_hash(&key);
+ if (1) {
+ struct futex_hash_bucket *hb;

- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ hb = futex_hash(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);

- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
}

- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);

diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 3bf942e9400ac..62ce5ecaeddd6 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
struct hrtimer_sleeper timeout, *to;
struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
int res, ret;
@@ -939,152 +938,169 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
goto out;

retry_private:
- hb = futex_hash(&q.key);
- futex_q_lock(&q, hb);
+ if (1) {
+ struct futex_hash_bucket *hb;

- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
+ hb = futex_hash(&q.key);
+ futex_q_lock(&q, hb);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
/*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
*/
- futex_q_unlock(hb);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
}
- }

- WARN_ON(!q.pi_state);
+ WARN_ON(!q.pi_state);

- /*
- * Only actually queue now that the atomic ops are done:
- */
- __futex_queue(&q, hb, current);
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb, current);

- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }

- /*
- * Must be done before we enqueue the waiter, here is unfortunately
- * under the hb lock, but that *should* work because it does nothing.
- */
- rt_mutex_pre_schedule();
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();

- rt_mutex_init_waiter(&rt_waiter);
+ rt_mutex_init_waiter(&rt_waiter);

- /*
- * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
- raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);

- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }

- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * must unwind the above, however we canont lock hb->lock because
- * rt_mutex already has a waiter enqueued and hb->lock can itself try
- * and enqueue an rt_waiter through rtlock.
- *
- * Doing the cleanup without holding hb->lock can cause inconsistent
- * state between hb and pi_state, but only in the direction of not
- * seeing a waiter that is leaving.
- *
- * See futex_unlock_pi(), it deals with this inconsistency.
- *
- * There be dragons here, since we must deal with the inconsistency on
- * the way out (here), it is impossible to detect/warn about the race
- * the other way around (missing an incoming waiter).
- *
- * What could possibly go wrong...
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * must unwind the above, however we canont lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;

- /*
- * Now that the rt_waiter has been dequeued, it is safe to use
- * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
- * the
- */
- spin_lock(q.lock_ptr);
- /*
- * Waiter is unqueued.
- */
- rt_mutex_post_schedule();
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the
+ */
+ spin_lock(q.lock_ptr);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_pi_owner(uaddr, &q, !ret);
- /*
- * If fixup_pi_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;

- futex_unqueue_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ goto out;

out_unlock_put_key:
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ goto out;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }

out:
if (to) {
@@ -1092,18 +1108,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
destroy_hrtimer_on_stack(&to->timer);
}
return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- futex_q_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
}

/*
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 0e55975af515c..209794cad6f2f 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -371,7 +371,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);

@@ -443,240 +442,244 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
if (requeue_pi && futex_match(&key1, &key2))
return -EINVAL;

- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- futex_hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
+ if (1) {
+ struct futex_hash_bucket *hb1, *hb2;

- if (likely(cmpval != NULL)) {
- u32 curval;
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);

- ret = futex_get_value_locked(&curval, uaddr1);
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);

- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
+ if (likely(cmpval != NULL)) {
+ u32 curval;

- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
+ ret = futex_get_value_locked(&curval, uaddr1);

- if (!(flags1 & FLAGS_SHARED))
- goto retry_private;
+ if (unlikely(ret)) {
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);

- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;

- if (requeue_pi) {
- struct task_struct *exiting = NULL;
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;

- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- *
- * Updates topwaiter::requeue_state if a top waiter exists.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
-
- /*
- * At this point the top_waiter has either taken uaddr2 or
- * is waiting on it. In both cases pi_state has been
- * established and an initial refcount on it. In case of an
- * error there's nothing.
- *
- * The top waiter's requeue_state is up to date:
- *
- * - If the lock was acquired atomically (ret == 1), then
- * the state is Q_REQUEUE_PI_LOCKED.
- *
- * The top waiter has been dequeued and woken up and can
- * return to user space immediately. The kernel/user
- * space state is consistent. In case that there must be
- * more waiters requeued the WAITERS bit in the user
- * space futex is set so the top waiter task has to go
- * into the syscall slowpath to unlock the futex. This
- * will block until this requeue operation has been
- * completed and the hash bucket locks have been
- * dropped.
- *
- * - If the trylock failed with an error (ret < 0) then
- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
- * happened", or Q_REQUEUE_PI_IGNORE when there was an
- * interleaved early wakeup.
- *
- * - If the trylock did not succeed (ret == 0) then the
- * state is either Q_REQUEUE_PI_IN_PROGRESS or
- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
- * This will be cleaned up in the loop below, which
- * cannot fail because futex_proxy_trylock_atomic() did
- * the same sanity checks for requeue_pi as the loop
- * below does.
- */
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- case 1:
- /*
- * futex_proxy_trylock_atomic() acquired the user space
- * futex. Adjust task_count.
- */
- task_count++;
- ret = 0;
- break;
-
- /*
- * If the above failed, then pi_state is NULL and
- * waiter::requeue_state is correct.
- */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!futex_match(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}

- /* Plain futexes just wake or requeue and are done */
- if (!requeue_pi) {
- if (++task_count <= nr_wake)
- this->wake(&wake_q, this);
- else
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
+
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
+ */
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!futex_match(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
+ */
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
requeue_futex(this, hb1, hb2, &key2);
- continue;
- }
-
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (!futex_match(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
}

/*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- *
- * Prepare the waiter to take the rt_mutex. Take a refcount
- * on the pi_state and store the pointer in the futex_q
- * object of the waiter.
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
*/
- get_pi_state(pi_state);
-
- /* Don't requeue when the waiter is already on the way out. */
- if (!futex_requeue_pi_prepare(this, pi_state)) {
- /*
- * Early woken waiter signaled that it is on the
- * way out. Drop the pi_state reference and try the
- * next waiter. @this->pi_state is still NULL.
- */
- put_pi_state(pi_state);
- continue;
- }
-
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
-
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the refcount
- * on pi_state nor clear this->pi_state because the
- * waiter needs the pi_state for cleaning up the
- * user space value. It will drop the refcount
- * after doing so. this::requeue_state is updated
- * in the wakeup as well.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- task_count++;
- } else if (!ret) {
- /* Waiter is queued, move it to hb2 */
- requeue_futex(this, hb1, hb2, &key2);
- futex_requeue_pi_complete(this, 0);
- task_count++;
- } else {
- /*
- * rt_mutex_start_proxy_lock() detected a potential
- * deadlock when we tried to queue that waiter.
- * Drop the pi_state reference which we took above
- * and remove the pointer to the state from the
- * waiters futex_q object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- futex_requeue_pi_complete(this, ret);
- /*
- * We stop queueing more waiters and let user space
- * deal with the mess.
- */
- break;
- }
- }
-
- /*
- * We took an extra initial reference to the pi_state in
- * futex_proxy_trylock_atomic(). We need to drop it here again.
- */
- put_pi_state(pi_state);
+ put_pi_state(pi_state);

out_unlock:
- double_unlock_hb(hb1, hb2);
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ }
wake_up_q(&wake_q);
- futex_hb_waiters_dec(hb2);
return ret ? ret : task_count;
}

diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 1108f373fd315..4bf839c85b66c 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -253,7 +253,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
DEFINE_WAKE_Q(wake_q);
@@ -266,67 +265,71 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
if (unlikely(ret != 0))
return ret;

- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
+ if (1) {
+ struct futex_hash_bucket *hb1, *hb2;

- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);

- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (futex_match (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
}
- this->wake(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }

- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (futex_match (&this->key, &key2)) {
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
+ }
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match (&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
}
this->wake(&wake_q, this);
- if (++op_ret >= nr_wake2)
+ if (++ret >= nr_wake)
break;
}
}
- ret += op_ret;
- }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }

out_unlock:
- double_unlock_hb(hb1, hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
return ret;
}
@@ -402,7 +405,6 @@ int futex_unqueue_multiple(struct futex_vector *v, int count)
*/
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
- struct futex_hash_bucket *hb;
bool retry = false;
int ret, i;
u32 uval;
@@ -441,21 +443,25 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
struct futex_q *q = &vs[i].q;
u32 val = vs[i].w.val;

- hb = futex_hash(&q->key);
- futex_q_lock(q, hb);
- ret = futex_get_value_locked(&uval, uaddr);
+ if (1) {
+ struct futex_hash_bucket *hb;

- if (!ret && uval == val) {
- /*
- * The bucket lock can't be held while dealing with the
- * next futex. Queue each futex at this moment so hb can
- * be unlocked.
- */
- futex_queue(q, hb, current);
- continue;
+ hb = futex_hash(&q->key);
+ futex_q_lock(q, hb);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb, current);
+ continue;
+ }
+
+ futex_q_unlock(hb);
}
-
- futex_q_unlock(hb);
__set_current_state(TASK_RUNNING);

/*
@@ -584,7 +590,6 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, union futex_key *key2,
struct task_struct *task)
{
- struct futex_hash_bucket *hb;
u32 uval;
int ret;

@@ -612,44 +617,48 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
return ret;

retry_private:
- hb = futex_hash(&q->key);
- futex_q_lock(q, hb);
+ if (1) {
+ struct futex_hash_bucket *hb;

- ret = futex_get_value_locked(&uval, uaddr);
+ hb = futex_hash(&q->key);
+ futex_q_lock(q, hb);

- if (ret) {
- futex_q_unlock(hb);
+ ret = futex_get_value_locked(&uval, uaddr);

- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
+ if (ret) {
+ futex_q_unlock(hb);

- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;

- goto retry;
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(hb);
+ return -EWOULDBLOCK;
+ }
+
+ if (key2 && futex_match(&q->key, key2)) {
+ futex_q_unlock(hb);
+ return -EINVAL;
+ }
+
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ if (task == current)
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb, task);
}

- if (uval != val) {
- futex_q_unlock(hb);
- return -EWOULDBLOCK;
- }
-
- if (key2 && futex_match(&q->key, key2)) {
- futex_q_unlock(hb);
- return -EINVAL;
- }
-
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * futex_queue() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- if (task == current)
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- futex_queue(q, hb, task);
-
return ret;
}

--
2.47.2

Next message: Sebastian Andrzej Siewior: "[PATCH v10 09/21] futex: Allow automatic allocation of process wide futex hash."
Previous message: Sebastian Andrzej Siewior: "[PATCH v10 06/21] futex: Create helper function to initialize a hash slot."
In reply to: Sebastian Andrzej Siewior: "[PATCH v10 06/21] futex: Create helper function to initialize a hash slot."
Next in thread: Sebastian Andrzej Siewior: "[PATCH v10 09/21] futex: Allow automatic allocation of process wide futex hash."
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]