Re: [PATCH v11 1/7] locking/mutex: Remove wakeups from under mutex::wait_lock

From: K Prateek Nayak
Date: Wed Jul 10 2024 - 13:44:04 EST


Hello John,

On 7/10/2024 2:01 AM, John Stultz wrote:
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

In preparation to nest mutex::wait_lock under rq::lock, we need to
remove wakeups from under it.

Cc: Joel Fernandes <joelaf@xxxxxxxxxx>
Cc: Qais Yousef <qyousef@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Zimuzo Ezeozue <zezeozue@xxxxxxxxxx>
Cc: Youssef Esmat <youssefesmat@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Will Deacon <will@xxxxxxxxxx>
Cc: Waiman Long <longman@xxxxxxxxxx>
Cc: Boqun Feng <boqun.feng@xxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
Cc: Metin Kaya <Metin.Kaya@xxxxxxx>
Cc: Xuewen Yan <xuewen.yan94@xxxxxxxxx>
Cc: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
[Heavily changed after 55f036ca7e74 ("locking: WW mutex cleanup") and
08295b3b5bee ("locking: Implement an algorithm choice for Wound-Wait
mutexes")]
Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx>
[jstultz: rebased to mainline, added extra wake_up_q & init
to avoid hangs, similar to Connor's rework of this patch]
Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
Tested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Tested-by: Metin Kaya <metin.kaya@xxxxxxx>
Acked-by: Davidlohr Bueso <dave@xxxxxxxxxxxx>
Reviewed-by: Metin Kaya <metin.kaya@xxxxxxx>
---
v5:
* Reverted to an earlier version of this patch to undo
the change that kept the wake_q in the ctx structure, as
that broke the rule that the wake_q must always be on the
stack, since it's not safe for concurrency.
v6:
* Made tweaks suggested by Waiman Long
v7:
* Fixups to pass wake_qs down for PREEMPT_RT logic
v10:
* Moved the preempt_enable() lower, closer to the unlock, as
suggested by Valentin
* Added additional preempt_disable() coverage around the wake_q
calls, as also noted by Valentin
---
kernel/locking/mutex.c | 17 +++++++++++++----
kernel/locking/rtmutex.c | 30 +++++++++++++++++++++---------
kernel/locking/rwbase_rt.c | 8 +++++++-
kernel/locking/rwsem.c | 4 ++--
kernel/locking/spinlock_rt.c | 3 ++-
kernel/locking/ww_mutex.h | 29 ++++++++++++++++++-----------
6 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cbae8c0b89ab..4269da1f3ef5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -575,6 +575,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
+ DEFINE_WAKE_Q(wake_q);
struct mutex_waiter waiter;
struct ww_mutex *ww;
int ret;
@@ -625,7 +626,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
*/
if (__mutex_trylock(lock)) {
if (ww_ctx)
- __ww_mutex_check_waiters(lock, ww_ctx);
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
goto skip_wait;
}
@@ -645,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* Add in stamp order, waking up waiters that must kill
* themselves.
*/
- ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q);
if (ret)
goto err_early_kill;
}
@@ -681,6 +682,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}
raw_spin_unlock(&lock->wait_lock);
+ /* Make sure we do wakeups before calling schedule */
+ if (!wake_q_empty(&wake_q)) {

nit.

This check seems unnecessary (to my untrained eye). Any harm in
skipping it and simply doing a wake_up_q() followed by wake_q_init()
unconditionally?

+ wake_up_q(&wake_q);
+ wake_q_init(&wake_q);
+ }
schedule_preempt_disabled();
first = __mutex_waiter_is_first(lock, &waiter);
@@ -714,7 +720,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
*/
if (!ww_ctx->is_wait_die &&
!__mutex_waiter_is_first(lock, &waiter))
- __ww_mutex_check_waiters(lock, ww_ctx);
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
}
__mutex_remove_waiter(lock, &waiter);
@@ -730,6 +736,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
ww_mutex_lock_acquired(ww, ww_ctx);
raw_spin_unlock(&lock->wait_lock);
+ wake_up_q(&wake_q);
preempt_enable();
return 0;
@@ -741,6 +748,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
+ wake_up_q(&wake_q);
preempt_enable();
return ret;
}
@@ -951,9 +959,10 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
+ preempt_disable();
raw_spin_unlock(&lock->wait_lock);
-
wake_up_q(&wake_q);
+ preempt_enable();
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..7a85d9bfa972 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -34,13 +34,15 @@
static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
struct rt_mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
return 0;
}
static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
{
}
@@ -1207,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
+ DEFINE_WAKE_Q(wake_q);
lockdep_assert_held(&lock->wait_lock);
@@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
/* Check whether the waiter should back out immediately */
rtm = container_of(lock, struct rt_mutex, rtmutex);
- res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ preempt_disable();
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, &wake_q);
+ wake_up_q(&wake_q);
+ preempt_enable();

I'm trying to understand this - we enter task_blocks_on_rt_mutex() with
"wait_lock" held (I believe the lockdep_assert_held() in the previous
hunk checks for the same). I walked down the call chain (although
briefly) and could only spot "task->pi_lock" being locked and unlocked
before this call to "wake_up_q()", but the "wait_lock" seems to be held
throughout, only being unlocked and locked again around
"rt_mutex_adjust_prio_chain()" further down.

Did I miss something, or is disabling preemption for this specific hunk
enough to enable safe nesting?
--
Thanks and Regards,
Prateek

if (res) {
raw_spin_lock(&task->pi_lock);
rt_mutex_dequeue(lock, waiter);
[..snip..]