[PATCH v2] rtmutex: take the waiter lock with irqs off

From: Sebastian Andrzej Siewior
Date: Fri Nov 15 2013 - 15:14:54 EST


Mike Galbraith captured the following:
| >#11 [ffff88017b243e90] _raw_spin_lock at ffffffff815d2596
| >#12 [ffff88017b243e90] rt_mutex_trylock at ffffffff815d15be
| >#13 [ffff88017b243eb0] get_next_timer_interrupt at ffffffff81063b42
| >#14 [ffff88017b243f00] tick_nohz_stop_sched_tick at ffffffff810bd1fd
| >#15 [ffff88017b243f70] tick_nohz_irq_exit at ffffffff810bd7d2
| >#16 [ffff88017b243f90] irq_exit at ffffffff8105b02d
| >#17 [ffff88017b243fb0] reschedule_interrupt at ffffffff815db3dd
| >--- <IRQ stack> ---
| >#18 [ffff88017a2a9bc8] reschedule_interrupt at ffffffff815db3dd
| > [exception RIP: task_blocks_on_rt_mutex+51]
| >#19 [ffff88017a2a9ce0] rt_spin_lock_slowlock at ffffffff815d183c
| >#20 [ffff88017a2a9da0] lock_timer_base.isra.35 at ffffffff81061cbf
| >#21 [ffff88017a2a9dd0] schedule_timeout at ffffffff815cf1ce
| >#22 [ffff88017a2a9e50] rcu_gp_kthread at ffffffff810f9bbb
| >#23 [ffff88017a2a9ed0] kthread at ffffffff810796d5
| >#24 [ffff88017a2a9f50] ret_from_fork at ffffffff815da04c

lock_timer_base() does a try_lock() which deadlocks on the waiter lock
(lock->wait_lock), not on the rtmutex itself: the trylock slow path takes
the wait_lock unconditionally, so it spins forever if the interrupted task
on the same CPU already holds it.
This patch makes sure that all users of the wait_lock take it with
interrupts disabled, so that a try_lock from irq context is possible.
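
Put differently: the interrupted task already holds the rtmutex's
wait_lock, with interrupts still enabled, when the irq-exit path
try_lock()s the same timer base lock on the same CPU. A simplified
sketch of the sequence from the backtrace above:

  task context (rcu_gp_kthread)       hard irq, same CPU
  -----------------------------       ------------------
  schedule_timeout()
    lock_timer_base()
      rt_spin_lock_slowlock()
        raw_spin_lock(&lock->wait_lock)
        task_blocks_on_rt_mutex()
          <interrupt>                 irq_exit()
                                        tick_nohz_irq_exit()
                                          tick_nohz_stop_sched_tick()
                                            get_next_timer_interrupt()
                                              rt_mutex_trylock()
                                                raw_spin_lock(&lock->wait_lock)
                                                => spins forever: the lock is
                                                   held by the task that was
                                                   interrupted on this CPU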

Cc: stable-rt@xxxxxxxxxxxxxxx
Reported-by: Mike Galbraith <bitbucket@xxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
v1->v2: rt_spin_lock_slowlock() and rt_mutex_slowlock() may be called very
early during system boot with irqs off if the fast path cannot be taken.
This is the case when lockdep is enabled, because it disables the cmpxchg
and thus the fast path. Instead of always using the _irqsave() variant, I
now restore the saved flags if the first trylock attempt succeeds. If it
fails (the lock is already held), the code BUG()s if interrupts were
disabled on entry.
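
Condensed, the new entry sequence in both functions looks like this
(a sketch only, using the rt_spin_lock_slowlock() variant; the real
hunks are below):

	unsigned long flags;

	raw_local_save_flags(flags);
	raw_spin_lock_irq(&lock->wait_lock);

	if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
		/* Got the lock: restore the caller's irq state and return. */
		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
		return;
	}
	/*
	 * We are going to sleep. Sleeping with interrupts disabled is a
	 * bug, so catch callers that reach the slow path with irqs off.
	 */
	BUG_ON(arch_irqs_disabled_flags(flags));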

kernel/futex.c | 18 +++---
kernel/rtmutex.c | 130 +++++++++++++++++++++++++++----------------------------
2 files changed, 75 insertions(+), 73 deletions(-)

--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -891,7 +891,7 @@ static int wake_futex_pi(u32 __user *uad
if (pi_state->owner != current)
return -EINVAL;

- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

/*
@@ -917,21 +917,21 @@ static int wake_futex_pi(u32 __user *uad
else if (curval != uval)
ret = -EINVAL;
if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
}

- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);

- raw_spin_lock_irq(&new_owner->pi_lock);
+ raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
+ raw_spin_unlock(&new_owner->pi_lock);

- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
@@ -1762,11 +1762,11 @@ static int fixup_owner(u32 __user *uaddr
* we returned due to timeout or signal without taking the
* rt_mutex. Too late.
*/
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
if (!owner)
owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -298,7 +298,7 @@ static int rt_mutex_adjust_prio_chain(st
plist_add(&waiter->list_entry, &lock->wait_list);

/* Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!rt_mutex_owner(lock)) {
struct rt_mutex_waiter *lock_top_waiter;

@@ -309,7 +309,7 @@ static int rt_mutex_adjust_prio_chain(st
lock_top_waiter = rt_mutex_top_waiter(lock);
if (top_waiter != lock_top_waiter)
rt_mutex_wake_waiter(lock_top_waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
goto out_put_task;
}
put_task_struct(task);
@@ -317,7 +317,7 @@ static int rt_mutex_adjust_prio_chain(st
/* Grab the next task */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);

if (waiter == rt_mutex_top_waiter(lock)) {
/* Boost the owner */
@@ -335,10 +335,10 @@ static int rt_mutex_adjust_prio_chain(st
__rt_mutex_adjust_prio(task);
}

- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);

top_waiter = rt_mutex_top_waiter(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -425,10 +425,9 @@ static int
/* We got the lock. */

if (waiter || rt_mutex_has_waiters(lock)) {
- unsigned long flags;
struct rt_mutex_waiter *top;

- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);

/* remove the queued waiter. */
if (waiter) {
@@ -445,7 +444,7 @@ static int
top->pi_list_entry.prio = top->list_entry.prio;
plist_add(&top->pi_list_entry, &task->pi_waiters);
}
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
}

debug_rt_mutex_lock(lock);
@@ -478,10 +477,9 @@ static int task_blocks_on_rt_mutex(struc
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- unsigned long flags;
int chain_walk = 0, res;

- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);

/*
* In the case of futex requeue PI, this will be a proxy
@@ -493,7 +491,7 @@ static int task_blocks_on_rt_mutex(struc
* the task if PI_WAKEUP_INPROGRESS is set.
*/
if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
return -EAGAIN;
}

@@ -512,20 +510,20 @@ static int task_blocks_on_rt_mutex(struc

task->pi_blocked_on = waiter;

- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);

if (!owner)
return 0;

if (waiter == rt_mutex_top_waiter(lock)) {
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);

__rt_mutex_adjust_prio(owner);
if (rt_mutex_real_waiter(owner->pi_blocked_on))
chain_walk = 1;
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
}
else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
chain_walk = 1;
@@ -540,12 +538,12 @@ static int task_blocks_on_rt_mutex(struc
*/
get_task_struct(owner);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
task);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

return res;
}
@@ -560,9 +558,8 @@ static int task_blocks_on_rt_mutex(struc
static void wakeup_next_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;

- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);

waiter = rt_mutex_top_waiter(lock);

@@ -576,7 +573,7 @@ static void wakeup_next_waiter(struct rt

rt_mutex_set_owner(lock, NULL);

- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);

rt_mutex_wake_waiter(waiter);
}
@@ -592,20 +589,19 @@ static void remove_waiter(struct rt_mute
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- unsigned long flags;
int chain_walk = 0;

- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
plist_del(&waiter->list_entry, &lock->wait_list);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);

if (!owner)
return;

if (first) {

- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);

plist_del(&waiter->pi_list_entry, &owner->pi_waiters);

@@ -620,7 +616,7 @@ static void remove_waiter(struct rt_mute
if (rt_mutex_real_waiter(owner->pi_blocked_on))
chain_walk = 1;

- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
}

WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -631,11 +627,11 @@ static void remove_waiter(struct rt_mute
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}

/*
@@ -723,9 +719,6 @@ static int adaptive_wait(struct rt_mutex
}
#endif

-# define pi_lock(lock) raw_spin_lock_irq(lock)
-# define pi_unlock(lock) raw_spin_unlock_irq(lock)
-
/*
* Slow path lock function spin_lock style: this variant is very
* careful not to miss any non-lock wakeups.
@@ -737,19 +730,22 @@ static void noinline __sched rt_spin_lo
{
struct task_struct *lock_owner, *self = current;
struct rt_mutex_waiter waiter, *top_waiter;
+ unsigned long flags;
int ret;

rt_mutex_init_waiter(&waiter, true);

- raw_spin_lock(&lock->wait_lock);
+ raw_local_save_flags(flags);
+ raw_spin_lock_irq(&lock->wait_lock);
init_lists(lock);

if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return;
}

BUG_ON(rt_mutex_owner(lock) == self);
+ BUG_ON(arch_irqs_disabled_flags(flags));

/*
* We save whatever state the task is in and we'll restore it
@@ -757,10 +753,10 @@ static void noinline __sched rt_spin_lo
* as well. We are serialized via pi_lock against wakeups. See
* try_to_wake_up().
*/
- pi_lock(&self->pi_lock);
+ raw_spin_lock(&self->pi_lock);
self->saved_state = self->state;
__set_current_state(TASK_UNINTERRUPTIBLE);
- pi_unlock(&self->pi_lock);
+ raw_spin_unlock(&self->pi_lock);

ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
BUG_ON(ret);
@@ -773,18 +769,18 @@ static void noinline __sched rt_spin_lo
top_waiter = rt_mutex_top_waiter(lock);
lock_owner = rt_mutex_owner(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_print_deadlock(&waiter);

if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
schedule_rt_mutex(lock);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

- pi_lock(&self->pi_lock);
+ raw_spin_lock(&self->pi_lock);
__set_current_state(TASK_UNINTERRUPTIBLE);
- pi_unlock(&self->pi_lock);
+ raw_spin_unlock(&self->pi_lock);
}

/*
@@ -794,10 +790,10 @@ static void noinline __sched rt_spin_lo
* happened while we were blocked. Clear saved_state so
* try_to_wakeup() does not get confused.
*/
- pi_lock(&self->pi_lock);
+ raw_spin_lock(&self->pi_lock);
__set_current_state(self->saved_state);
self->saved_state = TASK_RUNNING;
- pi_unlock(&self->pi_lock);
+ raw_spin_unlock(&self->pi_lock);

/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -808,7 +804,7 @@ static void noinline __sched rt_spin_lo
BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
BUG_ON(!plist_node_empty(&waiter.list_entry));

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_free_waiter(&waiter);
}
@@ -818,7 +814,9 @@ static void noinline __sched rt_spin_lo
*/
static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);

debug_rt_mutex_unlock(lock);

@@ -826,13 +824,13 @@ static void noinline __sched rt_spin_lo

if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return;
}

wakeup_next_waiter(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

/* Undo pi boosting.when necessary */
rt_mutex_adjust_prio(current);
@@ -1037,13 +1035,13 @@ static int __sched
break;
}

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_print_deadlock(waiter);

schedule_rt_mutex(lock);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}

@@ -1135,20 +1133,23 @@ rt_mutex_slowlock(struct rt_mutex *lock,
int detect_deadlock, struct ww_acquire_ctx *ww_ctx)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;

rt_mutex_init_waiter(&waiter, false);

- raw_spin_lock(&lock->wait_lock);
+ raw_local_save_flags(flags);
+ raw_spin_lock_irq(&lock->wait_lock);
init_lists(lock);

/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
if (ww_ctx)
ww_mutex_account_lock(lock, ww_ctx);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
+ BUG_ON(arch_irqs_disabled_flags(flags));

set_current_state(state);

@@ -1177,7 +1178,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

/* Remove pending timer: */
if (unlikely(timeout))
@@ -1194,9 +1195,10 @@ rt_mutex_slowlock(struct rt_mutex *lock,
static inline int
rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret = 0;

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
init_lists(lock);

if (likely(rt_mutex_owner(lock) != current)) {
@@ -1209,7 +1211,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo
fixup_rt_mutex_waiters(lock);
}

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

return ret;
}
@@ -1220,7 +1222,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo
static void __sched
rt_mutex_slowunlock(struct rt_mutex *lock)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);

debug_rt_mutex_unlock(lock);

@@ -1228,13 +1232,13 @@ rt_mutex_slowunlock(struct rt_mutex *loc

if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return;
}

wakeup_next_waiter(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

/* Undo pi boosting if necessary: */
rt_mutex_adjust_prio(current);
@@ -1494,10 +1498,10 @@ int rt_mutex_start_proxy_lock(struct rt_
{
int ret;

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}

@@ -1520,18 +1524,17 @@ int rt_mutex_start_proxy_lock(struct rt_
* PI_REQUEUE_INPROGRESS, so that if the task is waking up
* it will know that we are in the process of requeuing it.
*/
- raw_spin_lock_irq(&task->pi_lock);
+ raw_spin_lock(&task->pi_lock);
if (task->pi_blocked_on) {
- raw_spin_unlock_irq(&task->pi_lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return -EAGAIN;
}
task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
- raw_spin_unlock_irq(&task->pi_lock);
+ raw_spin_unlock(&task->pi_lock);
#endif

ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
-
if (ret && !rt_mutex_owner(lock)) {
/*
* Reset the return value. We might have
@@ -1545,7 +1548,7 @@ int rt_mutex_start_proxy_lock(struct rt_
if (unlikely(ret))
remove_waiter(lock, waiter);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_print_deadlock(waiter);

@@ -1595,12 +1598,11 @@ int rt_mutex_finish_proxy_lock(struct rt
{
int ret;

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

set_current_state(TASK_INTERRUPTIBLE);

ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
-
set_current_state(TASK_RUNNING);

if (unlikely(ret))
@@ -1612,7 +1614,7 @@ int rt_mutex_finish_proxy_lock(struct rt
*/
fixup_rt_mutex_waiters(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

return ret;
}