Re: Linux 4.4.280

From: Greg Kroah-Hartman
Date: Tue Aug 10 2021 - 12:11:33 EST


diff --git a/Makefile b/Makefile
index 7dc479e9a665..870bd763830e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 4
-SUBLEVEL = 279
+SUBLEVEL = 280
EXTRAVERSION =
NAME = Blurry Fish Butt

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0a93e9d1708e..3072e9c93ae6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -880,9 +880,7 @@ static __always_inline void rcu_read_lock(void)
* Unfortunately, this function acquires the scheduler's runqueue and
* priority-inheritance spinlocks. This means that deadlock could result
* if the caller of rcu_read_unlock() already holds one of these locks or
- * any lock that is ever acquired while holding them; or any lock which
- * can be taken from interrupt context because rcu_boost()->rt_mutex_lock()
- * does not disable irqs while taking ->wait_lock.
+ * any lock that is ever acquired while holding them.
*
* That said, RCU readers are never priority boosted unless they were
* preempted. Therefore, one way to avoid deadlock is to make sure
diff --git a/kernel/futex.c b/kernel/futex.c
index ff5499b0c5b3..6d47b7dc1cfb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -825,7 +825,7 @@ static int refill_pi_state_cache(void)
return 0;
}

-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;

@@ -858,10 +858,18 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state,
}
}

+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
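
The hunk above turns the pi_state refcount into an explicit get/put pair: get_pi_state() is an inc-not-zero (it warns if the count was already zero, since callers must already hold a reference), and put_pi_state() drops a reference, freeing or caching the object on the last one. A minimal userspace sketch of that pattern using C11 atomics; pi_obj, pi_get and pi_put are invented names, not the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

/* Illustrative stand-in for struct futex_pi_state. */
struct pi_obj {
    atomic_int refcount;
};

/*
 * Like get_pi_state()/atomic_inc_not_zero(): only succeed while at least
 * one reference is still held, so an object whose count reached zero is
 * never revived.
 */
static bool pi_get(struct pi_obj *p)
{
    int old = atomic_load(&p->refcount);

    while (old != 0) {
        if (atomic_compare_exchange_weak(&p->refcount, &old, old + 1))
            return true;
    }
    return false;                   /* raced with the final put */
}

/* Like put_pi_state(): drop a reference, release the object on the last one. */
static void pi_put(struct pi_obj *p)
{
    if (atomic_fetch_sub(&p->refcount, 1) == 1)
        free(p);
}

int main(void)
{
    struct pi_obj *p = malloc(sizeof(*p));

    if (!p)
        return 1;
    atomic_init(&p->refcount, 1);   /* creator holds the first reference */
    if (pi_get(p))                  /* a second user pins the object */
        pi_put(p);
    pi_put(p);                      /* last reference, frees the object */
    return 0;
}
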
@@ -898,7 +906,7 @@ static void free_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;

@@ -958,10 +966,12 @@ static void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);

- rt_mutex_futex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);

+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
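
exit_pi_state_list() above now pins the pi_state with get_pi_state() while hb->lock still protects it, drops hb->lock, and only then unlocks the rt_mutex and drops the reference, so the pi_state cannot be freed under the possibly contended rt_mutex_futex_unlock(). A hedged pthread analogue of that ordering follows; list_lock stands in for hb->lock, obj for the pi_state, and every name is illustrative:

#include <pthread.h>
#include <stdlib.h>

/* list_lock stands in for hb->lock, obj->inner for pi_state->pi_mutex. */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
    int refcount;                       /* protected by list_lock */
    pthread_mutex_t inner;
};

static void obj_put(struct obj *o)
{
    int last;

    pthread_mutex_lock(&list_lock);
    last = (--o->refcount == 0);
    pthread_mutex_unlock(&list_lock);

    if (last) {
        pthread_mutex_destroy(&o->inner);
        free(o);
    }
}

/*
 * Same ordering as the reworked exit_pi_state_list(): pin the object while
 * the list lock still protects it, drop the list lock, and only then do the
 * possibly contended unlock and the final put.
 */
static void release_one(struct obj *o)
{
    pthread_mutex_lock(&list_lock);
    o->refcount++;                      /* pin: outlives the list lock */
    pthread_mutex_unlock(&list_lock);

    pthread_mutex_unlock(&o->inner);    /* this thread locked it earlier */
    obj_put(o);
}

int main(void)
{
    struct obj *o = calloc(1, sizeof(*o));

    if (!o)
        return 1;
    o->refcount = 1;
    pthread_mutex_init(&o->inner, NULL);
    pthread_mutex_lock(&o->inner);

    release_one(o);     /* unlocks inner, refcount back to 1 */
    obj_put(o);         /* last reference, frees the object */
    return 0;
}
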
@@ -1075,6 +1085,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
* has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
* which in turn means that futex_lock_pi() still has a reference on
* our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
*/
WARN_ON(!atomic_read(&pi_state->refcount));

@@ -1146,7 +1161,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
goto out_einval;

out_attach:
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
@@ -1529,48 +1544,35 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
q->lock_ptr = NULL;
}

-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool deboost = false;
WAKE_Q(wake_q);
- bool deboost;
int ret = 0;

- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
- /*
- * When we interleave with futex_lock_pi() where it does
- * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
- * but the rt_mutex's wait_list can be empty (either still, or again,
- * depending on which side we land).
- *
- * When this happens, give up our locks and try again, giving the
- * futex_lock_pi() instance time to complete, either by waiting on the
- * rtmutex or removing itself from the futex queue.
- */
- if (!new_owner) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return -EAGAIN;
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
}

/*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

@@ -1603,15 +1605,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
}

+out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);

if (deboost) {
wake_up_q(&wake_q);
rt_mutex_adjust_prio(current);
}

- return 0;
+ return ret;
}

/*
@@ -2121,7 +2123,7 @@ retry_private:
case 0:
break;
case -EFAULT:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -2139,7 +2141,7 @@ retry_private:
* exit to complete.
* - EAGAIN: The user space value changed.
*/
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -2201,7 +2203,7 @@ retry_private:
*/
if (requeue_pi) {
/* Prepare the waiter to take the rt_mutex. */
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2214,7 +2216,7 @@ retry_private:
} else if (ret) {
/* -EDEADLK */
this->pi_state = NULL;
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
goto out_unlock;
}
}
@@ -2223,7 +2225,7 @@ retry_private:
}

out_unlock:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
@@ -2277,20 +2279,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}

-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;

@@ -2307,6 +2296,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}

@@ -2376,7 +2383,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);

BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;

spin_unlock(q->lock_ptr);
@@ -2430,10 +2437,22 @@ retry:
}

/*
- * Since we just failed the trylock; there must be an owner.
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_fault;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {
@@ -2454,7 +2473,7 @@ retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;

- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;

if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2812,6 +2831,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2872,24 +2892,51 @@ retry_private:
}
}

+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);

- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
+ if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
+
+ /*
+ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+ * such that the hb and rt_mutex wait lists match.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ goto no_block;
}

+ spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
+ /*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2913,8 +2960,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
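
The futex_lock_pi() hunks above replace the one-shot rt_mutex_timed_futex_lock() with a split sequence: enqueue on the rt_mutex via rt_mutex_start_proxy_lock() while hb->lock is still held so the hb and rt_mutex wait lists stay in sync, drop hb->lock, block in rt_mutex_wait_proxy_lock(), then retake hb->lock and undo the enqueue with rt_mutex_cleanup_proxy_lock() if the wait failed. A rough userspace analogue of that enqueue / drop-the-outer-lock / wait / relock-and-dequeue shape, built on a pthread condition variable; struct waitqueue and both function names are invented for illustration and are not the kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* outer stands in for hb->lock, nwaiters for the rt_mutex wait list. */
struct waitqueue {
    pthread_mutex_t outer;
    pthread_cond_t cond;
    int nwaiters;
    bool granted;
};

/*
 * Split acquisition in the shape of the reworked futex_lock_pi():
 *   [1] enqueue while the outer lock is held   (rt_mutex_start_proxy_lock)
 *   [2] block without it, optionally timed     (rt_mutex_wait_proxy_lock)
 *   [3] retake it and dequeue                  (rt_mutex_cleanup_proxy_lock)
 */
static int wait_for_grant(struct waitqueue *wq, const struct timespec *deadline)
{
    int err = 0;

    pthread_mutex_lock(&wq->outer);
    wq->nwaiters++;                             /* [1] */

    while (!wq->granted && !err)                /* [2] */
        err = deadline ?
            pthread_cond_timedwait(&wq->cond, &wq->outer, deadline) :
            pthread_cond_wait(&wq->cond, &wq->outer);

    if (!err)
        wq->granted = false;                    /* the grant is ours */
    wq->nwaiters--;                             /* [3] */
    pthread_mutex_unlock(&wq->outer);

    return err;                                 /* 0 or ETIMEDOUT */
}

static void *granter(void *arg)
{
    struct waitqueue *wq = arg;

    pthread_mutex_lock(&wq->outer);
    wq->granted = true;
    pthread_cond_signal(&wq->cond);
    pthread_mutex_unlock(&wq->outer);
    return NULL;
}

int main(void)
{
    static struct waitqueue wq = {
        .outer = PTHREAD_MUTEX_INITIALIZER,
        .cond  = PTHREAD_COND_INITIALIZER,
    };
    pthread_t t;

    pthread_create(&t, NULL, granter, &wq);
    wait_for_grant(&wq, NULL);  /* NULL deadline: wait until granted */
    pthread_join(t, NULL);
    return 0;
}

pthread_cond_wait() hides the drop and retake of the outer mutex that the futex code performs by hand around rt_mutex_wait_proxy_lock(); the late-grant race that rt_mutex_cleanup_proxy_lock() covers is sketched separately after the rtmutex.c cleanup hunk further down.
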
@@ -2967,10 +3016,39 @@ retry:
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ struct futex_pi_state *pi_state = match->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * Since modifying the wait_list is done while holding both
+ * hb->lock and wait_lock, holding either is sufficient to
+ * observe it.
+ *
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
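
futex_unlock_pi() above now takes pi_mutex.wait_lock before letting go of hb->lock, so there is never a window in which it holds neither; because the wait list is only changed with both locks held, wake_futex_pi() must observe a state consistent with the futex_top_waiter() lookup. A small pthread sketch of that lock handoff, with outer and inner as stand-ins for hb->lock and the pi_mutex wait_lock:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* hb->lock     */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* pi wait_lock */
static int shared_state;    /* only written with both locks held */

/* Writers take both locks, so holding either one is enough to read. */
static void update_state(int v)
{
    pthread_mutex_lock(&outer);
    pthread_mutex_lock(&inner);
    shared_state = v;
    pthread_mutex_unlock(&inner);
    pthread_mutex_unlock(&outer);
}

/*
 * Handoff as in the reworked futex_unlock_pi(): take the inner lock before
 * dropping the outer one, so there is no window in which we hold neither
 * and the state we just validated could change behind our back.
 */
static int read_consistently(void)
{
    int v;

    pthread_mutex_lock(&outer);
    /* ... validate state under the outer lock ... */
    pthread_mutex_lock(&inner);         /* overlap: outer is still held */
    pthread_mutex_unlock(&outer);

    v = shared_state;                   /* still consistent: inner held */
    pthread_mutex_unlock(&inner);
    return v;
}

int main(void)
{
    update_state(42);
    return read_consistently() == 42 ? 0 : 1;
}
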
@@ -2985,7 +3063,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2993,7 +3070,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}

/*
@@ -3003,8 +3080,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }

/*
* If uval has changed, let user space handle it.
@@ -3018,7 +3097,6 @@ out_putkey:
return ret;

pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);

ret = fault_in_user_writeable(uaddr);
@@ -3148,10 +3226,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);

ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -3210,7 +3285,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
- free_pi_state(q.pi_state);
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
/*
* Adjust the return value. It's either -EFAULT or
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1c0cb5c3c6ad..532986d82179 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -163,13 +163,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);

clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -211,11 +212,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -497,7 +499,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;

detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -540,7 +541,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);

/*
* [2] Get the waiter on which @task is blocked on.
@@ -624,7 +625,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -655,7 +656,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);

/*
@@ -663,14 +664,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}

/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);

/*
* No requeue [11] here. We just do deadlock detection.
@@ -685,8 +686,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);

/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -707,7 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);

/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);

/*
@@ -725,14 +726,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}

/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);

/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -786,8 +787,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);

/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

/*
* Make the actual exit decisions [12], based on the stored
@@ -810,7 +811,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;

out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);

@@ -820,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -830,8 +831,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -916,7 +915,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -925,7 +924,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);

takeit:
/* We got the lock. */
@@ -945,7 +944,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -956,7 +955,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;

/*
* Early deadlock detection. We really don't want the task to
@@ -970,7 +968,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;

- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -983,12 +981,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,

task->pi_blocked_on = waiter;

- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);

if (!owner)
return 0;

- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -1003,7 +1001,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);

- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -1019,12 +1017,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

return res;
}
@@ -1033,15 +1031,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;

- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);

waiter = rt_mutex_top_waiter(lock);

@@ -1063,7 +1060,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;

- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);

wake_q_add(wake_q, waiter->task);
}
@@ -1071,7 +1068,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1080,12 +1077,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;

- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);

/*
* Only update priority if the waiter was the highest priority
@@ -1094,7 +1090,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;

- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);

rt_mutex_dequeue_pi(owner, waiter);

@@ -1106,7 +1102,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);

- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);

/*
* Don't walk the chain, if the owner task is not blocked
@@ -1118,12 +1114,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}

/*
@@ -1155,15 +1151,23 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}

+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1191,13 +1195,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_print_deadlock(waiter);

schedule();

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}

@@ -1234,17 +1238,24 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;

- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);

- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);

/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
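
The comment above explains why rt_mutex_slowlock() switched to the irqsave/irqrestore variants: in early boot the slow path can be entered with interrupts already disabled, and an unconditional _irq unlock would re-enable them behind the caller's back. The same save-old-state-and-restore discipline, shown as a userspace signal-mask analogue where sigprocmask() plays the role of the interrupt flag (an illustration only, not kernel code):

#include <signal.h>

/*
 * Save/restore instead of disable/enable: restoring the saved state is
 * correct even if the caller had already blocked the "interrupt", which is
 * the raw_spin_lock_irqsave()/irqrestore() versus _irq() distinction.
 */
static void critical_section(void)
{
    sigset_t block, saved;

    sigemptyset(&block);
    sigaddset(&block, SIGINT);

    sigprocmask(SIG_BLOCK, &block, &saved);     /* "irqsave": remember old state */

    /* ... work with SIGINT blocked ... */

    sigprocmask(SIG_SETMASK, &saved, NULL);     /* "irqrestore", not SIG_UNBLOCK */
}

int main(void)
{
    sigset_t block;

    /* The caller already has SIGINT blocked, like the early-boot case where
     * the rtmutex slow path runs with interrupts disabled. */
    sigemptyset(&block);
    sigaddset(&block, SIGINT);
    sigprocmask(SIG_BLOCK, &block, NULL);

    critical_section();

    /* SIGINT is still blocked here; an unconditional unblock inside
     * critical_section() would have changed the caller's state. */
    return 0;
}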

@@ -1273,7 +1284,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

/* Remove pending timer: */
if (unlikely(timeout))
@@ -1302,6 +1313,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;

/*
@@ -1313,14 +1325,14 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;

/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);

ret = __rt_mutex_slowtrylock(lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

return ret;
}
@@ -1332,7 +1344,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);

debug_rt_mutex_unlock(lock);

@@ -1369,10 +1384,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}

/*
@@ -1383,7 +1398,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

/* check PI boosting */
return true;
@@ -1482,19 +1497,6 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);

-/*
- * Futex variant with full deadlock detection.
- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
- */
-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
-{
- might_sleep();
-
- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
- timeout, RT_MUTEX_FULL_CHAINWALK);
-}
-
/*
* Futex variant, must not use fastpath.
*/
@@ -1687,10 +1689,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;

- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);

if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}

@@ -1711,7 +1713,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);

- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

debug_rt_mutex_print_deadlock(waiter);

@@ -1761,20 +1763,16 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
{
int ret;

- raw_spin_lock(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
+ raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
-
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);

return ret;
}
@@ -1804,15 +1802,32 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
bool cleanup = false;

raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
/*
* Unless we're the owner; we're still enqueued on the wait_list.
* So check if we became owner, if not, take us off the wait_list.
*/
if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
- fixup_rt_mutex_waiters(lock);
cleanup = true;
}
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
raw_spin_unlock_irq(&lock->wait_lock);

return cleanup;
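
rt_mutex_cleanup_proxy_lock() above now performs one unconditional try_to_take_rt_mutex() before deciding whether to remove the waiter, because __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() may already have handed the lock over and left a NULL owner; without that final trylock, a handoff racing with a signal or timeout would be thrown away. A compact sketch of that try-once-more-before-backing-out cleanup, with a boolean grant standing in for the handed-over rt_mutex and all names invented:

#include <pthread.h>
#include <stdbool.h>

/* lock stands in for wait_lock, granted for the "lock handed over, owner
 * still NULL" state left behind by mark_wakeup_next_waiter(). */
struct gate {
    pthread_mutex_t lock;
    bool granted;       /* a wakeup already assigned us the resource */
    int nwaiters;
};

/*
 * Cleanup after a failed or timed-out wait, in the shape of the new
 * rt_mutex_cleanup_proxy_lock(): first try to take the resource
 * unconditionally, since a concurrent unlock may already have granted it
 * to us, and only if that fails dequeue ourselves. Returns true if the
 * caller still has to back out.
 */
static bool cleanup_wait(struct gate *g)
{
    bool cleanup = false;

    pthread_mutex_lock(&g->lock);
    if (g->granted) {
        g->granted = false;     /* late win: consume the handoff */
    } else {
        g->nwaiters--;          /* still a waiter: remove ourselves */
        cleanup = true;
    }
    pthread_mutex_unlock(&g->lock);

    return cleanup;
}

int main(void)
{
    static struct gate g = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .granted = true,        /* a handoff raced with our timeout */
        .nwaiters = 1,
    };

    return cleanup_wait(&g) ? 1 : 0;    /* 0: the late grant was consumed */
}
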
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4584db96265d..97c048c494f0 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,7 @@ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -110,7 +111,6 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
extern int __rt_mutex_futex_trylock(struct rt_mutex *l);