[PATCH v4 08/20] TP-futex: Enable robust handling

From: Waiman Long
Date: Thu Dec 29 2016 - 11:14:30 EST


The TP futexes don't have code to handle the death of futex
owners. There are 2 different cases that need to be considered.

As the top waiter gets a reference to the task structure of the futex
owner, the task structure will never go away even if the owner dies.
When the futex owner dies while the top waiter is spinning, the task
structure will be marked dead, or the pid won't have a matching task
structure if the task died before a reference was taken. Alternatively,
if the robust futex attribute is enabled, the FUTEX_OWNER_DIED bit of
the futex word may also be set. In all those cases, what the top waiter
needs to do is to grab the futex directly. An informational message
will be printed to highlight this event.

If the futex owner dies while the top waiter is sleeping, we need to
make the exit processing code wake up the top waiter. This is done
by chaining the futex state object into the pi_state_list of the futex
owner before the top waiter sleeps so that if exit_pi_state_list()
is called, the wakeup will happen. The top waiter needs to remove
its futex state object from the pi_state_list of the old owner if
the ownership changes hands or when the lock is acquired.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/futex.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 79 insertions(+), 6 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2374cce..2d3ec8d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1004,7 +1004,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
}

/*
- * This task is holding PI mutexes at exit time => bad.
+ * This task is holding PI or TP mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
@@ -1021,12 +1021,31 @@ void exit_pi_state_list(struct task_struct *curr)
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
+ *
+ * For TP futexes, the only purpose of showing up in the
+ * pi_state_list is for this function to wake up the serialization
+ * mutex owner (state->mutex_owner). We don't actually need to take
+ * the HB lock. The futex state and task struct won't go away as long
+ * as we hold the pi_lock.
*/
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {

next = head->next;
pi_state = list_entry(next, struct futex_state, list);
+
+ if (pi_state->type == TYPE_TP) {
+ struct task_struct *owner;
+
+ owner = READ_ONCE(pi_state->mutex_owner);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+ if (owner)
+ wake_up_process(owner);
+ continue;
+ }
+
key = pi_state->key;
hb = hash_futex(&key);
raw_spin_unlock_irq(&curr->pi_lock);
@@ -3183,8 +3202,8 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
goto retry;

/*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
+ * Wake robust wait-wake futexes here. The wakeup of
+ * PI and TP futexes happens in exit_pi_state_list():
*/
if (!pi && (uval & FUTEX_WAITERS))
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
@@ -3325,6 +3344,12 @@ void exit_robust_list(struct task_struct *curr)
* Unlike the other futexes, the futex_q structures aren't used. Instead,
* they will queue up in the serialization mutex of the futex state container
* queued in the hash bucket.
+ *
+ * To handle the exceptional case that the futex owner died, the robust
+ * futexes list mechanism is used for waking up the sleeping top waiter.
+ * Checks are also made in the futex_spin_on_owner() loop for dead task
+ * structure or invalid pid. In both cases, the top waiter will take over
+ * the ownership of the futex.
*/

/**
@@ -3513,6 +3538,10 @@ static inline int futex_set_waiters_bit(u32 __user *uaddr, u32 *puval)
static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
struct futex_state *state)
{
+#define OWNER_DEAD_MESSAGE \
+ "futex: owner pid %d of TP futex 0x%lx was %s.\n" \
+ "\tLock is now acquired by pid %d!\n"
+
int ret;
u32 uval;
u32 owner_pid = 0;
@@ -3527,13 +3556,47 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
break;

if ((uval & FUTEX_TID_MASK) != owner_pid) {
- if (owner_task)
+ if (owner_task) {
+ /*
+ * task_pi_list_del() should always be
+ * done before put_task_struct(). The futex
+ * state may have been dequeued if the task
+ * is dead.
+ */
+ if (state->owner) {
+ WARN_ON(state->owner != owner_task);
+ task_pi_list_del(state, true);
+ }
put_task_struct(owner_task);
+ }

owner_pid = uval & FUTEX_TID_MASK;
owner_task = futex_find_get_task(owner_pid);
}

+ if (unlikely(!owner_task ||
+ (owner_task->flags & PF_EXITING) ||
+ (uval & FUTEX_OWNER_DIED))) {
+ /*
+ * PID invalid or exiting/dead task, we can directly
+ * grab the lock now.
+ */
+ u32 curval;
+ char *owner_state;
+
+ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval,
+ vpid);
+ if (unlikely(ret))
+ break;
+ if (curval != uval)
+ continue;
+ owner_state = (owner_task || (uval & FUTEX_OWNER_DIED))
+ ? "dead" : "invalid";
+ pr_info(OWNER_DEAD_MESSAGE, owner_pid,
+ (long)uaddr, owner_state, vpid);
+ break;
+ }
+
if (need_resched()) {
__set_current_state(TASK_RUNNING);
schedule_preempt_disabled();
@@ -3552,12 +3615,17 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,

/*
* If the owner isn't active, we need to go to sleep after
- * making sure that the FUTEX_WAITERS bit is set.
+ * making sure that the FUTEX_WAITERS bit is set. We also
+ * need to put the futex state into the futex owner's
+ * pi_state_list to prevent deadlock when the owner dies.
*/
ret = futex_set_waiters_bit(uaddr, &uval);
if (ret)
break;

+ if (owner_task && !state->owner)
+ task_pi_list_add(owner_task, state);
+
/*
* Do a trylock after setting the task state to make
* sure we won't miss a wakeup.
@@ -3593,8 +3661,13 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
goto retry;
}

- if (owner_task)
+ if (owner_task) {
+ if (state->owner)
+ task_pi_list_del(state, false);
put_task_struct(owner_task);
+ } else {
+ WARN_ON(state->owner);
+ }

/*
* Cleanup futex state.
--
1.8.3.1