Re: [patch] futex: Cure exit race

From: Peter Zijlstra
Date: Wed Dec 12 2018 - 04:04:30 EST


On Mon, Dec 10, 2018 at 06:43:51PM +0100, Thomas Gleixner wrote:
> On Mon, 10 Dec 2018, Peter Zijlstra wrote:
> > On Mon, Dec 10, 2018 at 04:23:06PM +0100, Thomas Gleixner wrote:
> > There is another callers of futex_lock_pi_atomic(),
> > futex_proxy_trylock_atomic(), which is part of futex_requeue(), that too
> > does a retry loop on -EAGAIN.
> >
> > And there is another caller of attach_to_pi_owner(): lookup_pi_state(),
> > and that too is in futex_requeue() and handles the retry case properly.
> >
> > Yes, this all looks good.
> >
> > Acked-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
>
> Bah. The little devil in the unconcious part of my brain insisted on
> thinking further about that EAGAIN loop even despite my attempt to page
> that futex horrors out again immediately after sending that patch.
>
> There is another related issue which is even worse than just mildly
> confusing user space:
>
> task1(SCHED_OTHER)
> sys_exit()
> do_exit()
> exit_mm()
> task1->flags |= PF_EXITING;
>
> ---> preemption
>
> task2(SCHED_FIFO)
> sys_futex(LOCK_PI)
> ....
> attach_to_pi_owner() {
> ...
> if (!task1->flags & PF_EXITING) {
> attach();
> } else {
> if (!(tsk->flags & PF_EXITPIDONE))
> return -EAGAIN;
>
> Now assume UP or both tasks pinned on the same CPU. That results in a
> livelock because task2 is going to loop forever.
>
> No immediate idea how to cure that one w/o creating a mess.

One possible; but fairly gruesome hack; would be something like the
below.

Now, this obviously introduces a priority inversion, but that's
arguablly better than a live-lock, also I'm not sure there's really
anything 'sane' you can do in the case where your lock holder is dying
instead of doing a proper unlock anyway.

But no, I'm not liking this much either...

diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..bc6a01112d9d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -806,6 +806,8 @@ void __noreturn do_exit(long code)
* task into the wait for ever nirwana as well.
*/
tsk->flags |= PF_EXITPIDONE;
+ smp_mb();
+ wake_up_bit(&tsk->flags, 3 /* PF_EXITPIDONE */);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
diff --git a/kernel/futex.c b/kernel/futex.c
index f423f9b6577e..a743d657e783 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1148,8 +1148,8 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps, struct task_struct **pe)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
@@ -1187,10 +1236,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
* set, we know that the task has finished the
* cleanup:
*/
int ret = handle_exit_race(uaddr, uval, p);

raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
+
+ if (ret == -EAGAIN)
+ *pe = p;
+ else
+ put_task_struct(p);
+
return ret;
}

@@ -1244,7 +1298,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps);
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1282,7 +1336,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
+ struct task_struct *task, int set_waiters,
+ struct task_struct **exiting)
{
u32 uval, newval, vpid = task_pid_vnr(task);
struct futex_q *top_waiter;
@@ -1352,7 +1407,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
}

/**
@@ -2716,6 +2771,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
+ struct task_struct *exiting;
int res, ret;

if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -2733,6 +2789,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
}

retry:
+ exiting = NULL;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2740,7 +2797,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
retry_private:
hb = queue_lock(&q);

- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0, &exiting);
if (unlikely(ret)) {
/*
* Atomic work succeeded and we got the lock,
@@ -2762,6 +2819,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
*/
queue_unlock(hb);
put_futex_key(&q.key);
+
+ if (exiting) {
+ wait_bit(&exiting->flags, 3 /* PF_EXITPIDONE */, TASK_UNINTERRUPTIBLE);
+ put_task_struct(exiting);
+ }
+
cond_resched();
goto retry;
default: