[PATCH for 5.9 v2 2/4] futex: implement FUTEX_SWAP as wake+wait.

From: Peter Oskolkov
Date: Mon Aug 03 2020 - 18:15:25 EST


From: Peter Oskolkov <posk@xxxxxxxxxx>

See the previous patch in this patchset, which introduced the
FUTEX_SWAP op in futex.h, for a detailed description of the
use cases and future directions.

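This patch implements FUTEX_SWAP as a simple wake+wait: the caller
wakes one waiter on uaddr2 and then waits on uaddr. The next patch
will improve on this by migrating the wakee to the CPU of the waker
(which is also the task that then goes to sleep).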

Tested: see patch 4 in this patchset.

Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx>
---
kernel/futex.c | 87 +++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 4616d4ad609d..a81d62a16e72 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1574,16 +1574,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
}

/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
+ * Add waiters matching bitset queued on this futex (uaddr) to wake_q.
*/
static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset,
+ struct wake_q_head *wake_q)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
- DEFINE_WAKE_Q(wake_q);

if (!bitset)
return -EINVAL;
@@ -1611,13 +1611,26 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;

- mark_wake_futex(&wake_q, this);
+ mark_wake_futex(wake_q, this);
if (++ret >= nr_wake)
break;
}
}

spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q);
wake_up_q(&wake_q);
return ret;
}
@@ -2557,9 +2570,12 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ * @next: if present, wake next and hint to the scheduler that we'd
+ * prefer to execute it locally.
*/
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
+ struct hrtimer_sleeper *timeout,
+ struct task_struct *next)
{
/*
* The task state is guaranteed to be set before another task can
@@ -2584,10 +2600,26 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* flagged for rescheduling. Only call schedule if there
* is no timeout, or if it has yet to expire.
*/
- if (!timeout || timeout->task)
+ if (!timeout || timeout->task) {
+ if (next) {
+ /*
+ * wake_up_process() below will be
+ * replaced in the next patch with
+ * wake_up_swap().
+ */
+ wake_up_process(next);
+ put_task_struct(next);
+ next = NULL;
+ }
freezable_schedule();
+ }
}
__set_current_state(TASK_RUNNING);
+
+ if (next) {
+ wake_up_process(next);
+ put_task_struct(next);
+ }
}

/**
@@ -2663,7 +2695,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
}

static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset)
+ ktime_t *abs_time, u32 bitset, struct task_struct *next)
{
struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
@@ -2687,7 +2719,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
goto out;

/* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to);
+ futex_wait_queue_me(hb, &q, to, next);
+ next = NULL;

/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
@@ -2720,6 +2753,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ret = -ERESTART_RESTARTBLOCK;

out:
+ if (next) {
+ wake_up_process(next);
+ put_task_struct(next);
+ }
if (to) {
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
@@ -2727,7 +2764,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return ret;
}

-
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
@@ -2739,10 +2775,29 @@ static long futex_wait_restart(struct restart_block *restart)
}
restart->fn = do_no_restart_syscall;

- return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
+ return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val,
+ tp, restart->futex.bitset, NULL);
}

+static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 __user *uaddr2)
+{
+ u32 bitset = FUTEX_BITSET_MATCH_ANY;
+ struct task_struct *next = NULL;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
+ if (ret < 0)
+ return ret;
+ if (!wake_q_empty(&wake_q)) {
+ /* At most one wakee can be present. Pull it out. */
+ next = container_of(wake_q.first, struct task_struct, wake_q);
+ next->wake_q.next = NULL;
+ }
+
+ return futex_wait(uaddr, flags, val, abs_time, bitset, next);
+}

/*
* Userspace tried a 0 -> TID atomic transition of the futex value
@@ -3221,7 +3276,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
}

/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to);
+ futex_wait_queue_me(hb, &q, to, NULL);

spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
@@ -3746,7 +3801,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
val3 = FUTEX_BITSET_MATCH_ANY;
/* fall through */
case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
+ return futex_wait(uaddr, flags, val, timeout, val3, NULL);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
/* fall through */
@@ -3770,6 +3825,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ case FUTEX_SWAP:
+ return futex_swap(uaddr, flags, val, timeout, uaddr2);
}
return -ENOSYS;
}
@@ -3786,7 +3843,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,

if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
if (get_timespec64(&ts, utime))
@@ -3795,7 +3852,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return -EINVAL;

t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
+ if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
--
2.25.1