[PATCH v4 14/20] TP-futex: Support userspace reader/writer locks

From: Waiman Long
Date: Thu Dec 29 2016 - 11:14:42 EST


TP futexes were designed to support userspace mutually exclusive
(mutex) locks. They are now extended to support userspace
reader/writer locks as well.

Two new futex command codes are added:
1) FUTEX_LOCK_SHARED - to acquire a shared lock (reader-lock)
2) FUTEX_UNLOCK_SHARED - to release a shared lock (reader-unlock)

The original FUTEX_LOCK and FUTEX_UNLOCK command codes act as the
writer lock and unlock methods of a userspace rwlock.
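
The new commands are issued through the raw futex(2) syscall. The
sample code further below calls a futex() helper; a minimal sketch of
such a wrapper is shown here (how the val and timeout arguments are
interpreted by the TP futex commands is an assumption based on the
rest of this series, and the "..." in the samples below stands for
the remaining arguments):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/*
 * Hypothetical thin wrapper around the raw futex(2) syscall. Passing
 * the val and timeout arguments through unchanged is an assumption.
 */
static inline long futex(int *uaddr, int op, int val,
			 const struct timespec *timeout)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}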

A special bit, FUTEX_SHARED (bit 29), indicates whether a lock is an
exclusive (writer) or a shared (reader) lock. Because of that, the
maximum TID value is now reduced to 2^29 - 1 instead of 2^30 - 1. With
a shared lock, the lower 24 bits are used to track the reader count.
Bit 28 is a special FUTEX_SHARED_UNLOCK bit that should be used by
userspace to prevent racing in the FUTEX_UNLOCK_SHARED futex(2)
syscall. Bits 24-27 are reserved for future extension.
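
To summarize the futex word layout described above (the mask values
match the uapi header changes in this patch; the two helpers are only
illustrative):

/*
 * TP futex word layout:
 *
 *   bit 31     - FUTEX_WAITERS
 *   bit 30     - FUTEX_OWNER_DIED
 *   bit 29     - FUTEX_SHARED (reader-owned when set)
 *   bit 28     - FUTEX_SHARED_UNLOCK (shared mode only)
 *   bits 0-28  - owner TID (exclusive mode)
 *   bits 24-27 - reserved for userspace (shared mode)
 *   bits 0-23  - reader count (shared mode)
 */
#define FUTEX_WAITERS		0x80000000
#define FUTEX_OWNER_DIED	0x40000000
#define FUTEX_SHARED		0x20000000
#define FUTEX_SHARED_UNLOCK	0x10000000
#define FUTEX_TID_MASK		0x1fffffff
#define FUTEX_SCNT_MASK		0x00ffffff

static inline int futex_is_reader_owned(unsigned int uval)
{
	return (uval & FUTEX_SHARED) != 0;
}

static inline unsigned int futex_reader_count(unsigned int uval)
{
	return uval & FUTEX_SCNT_MASK;
}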

When a TP futex becomes reader-owned, there is no way to tell if the
readers are running or not, so a fixed time interval (50us) is used
when spinning on a reader-owned futex. The top waiter aborts the
spinning and goes to sleep when no activity is observed in the futex
value after the spin timeout expires. Activity here means any change
in the reader count of the futex, indicating that new readers are
coming in or old readers are leaving.
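
In pseudo-C, the reader-spin policy looks roughly like this. This is
an illustration only, not the kernel code; now() stands in for
sched_clock() and read_futex() for re-reading the futex word, both of
which are placeholder names:

/*
 * Simplified sketch of the reader-spin timeout handling. The real
 * logic lives in futex_spin_on_owner() in the diff below.
 */
static int reader_spin(unsigned int *uaddr)
{
	unsigned long long deadline = now() + TP_RSPIN_TIMEOUT;
	unsigned int uval = read_futex(uaddr);
	unsigned int last_count = uval & FUTEX_SCNT_MASK;

	while (uval & FUTEX_SHARED) {	/* Still reader-owned */
		unsigned int count = uval & FUTEX_SCNT_MASK;

		if (count != last_count) {
			/* Readers came or left: restart the 50us window */
			last_count = count;
			deadline = now() + TP_RSPIN_TIMEOUT;
		} else if (now() > deadline) {
			return 0;	/* No activity: stop spinning, go to sleep */
		}
		cpu_relax();
		uval = read_futex(uaddr);
	}
	return 1;	/* No longer reader-owned: retry the trylock */
}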

Sample code to acquire/release a reader lock in userspace is:

void read_lock(int *faddr)
{
	int val = cmpxchg(faddr, 0, FUTEX_SHARED + 1);

	if (!val)
		return;		/* Uncontended: first reader */
	for (;;) {
		int old = val, new;

		if (!old)
			new = FUTEX_SHARED + 1;
		else if ((old & (~FUTEX_SHARED_TID_MASK|FUTEX_SHARED_UNLOCK)) ||
			 !(old & FUTEX_SHARED))
			break;	/* Flag bits set or writer-owned: use the kernel */
		else
			new = old + 1;
		val = cmpxchg(faddr, old, new);
		if (val == old)
			return;
	}
	while (futex(faddr, FUTEX_LOCK_SHARED, ...) < 0)
		;
}

void read_unlock(int *faddr)
{
	int val = xadd(faddr, -1) - 1;	/* xadd() returns the pre-decrement value */
	int old;

	for (;;) {
		if ((val & (FUTEX_SCNT_MASK|FUTEX_SHARED_UNLOCK)) ||
		    !(val & FUTEX_SHARED))
			return;	/* Not the last reader or not shared */
		if (val & ~FUTEX_SHARED_TID_MASK) {
			old = cmpxchg(faddr, val, val|FUTEX_SHARED_UNLOCK);
			if (old == val)
				break;	/* Waiters present: unlock in the kernel */
		} else {
			old = cmpxchg(faddr, val, 0);
			if (old == val)
				return;	/* Lock fully released */
		}
		val = old;
	}
	futex(faddr, FUTEX_UNLOCK_SHARED, ...);
}

The read lock/unlock code is a bit more complicated than the
corresponding write lock/unlock code. This is by design, to minimize
any additional overhead for mutex lock and unlock. As a result, the
TP futex rwlock favors writers slightly over readers.
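
For comparison, a minimal writer-side sketch in the same style could
look as follows. The writer fast paths belong to earlier patches in
this series, so this is only an approximation: tid is assumed to be
the caller's thread ID, cmpxchg() the same atomic compare-and-exchange
primitive used in the reader samples, and passing 0/NULL for the
unused futex(2) arguments is an assumption as well.

void write_lock(int *faddr, int tid)
{
	if (cmpxchg(faddr, 0, tid) == 0)
		return;		/* Uncontended: lock acquired */
	while (futex(faddr, FUTEX_LOCK, 0, NULL) < 0)
		;
}

void write_unlock(int *faddr, int tid)
{
	if (cmpxchg(faddr, tid, 0) == tid)
		return;		/* No waiters, no flag bits set */
	/* Waiters (or other flag bits) present: let the kernel hand off */
	futex(faddr, FUTEX_UNLOCK, 0, NULL);
}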

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
include/uapi/linux/futex.h | 28 +++++-
kernel/futex.c | 227 ++++++++++++++++++++++++++++++++++++++-------
2 files changed, 219 insertions(+), 36 deletions(-)

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 7f676ac..1a22ecc 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,8 @@
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_LOCK 13
#define FUTEX_UNLOCK 14
+#define FUTEX_LOCK_SHARED 15
+#define FUTEX_UNLOCK_SHARED 16

#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
@@ -43,6 +45,9 @@
FUTEX_PRIVATE_FLAG)
#define FUTEX_LOCK_PRIVATE (FUTEX_LOCK | FUTEX_PRIVATE_FLAG)
#define FUTEX_UNLOCK_PRIVATE (FUTEX_UNLOCK | FUTEX_PRIVATE_FLAG)
+#define FUTEX_LOCK_SHARED_PRIVATE (FUTEX_LOCK_SHARED | FUTEX_PRIVATE_FLAG)
+#define FUTEX_UNLOCK_SHARED_PRIVATE (FUTEX_UNLOCK_SHARED | \
+ FUTEX_PRIVATE_FLAG)

/*
* Support for robust futexes: the kernel cleans up held futexes at
@@ -111,9 +116,28 @@ struct robust_list_head {
#define FUTEX_OWNER_DIED 0x40000000

/*
- * The rest of the robust-futex field is for the TID:
+ * Shared locking bit (for readers).
+ * Robust futexes cannot be used in combination with shared locking.
+ *
+ * The FUTEX_SHARED_UNLOCK bit can be set by userspace to indicate that
+ * a shared unlock is in progress and so no shared locking is allowed
+ * within the kernel.
+ */
+#define FUTEX_SHARED 0x20000000
+#define FUTEX_SHARED_UNLOCK 0x10000000
+
+/*
+ * The rest of the bits are for the TID or, in the case of readers, the
+ * shared locking bit and the number of active readers present. IOW, the
+ * max supported TID is actually 0x1fffffff. Only 24 bits are used for
+ * tracking the number of shared futex owners. Bits 24-27 may be used by
+ * userspace for internal management purposes. However, a successful
+ * FUTEX_UNLOCK_SHARED futex(2) syscall will clear or alter those bits.
*/
-#define FUTEX_TID_MASK 0x3fffffff
+#define FUTEX_TID_MASK 0x1fffffff
+#define FUTEX_SCNT_MASK 0x00ffffff /* Shared futex owner count */
+#define FUTEX_SHARED_TID_MASK 0x3fffffff /* FUTEX_SHARED bit + TID */
+#define FUTEX_SHARED_SCNT_MASK 0x20ffffff /* FUTEX_SHARED bit + count */

/*
* This limit protects against a deliberately circular list.
diff --git a/kernel/futex.c b/kernel/futex.c
index 429d85f..ee35264 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3176,7 +3176,7 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
if (get_user(uval, uaddr))
return -1;

- if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+ if ((uval & FUTEX_SHARED_TID_MASK) == task_pid_vnr(curr)) {
/*
* Ok, this dying thread is truly holding a futex
* of interest. Set the OWNER_DIED bit atomically
@@ -3355,17 +3355,30 @@ void exit_robust_list(struct task_struct *curr)
* Checks are also made in the futex_spin_on_owner() loop for dead task
* structure or invalid pid. In both cases, the top waiter will take over
* the ownership of the futex.
+ *
+ * TP futexes can also be used as shared locks. However, TP futexes in
+ * shared locking mode cannot be combined with robust futexes. Userspace
+ * reader/writer locks can now be implemented with TP futexes.
*/

/*
- * Timeout value for enabling lock handoff and prevent lock starvation.
+ * Timeout value for enabling lock handoff and preventing lock starvation.
*
* Currently, lock handoff will be enabled if the top waiter can't get the
* futex after about 5ms. To reduce time checking overhead, it is only done
* after every 128 spins or right after wakeup. As a result, the actual
* elapsed time will be a bit longer than the specified value.
+ *
+ * For reader-owned futex spinning, the timeout is 50us. The timeout value
+ * will be reset whenever activity is detected in the futex value, i.e.
+ * any change in the reader count. However, reader spinning will never
+ * exceed the handoff timeout plus an additional reader spinning timeout.
*/
#define TP_HANDOFF_TIMEOUT 5000000 /* 5ms */
+#define TP_RSPIN_TIMEOUT 50000 /* 50us */
+
+#define TP_FUTEX_SHARED_UNLOCK_READY(uval) \
+ (((uval) & FUTEX_SHARED_SCNT_MASK) == FUTEX_SHARED)

/*
* The futex_lock() function returns the internal status of the TP futex.
@@ -3383,7 +3396,7 @@ void exit_robust_list(struct task_struct *curr)
#define TP_LOCK_HANDOFF 2
#define TP_STATUS_SLEEP(val, sleep) ((val)|((sleep) << 16))

- /**
+/**
* lookup_futex_state - Looking up the futex state structure.
* @hb: hash bucket
* @key: futex key
@@ -3457,7 +3470,8 @@ static inline int put_futex_state_unlocked(struct futex_state *state)
* @waiter - true for top waiter, false otherwise
*
* The HB fs_lock should NOT be held while calling this function.
- * The flag bits are ignored in the trylock.
+ * For writers, the flag bits are ignored in the trylock. The workflow is
+ * as follows:
*
* If waiter is true
* then
@@ -3467,35 +3481,81 @@ static inline int put_futex_state_unlocked(struct futex_state *state)
* preserve the flag bits
* endif
*
+ * For readers (vpid == FUTEX_SHARED), the workflow is a bit different as
+ * shown in the code below.
+ *
* Return: TP_LOCK_ACQUIRED if lock acquired;
* TP_LOCK_HANDOFF if lock was handed off;
* 0 if lock acquisition failed;
* -EFAULT if an error happened.
* *puval will contain the latest futex value when trylock fails.
*/
-static inline int futex_trylock(u32 __user *uaddr, const u32 vpid, u32 *puval,
- const bool waiter)
+static int futex_trylock(u32 __user *uaddr, const u32 vpid, u32 *puval,
+ const bool waiter)
{
u32 uval, flags = 0;

if (unlikely(get_futex_value(puval, uaddr)))
return -EFAULT;

+ if (vpid == FUTEX_SHARED)
+ goto trylock_shared;
+
uval = *puval;

- if (waiter && (uval & FUTEX_TID_MASK) == vpid)
+ if (waiter && (uval & FUTEX_SHARED_TID_MASK) == vpid)
return TP_LOCK_HANDOFF;

- if (uval & FUTEX_TID_MASK)
+ if (uval & FUTEX_SHARED_TID_MASK)
return 0; /* Trylock fails */

if (!waiter)
- flags |= (uval & ~FUTEX_TID_MASK);
+ flags |= (uval & ~FUTEX_SHARED_TID_MASK);

if (unlikely(cmpxchg_futex_value(puval, uaddr, uval, vpid|flags)))
return -EFAULT;

return (*puval == uval) ? TP_LOCK_ACQUIRED : 0;
+
+trylock_shared:
+ for (;;) {
+ u32 new;
+
+ uval = *puval;
+
+ /*
+ * When it is not the top waiter, fail trylock attempt whenever
+ * any of the flag bits is set.
+ */
+ if (!waiter && (uval & ~FUTEX_SHARED_TID_MASK))
+ return 0;
+
+ /*
+ * 1. Increment the reader counts if FUTEX_SHARED bit is set
+ * but not the FUTEX_SHARED_UNLOCK bit; or
+ * 2. Change 0 => FUTEX_SHARED + 1.
+ */
+ if ((uval & (FUTEX_SHARED|FUTEX_SHARED_UNLOCK)) == FUTEX_SHARED)
+ new = uval + 1;
+ else if (!(uval & FUTEX_SHARED_TID_MASK))
+ new = uval | (FUTEX_SHARED + 1);
+ else
+ return 0;
+
+ if (unlikely(cmpxchg_futex_value(puval, uaddr, uval, new)))
+ return -EFAULT;
+
+ if (*puval == uval)
+ break;
+ }
+ /*
+ * If the original futex value is FUTEX_SHARED, it is assumed to be
+ * a handoff even though it can also be a reader-owned futex whose
+ * reader count has been decremented to 0.
+ */
+ return ((uval & FUTEX_SHARED_TID_MASK) == FUTEX_SHARED)
+ ? TP_LOCK_HANDOFF : TP_LOCK_ACQUIRED;
+
}

/**
@@ -3581,24 +3641,33 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
#define OWNER_DEAD_MESSAGE \
"futex: owner pid %d of TP futex 0x%lx was %s.\n" \
"\tLock is now acquired by pid %d!\n"
+#define OWNER_DEAD_MESSAGE_BY_READER \
+ "futex: owner pid %d of TP futex 0x%lx was %s.\n" \
+ "\tLock is now acquired by a reader!\n"

int ret, loopcnt = 1;
int nsleep = 0;
+ int reader_value = 0;
bool handoff_set = false;
u32 uval;
u32 owner_pid = 0;
struct task_struct *owner_task = NULL;
u64 handoff_time = sched_clock() + TP_HANDOFF_TIMEOUT;
+ u64 rspin_timeout;

preempt_disable();
WRITE_ONCE(state->mutex_owner, current);
retry:
for (;;) {
+ u32 new_pid;
+
ret = futex_trylock_preempt_disabled(uaddr, vpid, &uval);
if (ret)
break;

- if ((uval & FUTEX_TID_MASK) != owner_pid) {
+ new_pid = (uval & FUTEX_SHARED) ? FUTEX_SHARED
+ : (uval & FUTEX_TID_MASK);
+ if (new_pid != owner_pid) {
if (owner_task) {
/*
* task_pi_list_del() should always be
@@ -3613,13 +3682,25 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
put_task_struct(owner_task);
}

- owner_pid = uval & FUTEX_TID_MASK;
- owner_task = futex_find_get_task(owner_pid);
+ /*
+ * We need to reset the loop iteration count and
+ * reader value for each writer=>reader ownership
+ * transition so that we will have the right timeout
+ * value for reader spinning.
+ */
+ if ((owner_pid != FUTEX_SHARED) &&
+ (new_pid == FUTEX_SHARED))
+ loopcnt = reader_value = 0;
+
+ owner_pid = new_pid;
+ owner_task = (owner_pid == FUTEX_SHARED)
+ ? NULL : futex_find_get_task(owner_pid);
}

- if (unlikely(!owner_task ||
+ if (unlikely((owner_pid != FUTEX_SHARED) &&
+ (!owner_task ||
(owner_task->flags & PF_EXITING) ||
- (uval & FUTEX_OWNER_DIED))) {
+ (uval & FUTEX_OWNER_DIED)))) {
/*
* PID invalid or exiting/dead task, we can directly
* grab the lock now.
@@ -3635,8 +3716,12 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
continue;
owner_state = (owner_task || (uval & FUTEX_OWNER_DIED))
? "dead" : "invalid";
- pr_info(OWNER_DEAD_MESSAGE, owner_pid,
- (long)uaddr, owner_state, vpid);
+ if (vpid & FUTEX_SHARED)
+ pr_info(OWNER_DEAD_MESSAGE_BY_READER,
+ owner_pid, (long)uaddr, owner_state);
+ else
+ pr_info(OWNER_DEAD_MESSAGE, owner_pid,
+ (long)uaddr, owner_state, vpid);
break;
}

@@ -3664,19 +3749,75 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
* value. We also need to set the FUTEX_WAITERS bit to make
* sure that futex lock holder will initiate the handoff at
* unlock time.
+ *
+ * For readers, the handoff is done by setting just the
+ * FUTEX_SHARED bit. The top waiter still needs to increment
+ * the reader count to get the lock.
*/
- if (!handoff_set && !(loopcnt++ & 0x7f)) {
- if (sched_clock() > handoff_time) {
+ if ((!handoff_set || (reader_value >= 0)) &&
+ !(loopcnt++ & 0x7f)) {
+ u64 curtime = sched_clock();
+
+ if (!owner_task && (reader_value >= 0)) {
+ int old_count = reader_value;
+
+ /*
+ * Reset timeout value if the old reader
+ * count is 0 or the reader value changes and
+ * handoff time hasn't been reached. Otherwise,
+ * disable reader spinning if the handoff time
+ * is reached.
+ */
+ reader_value = (uval & FUTEX_TID_MASK);
+ if (!old_count || (!handoff_set
+ && (reader_value != old_count)))
+ rspin_timeout = curtime +
+ TP_RSPIN_TIMEOUT;
+ else if (curtime > rspin_timeout)
+ reader_value = -1;
+ }
+ if (!handoff_set && (curtime > handoff_time)) {
WARN_ON(READ_ONCE(state->handoff_pid));
- ret = futex_set_waiters_bit(uaddr, &uval);
- if (ret)
- break;
WRITE_ONCE(state->handoff_pid, vpid);
+
+ /*
+ * Disable handoff check.
+ */
handoff_set = true;
}
}

- if (owner_task->on_cpu) {
+ if (handoff_set && !(uval & FUTEX_WAITERS)) {
+ /*
+ * This is reached either as a fall through after
+ * setting handoff_set or due to race in reader unlock
+ * where the task is woken up without the proper
+ * handoff. So we need to set the FUTEX_WAITERS bit
+ * as well as ensuring that the handoff_pid is
+ * properly set.
+ */
+ ret = futex_set_waiters_bit(uaddr, &uval);
+ if (ret)
+ break;
+ if (!READ_ONCE(state->handoff_pid))
+ WRITE_ONCE(state->handoff_pid, vpid);
+ }
+
+ if (!owner_task) {
+ /*
+ * For reader-owned futex, we optimistically spin
+ * on the lock until reader spinning is disabled.
+ *
+ * Ideally, !owner_task <=> (uval & FUTEX_SHARED).
+ * It is possible that there may be a mismatch here,
+ * but it should be corrected in the next iteration
+ * of the loop.
+ */
+ if (reader_value >= 0) {
+ cpu_relax();
+ continue;
+ }
+ } else if (owner_task->on_cpu) {
cpu_relax();
continue;
}
@@ -3758,6 +3899,8 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
* to see if it is running. It will also spin on the futex word so as to grab
* the lock as soon as it is free.
*
+ * For readers, TID = FUTEX_SHARED.
+ *
* This function is not inlined so that it can show up separately in perf
* profile for performance analysis purpose.
*
@@ -3769,14 +3912,14 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
*
* Return: TP status code if lock acquired, < 0 if an error happens.
*/
-static noinline int
-futex_lock(u32 __user *uaddr, unsigned int flags, ktime_t *time)
+static noinline int futex_lock(u32 __user *uaddr, unsigned int flags,
+ ktime_t *time, const bool shared)
{
struct hrtimer_sleeper timeout, *to;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
struct futex_state *state;
- u32 uval, vpid = task_pid_vnr(current);
+ u32 uval, vpid = shared ? FUTEX_SHARED : task_pid_vnr(current);
int ret;

/*
@@ -3789,7 +3932,7 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
/*
* Detect deadlocks.
*/
- if (unlikely(((uval & FUTEX_TID_MASK) == vpid) ||
+ if (unlikely((!shared && ((uval & FUTEX_TID_MASK) == vpid)) ||
should_fail_futex(true)))
return -EDEADLK;

@@ -3878,12 +4021,18 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
* This is the in-kernel slowpath: we look up the futex state (if any),
* and wakeup the mutex owner.
*
+ * For shared unlock, userspace code must make sure that there is no
+ * racing and only one task will go into the kernel to do the unlock.
+ * The reader count shouldn't be incremented from 0 while the unlock is
+ * in progress.
+ *
* Return: 1 if a wakeup is attempt, 0 if no task to wake,
* or < 0 when an error happens.
*/
-static int futex_unlock(u32 __user *uaddr, unsigned int flags)
+static int futex_unlock(u32 __user *uaddr, unsigned int flags,
+ const bool shared)
{
- u32 uval, vpid = task_pid_vnr(current);
+ u32 uval, vpid = shared ? FUTEX_SHARED : task_pid_vnr(current);
u32 newpid = 0;
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
@@ -3895,8 +4044,12 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
if (get_futex_value(&uval, uaddr))
return -EFAULT;

- if ((uval & FUTEX_TID_MASK) != vpid)
+ if (uval & FUTEX_SHARED) {
+ if (!shared || !TP_FUTEX_SHARED_UNLOCK_READY(uval))
+ return -EPERM;
+ } else if (shared || ((uval & FUTEX_TID_MASK) != vpid)) {
return -EPERM;
+ }

if (!(uval & FUTEX_WAITERS))
return -EINVAL;
@@ -3925,7 +4078,6 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
owner = READ_ONCE(state->mutex_owner);
if (owner)
wake_q_add(&wake_q, owner);
-
spin_unlock(&hb->fs_lock);

/*
@@ -3944,7 +4096,8 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
/*
* The non-flag value of the futex shouldn't change
*/
- if ((uval & FUTEX_TID_MASK) != vpid) {
+ if ((uval & (shared ? FUTEX_SHARED_SCNT_MASK : FUTEX_TID_MASK))
+ != vpid) {
ret = -EINVAL;
break;
}
@@ -3957,7 +4110,7 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
* No error would have happened if owner defined.
*/
wake_up_q(&wake_q);
- return 1;
+ return ret ? ret : 1;
}

return ret;
@@ -3989,6 +4142,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
#ifdef CONFIG_SMP
case FUTEX_LOCK:
case FUTEX_UNLOCK:
+ case FUTEX_LOCK_SHARED:
+ case FUTEX_UNLOCK_SHARED:
#endif
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -4023,9 +4178,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
#ifdef CONFIG_SMP
case FUTEX_LOCK:
- return futex_lock(uaddr, flags, timeout);
+ case FUTEX_LOCK_SHARED:
+ return futex_lock(uaddr, flags, timeout,
+ (cmd == FUTEX_LOCK) ? false : true);
case FUTEX_UNLOCK:
- return futex_unlock(uaddr, flags);
+ case FUTEX_UNLOCK_SHARED:
+ return futex_unlock(uaddr, flags,
+ (cmd == FUTEX_UNLOCK) ? false : true);
#endif
}
return -ENOSYS;
--
1.8.3.1