Re: [PATCH 04/10] locking: export osq_lock()/osq_unlock()

From: Kent Overstreet
Date: Fri May 18 2018 - 07:44:24 EST


On Fri, May 18, 2018 at 01:40:54PM +0200, Peter Zijlstra wrote:
> On Fri, May 18, 2018 at 07:32:05AM -0400, Kent Overstreet wrote:
> > It does strike me that the whole optimistic spin algorithm
> > (mutex_optimistic_spin() and rwsem_optimistic_spin()) are ripe for factoring
> > out. They've been growing more optimizations I see, and the optimizations mostly
> > aren't specific to either locks.
>
> There's a few unfortunate differences between the two; but yes it would
> be good if we could reduce some of the duplication found there.
>
> One of the distinct differences is that mutex (now) has the lock state
> and owner in a single atomic word, while rwsem still tracks the owner in
> a separate word and thus needs to account for 'inconsistent' owner
> state.
>
> And then there's warts such as ww_mutex and rwsem_owner_is_reader and
> similar.

The rwsem code seems workable, osq_optimistic_spin() takes callback for trylock
and get_owner - I just added a OWNER_NO_SPIN value that get_owner() can return.

The mutex code though... wtf...


commit 5c7defe81ee722f5087cbeda9be6e7ee715515d7
Author: Kent Overstreet <kent.overstreet@xxxxxxxxx>
Date: Fri May 18 08:35:55 2018 -0400

Factor out osq_optimistic_spin()

diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index afa59a476a..dbaf52abef 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -1,5 +1,6 @@

#include <linux/log2.h>
+#include <linux/osq_optimistic_spin.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
@@ -146,127 +147,26 @@ struct six_lock_waiter {
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))

-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-
-static inline int six_can_spin_on_owner(struct six_lock *lock)
-{
- struct task_struct *owner;
- int retval = 1;
-
- if (need_resched())
- return 0;
-
- rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
- /*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
- */
- return retval;
-}
-
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner)
+static inline struct task_struct *six_osq_get_owner(struct optimistic_spin_queue *osq)
{
- bool ret = true;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner->on_cpu || need_resched()) {
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
+ struct six_lock *lock = container_of(osq, struct six_lock, osq);

- return ret;
+ return lock->owner;
}

-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_osq_trylock_read(struct optimistic_spin_queue *osq)
{
- struct task_struct *task = current;
-
- if (type == SIX_LOCK_write)
- return false;
-
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
-
- if (!osq_lock(&lock->osq))
- goto fail;
-
- while (1) {
- struct task_struct *owner;
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
- break;
+ struct six_lock *lock = container_of(osq, struct six_lock, osq);

- if (do_six_trylock_type(lock, type)) {
- osq_unlock(&lock->osq);
- preempt_enable();
- return true;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
-
- osq_unlock(&lock->osq);
-fail:
- preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
- return false;
+ return do_six_trylock_type(lock, SIX_LOCK_read);
}

-#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_osq_trylock_intent(struct optimistic_spin_queue *osq)
{
- return false;
-}
+ struct six_lock *lock = container_of(osq, struct six_lock, osq);

-#endif
+ return do_six_trylock_type(lock, SIX_LOCK_intent);
+}

noinline
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
@@ -276,8 +176,20 @@ static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type t
struct six_lock_waiter wait;
u64 v;

- if (six_optimistic_spin(lock, type))
- return;
+ switch (type) {
+ case SIX_LOCK_read:
+ if (osq_optimistic_spin(&lock->osq, six_osq_get_owner,
+ six_osq_trylock_read))
+ return;
+ break;
+ case SIX_LOCK_intent:
+ if (osq_optimistic_spin(&lock->osq, six_osq_get_owner,
+ six_osq_trylock_intent))
+ return;
+ break;
+ case SIX_LOCK_write:
+ break;
+ }

lock_contended(&lock->dep_map, _RET_IP_);

diff --git a/include/linux/osq_optimistic_spin.h b/include/linux/osq_optimistic_spin.h
new file mode 100644
index 0000000000..1f5fd1f85b
--- /dev/null
+++ b/include/linux/osq_optimistic_spin.h
@@ -0,0 +1,155 @@
+#ifndef __LINUX_OSQ_OPTIMISTIC_SPIN_H
+#define __LINUX_OSQ_OPTIMISTIC_SPIN_H
+
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+
+typedef struct task_struct *(*osq_get_owner_fn)(struct optimistic_spin_queue *osq);
+typedef bool (*osq_trylock_fn)(struct optimistic_spin_queue *osq);
+
+#define OWNER_NO_SPIN ((struct task_struct *) 1UL)
+
+static inline bool osq_can_spin_on_owner(struct optimistic_spin_queue *lock,
+ osq_get_owner_fn get_owner)
+{
+ struct task_struct *owner;
+ bool ret;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = get_owner(lock);
+ /*
+ * if lock->owner is not set, the lock owner may have just acquired
+ * it and not set the owner yet, or it may have just been unlocked
+ */
+ if (!owner)
+ ret = true;
+ else if (owner == OWNER_NO_SPIN)
+ ret = false;
+ else
+ ret = owner->on_cpu != 0;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool osq_spin_on_owner(struct optimistic_spin_queue *lock,
+ struct task_struct *owner,
+ osq_get_owner_fn get_owner)
+{
+ if (!owner)
+ return true;
+
+ if (owner == OWNER_NO_SPIN)
+ return false;
+
+ while (1) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ if (get_owner(lock) != owner)
+ return true;
+
+ if (!owner->on_cpu || need_resched() ||
+ vcpu_is_preempted(task_cpu(owner)))
+ return false;
+
+ cpu_relax();
+ }
+}
+
+static inline bool osq_optimistic_spin(struct optimistic_spin_queue *lock,
+ osq_get_owner_fn get_owner,
+ osq_trylock_fn trylock)
+{
+ struct task_struct *task = current;
+
+ preempt_disable();
+ if (!osq_can_spin_on_owner(lock, get_owner))
+ goto fail;
+
+ if (!osq_lock(lock))
+ goto fail;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ rcu_read_lock();
+ owner = get_owner(lock);
+ if (!osq_spin_on_owner(lock, owner, get_owner)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+
+ if (trylock(lock)) {
+ osq_unlock(lock);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+
+ osq_unlock(lock);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+static inline bool osq_has_spinner(struct optimistic_spin_queue *lock)
+{
+ return osq_is_locked(lock);
+}
+
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
+
+static inline bool osq_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ return false;
+}
+
+static inline bool osq_has_spinner(struct optimistic_spin_queue *lock)
+{
+ return false;
+}
+
+#endif
+
+#endif /* __LINUX_OSQ_OPTIMISTIC_SPIN_H */
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f36..a25ee6668f 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -13,6 +13,7 @@
#include <linux/rwsem.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/osq_optimistic_spin.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
@@ -325,11 +326,21 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
}

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+
+static inline struct task_struct *rwsem_osq_get_owner(struct optimistic_spin_queue *osq)
+{
+ struct rw_semaphore *sem = container_of(osq, struct rw_semaphore, osq);
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ return rwsem_owner_is_reader(owner) ? OWNER_NO_SPIN : owner;
+}
+
/*
* Try to acquire write lock before the writer has been put on wait queue.
*/
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+static inline bool rwsem_osq_trylock(struct optimistic_spin_queue *osq)
{
+ struct rw_semaphore *sem = container_of(osq, struct rw_semaphore, osq);
long old, count = atomic_long_read(&sem->count);

while (true) {
@@ -347,125 +358,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
}
}

-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool ret = true;
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
- goto done;
- }
-
- /*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
- */
- ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
- rcu_read_unlock();
- return ret;
-}
-
-/*
- * Return true only if we can still spin on the owner field of the rwsem.
- */
-static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner = READ_ONCE(sem->owner);
-
- if (!rwsem_owner_is_writer(owner))
- goto out;
-
- rcu_read_lock();
- while (sem->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking sem->owner still matches owner, if that fails,
- * owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- /*
- * abort spinning when need_resched or owner is not running or
- * owner's cpu is preempted.
- */
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
- rcu_read_unlock();
- return false;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-out:
- /*
- * If there is a new owner or the owner is not set, we continue
- * spinning.
- */
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
-}
-
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
- bool taken = false;
-
- preempt_disable();
-
- /* sem->wait_lock should not be held when doing optimistic spinning */
- if (!rwsem_can_spin_on_owner(sem))
- goto done;
-
- if (!osq_lock(&sem->osq))
- goto done;
-
- /*
- * Optimistically spin on the owner field and attempt to acquire the
- * lock whenever the owner changes. Spinning will be stopped when:
- * 1) the owning writer isn't running; or
- * 2) readers own the lock as we can't determine if they are
- * actively running or not.
- */
- while (rwsem_spin_on_owner(sem)) {
- /*
- * Try to acquire the lock
- */
- if (rwsem_try_write_lock_unqueued(sem)) {
- taken = true;
- break;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!sem->owner && (need_resched() || rt_task(current)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
- osq_unlock(&sem->osq);
-done:
- preempt_enable();
- return taken;
+ return osq_optimistic_spin(&sem->osq, rwsem_osq_get_owner,
+ rwsem_osq_trylock);
}

/*