Re: [RFC][PATCH 3/3] locking/qspinlock: Optimize for x86

From: Peter Zijlstra
Date: Tue Oct 02 2018 - 10:14:33 EST


On Tue, Oct 02, 2018 at 02:19:53PM +0100, Will Deacon wrote:
> On Mon, Oct 01, 2018 at 10:00:28PM +0200, Peter Zijlstra wrote:

> > Let me draw a picture of that..
> >
> >
> > CPU0 CPU1 CPU2 CPU3
> >
> > 0) lock
> > trylock -> (0,0,1)
> > 1)lock
> > trylock /* fail */
> >
> > 2) lock
> > trylock /* fail */
> > tas-pending -> (0,1,1)
> > wait-locked
> >
> > 3) lock
> > trylock /* fail */
> > tas-pending /* fail */
> >
> > 4) unlock -> (0,1,0)
> > clr_pnd_set_lck -> (0,0,1)
> > unlock -> (0,0,0)
> >
> > 5) tas-pending -> (0,1,0)
> > read-val -> (0,1,0)
> > 6) clr_pnd_set_lck -> (0,0,1)
> > 7) xchg_tail -> (n,0,1)
> > load_acquire <- (n,0,0) (from-4)
> > 8) cmpxchg /* fail */
> > set_locked()
> >
> > > Is there something I'm missing that means this can't happen? I suppose
> > > cacheline granularity ends up giving serialisation between (4) and (7),
> > > but I'd *much* prefer not to rely on that because it feels horribly
> > > fragile.
> >
> > Well, on x86 atomics are fully ordered, so the xchg_tail() does in
> > fact have an smp_mb() in it, and that should order things sufficiently
> > for that not to happen, I think.
>
> Hmm, does that actually help, though? I still think you're relying on the
> cache-coherence protocol to serialise the xchg() on pending before the
> xchg_tail(), which I think is fragile because they don't actually overlap.

Maybe; I suspect TSO makes it work, but see the alternative below.

---
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -6,9 +6,29 @@
#include <asm/cpufeature.h>
#include <asm-generic/qspinlock_types.h>
#include <asm/paravirt.h>
+#include <asm/rmwcc.h>

#define _Q_PENDING_LOOPS (1 << 9)

+static __always_inline bool __test_and_set_pending(struct qspinlock *lock)
+{ /*
+ * Apply a LOCK_PREFIX "btsl" to bit _Q_PENDING_OFFSET of lock->val.counter.
+ *
+ * There is no explicit return statement here: the result is expected to be
+ * produced by GEN_BINARY_RMWcc from condition code "c" (carry).
+ * NOTE(review): btsl leaves the previous bit value in CF, so this presumably
+ * returns true when pending was already set -- confirm against <asm/rmwcc.h>.
+ */
+ GEN_BINARY_RMWcc(LOCK_PREFIX "btsl",
+ lock->val.counter, "Ir", _Q_PENDING_OFFSET, "%0", c);
+}
+
+#define queued_set_pending_fetch_acquire queued_set_pending_fetch_acquire /* self-define so "#ifndef queued_set_pending_fetch_acquire" sees this override */
+static inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{ /*
+ * x86 version of "set pending and fetch the lock value".
+ *
+ * @lock: Pointer to queued spinlock structure
+ * Return: _Q_PENDING_VAL if __test_and_set_pending() returned true (else 0),
+ * OR-ed with the non-pending bits of lock->val read afterwards.
+ */
+ u32 val = 0;
+
+ if (__test_and_set_pending(lock))
+ val |= _Q_PENDING_VAL;
+ /*
+ * The remaining bits come from a separate, later atomic_read(); the
+ * returned value is therefore assembled from two accesses and is not a
+ * single atomic snapshot of the lock word.
+ */
+ val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+ return val;
+}
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_init_lock_hash(void);
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -232,6 +232,20 @@ static __always_inline u32 xchg_tail(str
#endif /* _Q_PENDING_BITS == 8 */

/**
+ * queued_set_pending_fetch_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ *
+ * Generic fallback: a single atomic_fetch_or_acquire() of _Q_PENDING_VAL on
+ * lock->val. It is compiled only when no macro named
+ * queued_set_pending_fetch_acquire is already defined, so an architecture
+ * header can supply its own version by defining that name first.
+ */
+#ifndef queued_set_pending_fetch_acquire
+static __always_inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
* set_locked - Set the lock bit and own the lock
* @lock: Pointer to queued spinlock structure
*
@@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qs
*
* 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
*/
- val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+ val = queued_set_pending_fetch_acquire(lock);

/*
* If we observe contention, there is a concurrent locker.