[PATCH tip/locking/core v9 5/6] locking/pvqspinlock: Allow 1 lock stealing attempt
From: Waiman Long
Date: Fri Oct 30 2015 - 19:28:00 EST
This patch allows a lock waiter one attempt to steal the lock when
entering the PV slowpath. To prevent lock starvation, the queue head
vCPU sets the pending bit while it is in its active lock spinning
loop, which disables any lock stealing attempt. This reduces the
performance penalty caused by lock waiter preemption while avoiding
most of the downsides of a real unfair lock.
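As a rough illustration of the stealing gate, the sketch below is a
minimal userspace model using C11 atomics rather than the kernel's
cmpxchg()/atomic helpers. The names LOCKED_MASK, PENDING_MASK and
trylock_unfair() are made up for this sketch, and the model collapses
the whole lock word into a single atomic_uint with no tail encoding:

#include <stdatomic.h>
#include <stdbool.h>

#define LOCKED_MASK	0x01u	/* lock is held */
#define PENDING_MASK	0x02u	/* queue head is actively spinning */

/* One stealing attempt: fail if the lock is held or stealing is disabled. */
static bool trylock_unfair(atomic_uint *val)
{
	unsigned int old = 0;

	if (atomic_load_explicit(val, memory_order_relaxed) &
	    (LOCKED_MASK | PENDING_MASK))
		return false;
	return atomic_compare_exchange_strong(val, &old, LOCKED_MASK);
}

As in the patch, a failed attempt simply falls through to the normal
queuing path, so at most one steal is tried per slowpath entry.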
The pv_wait_head() function is renamed to pv_wait_head_lock() because
it is modified to acquire the lock before returning. This is necessary
to cope with lock stealing attempts from other tasks.
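Continuing the same illustrative model, the queue-head side would look
roughly like the sketch below: disable stealing, spin for a bounded
number of iterations trying to take the lock with a compare-and-swap,
and re-enable stealing before giving up. SPIN_BUDGET and
wait_head_lock_model() are made-up names; the real pv_wait_head_lock()
in the patch additionally hashes the lock and halts the vCPU in
pv_wait() once its spin threshold is exhausted:

#define SPIN_BUDGET	512	/* stand-in for SPIN_THRESHOLD */

static unsigned int wait_head_lock_model(atomic_uint *val)
{
	for (;;) {
		/* Disable lock stealing while actively spinning. */
		atomic_fetch_or(val, PENDING_MASK);

		for (int loop = SPIN_BUDGET; loop; loop--) {
			unsigned int old = PENDING_MASK;

			/* Pending set and lock free: take the lock. */
			if (atomic_compare_exchange_strong(val, &old,
					PENDING_MASK | LOCKED_MASK)) {
				atomic_fetch_and(val, ~PENDING_MASK);
				return atomic_load(val);
			}
		}

		/* Re-enable stealing; the kernel would pv_wait() here. */
		atomic_fetch_and(val, ~PENDING_MASK);
	}
}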
Linux kernel builds were run in a KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems were configured with 32 physical
CPUs. The kernel build times before and after the patch were:
                          Westmere                   Haswell
  Patch              32 vCPUs   48 vCPUs      32 vCPUs   48 vCPUs
  -----              --------   --------      --------   --------
  Before patch        3m15.6s   10m56.1s       1m44.1s    5m29.1s
  After patch         3m02.3s    5m00.2s       1m43.7s    3m03.5s
For the overcommitted case (48 vCPUs), this patch reduces the kernel
build time by more than 54% for Westmere (10m56.1s -> 5m00.2s) and 44%
for Haswell (5m29.1s -> 3m03.5s).
Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx>
---
kernel/locking/qspinlock.c | 52 ++++++++++-------
kernel/locking/qspinlock_paravirt.h | 111 ++++++++++++++++++++++++++++-------
kernel/locking/qspinlock_stat.h | 16 +++++
3 files changed, 136 insertions(+), 43 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index c1c8a1a..39c6a43 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_lock __pv_wait_head_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -431,35 +432,44 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_lock function, if active, will acquire the lock
+ * and return a non-zero value. So we have to skip the
+ * smp_load_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the redundant set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe. The cost of the
+ * redundant set_locked() call below should be negligible, too.
+ *
+ * If PV isn't active, 0 will be returned instead.
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ val = pv_wait_head_lock(lock, node);
+ if (!val) {
+ while ((val = smp_load_acquire(&lock->val.counter))
+ & _Q_LOCKED_PENDING_MASK)
+ cpu_relax();
+ /*
+ * Claim the lock now:
+ *
+ * 0,0 -> 0,1
+ */
+ set_locked(lock);
+ val |= _Q_LOCKED_VAL;
+ }
/*
* If the next pointer is defined, we are not tail anymore.
- * In this case, claim the spinlock & release the MCS lock.
*/
- if (next) {
- set_locked(lock);
+ if (next)
goto mcs_unlock;
- }
/*
- * claim the lock:
- *
- * n,0,0 -> 0,0,1 : lock, uncontended
- * *,0,0 -> *,0,1 : lock, contended
- *
* If the queue head is the only one in the queue (lock value == tail),
- * clear the tail code and grab the lock. Otherwise, we only need
- * to grab the lock.
+ * we have to clear the tail code.
*/
for (;;) {
- if (val != tail) {
- set_locked(lock);
+ if ((val & _Q_TAIL_MASK) != tail)
break;
- }
+
/*
* The smp_load_acquire() call above has provided the necessary
* acquire semantics required for locking. At most two
@@ -502,7 +512,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index aaeeefb..24ccc9f 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -41,6 +41,56 @@ struct pv_node {
};
/*
+ * Allow one unfair trylock when entering the PV slowpath, provided the
+ * pending bit isn't set, to reduce the performance impact of lock waiter
+ * preemption.
+ *
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the slowpath, before
+ * being queued.
+ *
+ * A little bit of unfairness here can improve performance without many
+ * of the downsides of a real unfair lock.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_trylock_unfair(l)
+static inline bool pv_queued_spin_trylock_unfair(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(&lock->val, _Q_PENDING_MASK);
+}
+
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(&lock->val, _Q_PENDING_MASK);
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
* Include queued spinlock statistics code
*/
#include "qspinlock_stat.h"
@@ -202,8 +252,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* If pv_kick_node() changed us to vcpu_hashed, retain that
- * value so that pv_wait_head() knows to not also try to hash
- * this lock.
+ * value so that pv_wait_head_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -227,8 +277,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@@ -257,10 +308,13 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32 pv_wait_head_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
@@ -276,11 +330,24 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
lp = (struct qspinlock **)1;
for (;; waitcnt++) {
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing. However, the pending bit check in
+ * pv_queued_spin_trylock_unfair() and the setting/clearing
+ * of the pending bit here aren't memory barriers. So a
+ * cmpxchg() is used to acquire the lock to be sure.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (!READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ clear_pending(lock);
+ goto gotlock;
+ }
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@@ -296,36 +363,36 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
- if (!READ_ONCE(l->locked))
- return;
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
- qstat_inc(qstat_pv_spurious_wakeup, true);
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking.
*/
+gotlock:
+ return (u32)atomic_read(&lock->val);
}
/*
@@ -350,7 +417,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 16b84b2..1d87065 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,6 +22,7 @@
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
* pv_wait_head - # of vCPU wait's at the queue head
@@ -43,6 +44,7 @@ enum qlock_stats {
qstat_pv_kick_wake,
qstat_pv_latency_kick,
qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
qstat_pv_spurious_wakeup,
qstat_pv_wait_again,
qstat_pv_wait_head,
@@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
[qstat_pv_latency_kick] = "pv_latency_kick",
[qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
[qstat_pv_wait_again] = "pv_wait_again",
[qstat_pv_wait_head] = "pv_wait_head",
[qstat_pv_wait_node] = "pv_wait_node",
@@ -283,6 +286,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_trylock_unfair(struct qspinlock *lock)
+{
+ int ret = pv_queued_spin_trylock_unfair(lock);
+
+ qstat_inc(qstat_pv_lock_stealing, ret);
+ return ret;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l) qstat_trylock_unfair(l)
+
#else /* CONFIG_QUEUED_LOCK_STAT */
static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
--
1.7.1