[PATCH 6/7] locking/qspinlock: A fairer queued unfair lock

From: Waiman Long
Date: Sat Jul 11 2015 - 16:38:07 EST


For a virtual guest with the qspinlock patch, a simple unfair byte lock
will be used if PV spinlock is not configured in or the hypervisor
isn't either KVM or Xen. The byte lock works fine with small guest
of just a few vCPUs. On a much larger guest, however, byte lock can
have the following problems:

1) Lock starvation is a real possibility especially if the number
of vCPUs is large.
2) The constant reading and occasionally writing to the lock word can
put a lot of cacheline contention traffic on the affected
cacheline.

This patch introduces a queue-based unfair lock where all the vCPUs on
the queue can opportunistically steal the lock, but the frequency of
doing so decreases the further it is away from the queue head. It can
encourage a more FIFO like order of getting the lock and hence greatly
reduce the chance of lock starvation. It can also reduce cacheline
contention problem and so improve the performance of the system.

This patch has no impact on native qspinlock performance at all. The
unfair lock code will only be compiled in if CONFIG_HYPERVISOR_GUEST
is defined.

A microbenchmark of running 1 million lock-unlock operations for various
numbers of threads running on a KVM guest with 32 pinned vCPUs and 4
vCPUs per node (8 nodes). This microbenchmark is intended to measure
the variability of the execution times.

Kernel Threads Min/Avg/Max(ms) SD(ms)
------ ------- --------------- ------
Unfair byte lock 4 133.1/386.0/509.0 153.48
8 720.5/939.5/1,068.0 117.08
16 2,237.8/6,045.8/7,550.3 1747.37
32 5,880.2/37,028.2/44,668.7 10136.30
Unfair qspinlock 4 326.1/453.7/523.0 80.44
8 681.6/1,126.4/1,486.5 304.85
16 1,543.0/3,633.4/4,568.1 1000.47
32 2,356.8/7,103.3/7,894.9 1231.11

With a small number of contending threads, the performance and
variability of the two types of unfair lock are similar. However, when
the number of contending threads increases, the byte lock has a much
higher variability than the unfair qspinlock.

Signed-off-by: Waiman Long <Waiman.Long@xxxxxx>
---
arch/x86/include/asm/qspinlock.h | 17 ++--
kernel/locking/qspinlock.c | 98 ++++++++++++++-
kernel/locking/qspinlock_unfair.h | 238 +++++++++++++++++++++++++++++++++++++
3 files changed, 340 insertions(+), 13 deletions(-)
create mode 100644 kernel/locking/qspinlock_unfair.h

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 9d51fae..bc82ace 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -39,18 +39,19 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
}
#endif

-#define virt_queued_spin_lock virt_queued_spin_lock
+#ifdef CONFIG_HYPERVISOR_GUEST
+#ifndef static_cpu_has_hypervisor
+#define static_cpu_has_hypervisor static_cpu_has(X86_FEATURE_HYPERVISOR)
+#endif

-static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#define queued_spin_trylock_unfair queued_spin_trylock_unfair
+static inline bool queued_spin_trylock_unfair(struct qspinlock *lock)
{
- if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
-
- while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
- cpu_relax();
+ u8 *l = (u8 *)lock;

- return true;
+ return !READ_ONCE(*l) && (xchg(l, _Q_LOCKED_VAL) == 0);
}
+#endif

#include <asm-generic/qspinlock.h>

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5a25e89..65dead9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -19,7 +19,11 @@
* Peter Zijlstra <peterz@xxxxxxxxxxxxx>
*/

-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if defined(_GEN_PV_LOCK_SLOWPATH) || defined(_GEN_UNFAIR_LOCK_SLOWPATH)
+#define _GEN_LOCK_SLOWPATH
+#endif
+
+#ifndef _GEN_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
@@ -68,7 +72,7 @@

#include "mcs_spinlock.h"

-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#ifdef CONFIG_HYPERVISOR_GUEST
#define MAX_NODES 8
#else
#define MAX_NODES 4
@@ -81,6 +85,7 @@
* Exactly fits one 64-byte cacheline on a 64-bit architecture.
*
* PV doubles the storage and uses the second cacheline for PV state.
+ * Unfair lock (mutually exclusive to PV) also uses the second cacheline.
*/
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);

@@ -277,7 +282,18 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif

-#endif /* _GEN_PV_LOCK_SLOWPATH */
+#ifdef CONFIG_HYPERVISOR_GUEST
+static void unfair_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#else
+static __always_inline void
+unfair_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { }
+#endif
+
+#ifndef static_cpu_has_hypervisor
+#define static_cpu_has_hypervisor false
+#endif
+
+#endif /* !_GEN_LOCK_SLOWPATH */

/**
* queued_spin_lock_slowpath - acquire the queued spinlock
@@ -314,8 +330,13 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
goto queue;
}

- if (virt_queued_spin_lock(lock))
+ /*
+ * Use unfair lock for VM if PV spinlock is not enabled
+ */
+ if (static_cpu_has_hypervisor) {
+ unfair_queued_spin_lock_slowpath(lock, val);
return;
+ }

/*
* wait for in-progress pending->locked hand-overs
@@ -478,7 +499,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
/*
* Generate the paravirt code for queued_spin_unlock_slowpath().
*/
-#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if !defined(_GEN_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef pv_enabled
@@ -496,4 +517,71 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"

+#undef _GEN_PV_LOCK_SLOWPATH
+
+#endif
+
+/*
+ * Generate the unfair lock code for queued_spin_unlock_slowpath().
+ */
+#if defined(CONFIG_HYPERVISOR_GUEST) && !defined(_GEN_LOCK_SLOWPATH)
+#define _GEN_UNFAIR_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#ifdef queued_spin_lock_slowpath
+#undef queued_spin_lock_slowpath
+#endif
+#define queued_spin_lock_slowpath unfair_queued_spin_lock_slowpath
+
+#ifdef queued_spin_trylock
+#undef queued_spin_trylock
+#endif
+#define queued_spin_trylock queued_spin_trylock_unfair
+
+/*
+ * The unfair lock code is used internally and so doesn't need to be exported
+ */
+#undef EXPORT_SYMBOL
+#define EXPORT_SYMBOL(x)
+
+#ifdef pv_init_node
+#undef pv_init_node
+#endif
+#ifdef pv_wait_node
+#undef pv_wait_node
+#endif
+#ifdef pv_wait_head
+#undef pv_wait_head
+#endif
+
+#ifndef pv_scan_next
+#define pv_scan_next __pv_scan_next
+#endif
+#ifndef pv_pending_lock
+#define pv_pending_lock(l, v) false
+#endif
+
+#define pv_init_node unfair_init_node
+#define pv_wait_node(node, prev) \
+ do { \
+ if (unfair_wait_node(lock, node, prev, tail, old)) \
+ goto release; \
+ } while (0)
+#define pv_wait_head(lock, node) \
+ do { \
+ if (unfair_wait_head(lock, node, tail)) \
+ goto release; \
+ } while (0)
+
+#include "qspinlock_unfair.h"
+#include "qspinlock.c"
+
+#undef _GEN_UNFAIR_LOCK_SLOWPATH
+
+#endif
+
+#ifdef _GEN_LOCK_SLOWPATH
+#undef _GEN_LOCK_SLOWPATH
#endif
diff --git a/kernel/locking/qspinlock_unfair.h b/kernel/locking/qspinlock_unfair.h
new file mode 100644
index 0000000..0e8a40f
--- /dev/null
+++ b/kernel/locking/qspinlock_unfair.h
@@ -0,0 +1,238 @@
+#ifndef _GEN_UNFAIR_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+/*
+ * Unfair lock support in a virtualized guest
+ *
+ * An unfair lock can be implemented using a simple test-and-set lock like
+ * what is being done in a read-write lock. This simple scheme has 2 major
+ * problems:
+ * 1) It needs constant reading and occasionally writing to the lock word
+ * thus putting a lot of cacheline contention traffic on the affected
+ * cacheline.
+ * 2) Lock starvation is a real possibility especially if the number of
+ * virtual CPUs is large.
+ *
+ * To reduce the undesirable side effects of an unfair lock, the queue
+ * unfair spinlock implements a more elaborate scheme.
+ *
+ * Even in the wait queue, the task can still attempt to steal the lock
+ * periodically at a frequency that decreases in relation to its distance
+ * from the queue head. In other word, the closer it is to the queue head,
+ * the higher the chance it has of stealing the lock. This scheme reduces
+ * the load on the lock cacheline while trying to maintain a somewhat FIFO
+ * order of getting the lock so as to reduce the chance of lock starvation.
+ *
+ * At less than the threshold distance from the queue head, the lock stealing
+ * period increase is exponential (*2). At the threshold and beyond, the
+ * increase is linear (+LPERIOD_INC).
+ */
+#define LPERIOD_MIN_SHIFT 2
+#define LPERIOD_THRESHOLD_SHIFT 8
+#define LPERIOD_MIN (1 << LPERIOD_MIN_SHIFT)
+#define LPERIOD_THRESHOLD (1 << LPERIOD_THRESHOLD_SHIFT)
+#define LPERIOD_QHEAD (LPERIOD_MIN >> 1)
+#define LPERIOD_INC 32 /* Linear increment beyond threshold */
+
+struct uf_node {
+ struct mcs_spinlock mcs;
+ struct mcs_spinlock __res[3];
+
+ struct uf_node *prev; /* Previous node address */
+ int lsteal_period; /* Lock stealing period */
+ u32 prev_tail; /* Previous node tail code */
+};
+
+/**
+ * cmpxchg_tail - Put in the new tail code if it matches the old one
+ * @lock : Pointer to queue spinlock structure
+ * @old : The old tail code value
+ * @new : The new tail code value
+ * Return: true if operation succeeds, false otherwise
+ *
+ * The caller must have grabbed the lock before calling it and pending bit
+ * isn't used.
+ */
+static __always_inline int
+cmpxchg_tail(struct qspinlock *lock, u32 old, u32 new)
+{
+ old |= _Q_LOCKED_VAL;
+ new |= _Q_LOCKED_VAL;
+ return atomic_cmpxchg(&lock->val, old, new) == old;
+}
+
+/**
+ * Initialize the unfair part of the mcs_spinlock node.
+ * @node: Pointer to mcs_spinlock structure
+ */
+static inline void unfair_init_node(struct mcs_spinlock *node)
+{
+ struct uf_node *pn = (struct uf_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct uf_node) > 5*sizeof(struct mcs_spinlock));
+
+ pn->prev = NULL;
+ pn->prev_tail = 0;
+ pn->lsteal_period = LPERIOD_QHEAD;
+}
+
+/**
+ * Wait for node->locked to become true or the lock was stolen.
+ * @lock : Pointer to queue spinlock structure
+ * @node : Pointer to mcs_spinlock structure of current node
+ * @prev : Pointer to mcs_spinlock structure of previous node
+ * @my_tail : Tail code of current node
+ * @prev_tail: Tail code of previous node
+ *
+ * Return: true if lock stolen, false if becoming queue head
+ */
+static inline bool unfair_wait_node(struct qspinlock *lock,
+ struct mcs_spinlock *node,
+ struct mcs_spinlock *prev,
+ u32 my_tail, u32 prev_tail)
+{
+ struct uf_node *next, *pn = (struct uf_node *)node;
+ int loop;
+ bool isqhead;
+
+ pn->prev = (struct uf_node *)prev;
+ pn->prev_tail = prev_tail;
+
+ for (;;) {
+ /*
+ * The spinning period of this node will be double that of the
+ * previous node if it is at less than the threshold distance
+ * from the queue head. Otherwise, it will spin a fixed
+ * additional amount than the previous one.
+ */
+ u32 period = READ_ONCE(pn->prev->lsteal_period);
+
+ pn->lsteal_period = (period < LPERIOD_THRESHOLD)
+ ? period * 2 : period + LPERIOD_INC;
+ for (loop = pn->lsteal_period; loop; loop--) {
+ /*
+ * The caller will do a load-acquire for us.
+ */
+ if (READ_ONCE(node->locked))
+ return false;
+ cpu_relax();
+ }
+
+ /*
+ * Attempt to steal the lock
+ */
+ if (queued_spin_trylock_unfair(lock))
+ break; /* Got the lock */
+ }
+
+ /*
+ * Have stolen the lock, need to remove itself from the wait queue.
+ * There are 3 steps that need to be done:
+ * 1) If it is at the end of the queue, change the tail code in the
+ * lock to the one before it. If the current node has become the
+ * queue head, the previous tail code is 0.
+ * 2) Change the next pointer in the previous queue node to point
+ * to the next one in queue or NULL if it is at the end of queue.
+ * 3) If a next node is present, copy the prev_tail and prev values
+ * to the next node.
+ *
+ * Note that no forward progress is possible in other nodes as the
+ * lock has just been acquired.
+ */
+ isqhead = READ_ONCE(pn->mcs.locked);
+ prev_tail = isqhead ? 0 : pn->prev_tail;
+
+ if (isqhead)
+ pn->lsteal_period = LPERIOD_QHEAD;
+
+ /* Step 1 */
+ if ((atomic_read(&lock->val) == (my_tail | _Q_LOCKED_VAL)) &&
+ cmpxchg_tail(lock, my_tail, prev_tail)) {
+ /*
+ * Successfully change the tail code back to the
+ * previous one. Now need to clear the next pointer
+ * in the previous node only if it contains my queue
+ * node address and is not the queue head.
+ *
+ * The cmpxchg() call below may fail if a new task
+ * comes along and put its node address into the next
+ * pointer. Whether the operation succeeds or fails
+ * doesn't really matter.
+ */
+ /* Step 2 */
+ if (!isqhead)
+ (void)cmpxchg(&pn->prev->mcs.next, &pn->mcs, NULL);
+ pn->mcs.next = NULL;
+ return true;
+ }
+
+ /*
+ * Step 3
+ * A next node has to be present if the lock has a different
+ * tail code. So wait until the next pointer is set.
+ */
+ while (!(next = (struct uf_node *)READ_ONCE(node->next)))
+ cpu_relax();
+ if (isqhead) {
+ struct mcs_spinlock *nxt = (struct mcs_spinlock *)next;
+
+ arch_mcs_spin_unlock_contended(&nxt->locked);
+ return true; /* Done for queue head */
+ }
+
+ WRITE_ONCE(pn->prev->mcs.next, (struct mcs_spinlock *)next);
+
+ /*
+ * Need to make sure that both prev and prev_tail of the next node
+ * are set up before modifying them.
+ */
+ while (!(READ_ONCE(next->prev) && READ_ONCE(next->prev_tail)))
+ cpu_relax();
+ WRITE_ONCE(next->prev, pn->prev);
+ WRITE_ONCE(next->prev_tail, pn->prev_tail);
+
+ /*
+ * A smp_wmb() is inserted here to make sure all the new unfair node
+ * information in next will be visible before unlock which is not
+ * a full memory barrier.
+ *
+ * It matches the atomic instruction in queued_spin_trylock_unfair().
+ */
+ smp_wmb();
+ return true;
+}
+
+/**
+ * As queue head, attempt to acquire the lock in a normal spin loop.
+ * @lock: Pointer to queue spinlock structure
+ * @node: Pointer to mcs_spinlock structure of current node
+ * @tail: Tail code of current node
+ *
+ * Return: true is always returned
+ */
+static inline bool
+unfair_wait_head(struct qspinlock *lock, struct mcs_spinlock *node, u32 tail)
+{
+ struct uf_node *pn = (struct uf_node *)node;
+ struct mcs_spinlock *next;
+
+ pn->lsteal_period = LPERIOD_QHEAD;
+ while (!queued_spin_trylock_unfair(lock))
+ cpu_relax();
+
+ /*
+ * Remove tail code in the lock if it is the only one in the queue
+ */
+ if ((atomic_read(&lock->val) == (tail | _Q_LOCKED_VAL)) &&
+ cmpxchg_tail(lock, tail, 0))
+ goto release;
+
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+
+release:
+ return true;
+}
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/