[PATCH 1/7] locking/pvqspinlock: Only kick CPU at unlock time

From: Waiman Long
Date: Sat Jul 11 2015 - 16:38:36 EST


For an over-committed guest with more vCPUs than physical CPUs
available, it is possible that a vCPU may be kicked twice before
getting the lock - once before it becomes queue head and once before
it gets the lock. All this CPU kicking and halting (VMEXIT) can be
expensive and slow down system performance.

This patch adds a new vCPU state (vcpu_hashed) which enables the code
to delay CPU kicking until unlock time. Once this state is set,
the new lock holder will set _Q_SLOW_VAL and fill in the hash table
on behalf of the halted queue head vCPU. The original vcpu_halted
state will be used by pv_wait_node() only to differentiate other
queue nodes from the queue head.

Signed-off-by: Waiman Long <Waiman.Long@xxxxxx>
---
kernel/locking/qspinlock.c | 10 ++--
kernel/locking/qspinlock_paravirt.h | 83 ++++++++++++++++++++++++++---------
2 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c4920..d2e0fc1 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_scan_next(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node) { }

@@ -248,7 +248,7 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,

#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
-#define pv_kick_node __pv_kick_node
+#define pv_scan_next __pv_scan_next
#define pv_wait_head __pv_wait_head

#ifdef CONFIG_PARAVIRT_SPINLOCKS
@@ -440,7 +440,7 @@ queue:
cpu_relax();

arch_mcs_spin_unlock_contended(&next->locked);
- pv_kick_node(next);
+ pv_scan_next(lock, next);

release:
/*
@@ -461,7 +461,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);

#undef pv_init_node
#undef pv_wait_node
-#undef pv_kick_node
+#undef pv_scan_next
#undef pv_wait_head

#undef queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab181..d302c39 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -21,9 +21,14 @@

#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)

+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
enum vcpu_state {
vcpu_running = 0,
- vcpu_halted,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};

struct pv_node {
@@ -152,7 +157,8 @@ static void pv_init_node(struct mcs_spinlock *node)

/*
* Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_scan_next() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
*/
static void pv_wait_node(struct mcs_spinlock *node)
{
@@ -171,9 +177,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
*
* [S] pn->state = vcpu_halted [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_running
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
*
- * Matches the xchg() from pv_kick_node().
+ * Matches the cmpxchg() from pv_scan_next().
*/
smp_store_mb(pn->state, vcpu_halted);

@@ -181,9 +187,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
pv_wait(&pn->state, vcpu_halted);

/*
- * Reset the vCPU state to avoid unncessary CPU kicking
+ * Reset the state except when vcpu_hashed is set.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);

/*
* If the locked flag is still not set after wakeup, it is a
@@ -193,6 +199,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* MCS lock will be released soon.
*/
}
+
/*
* By now our node->locked should be 1 and our caller will not actually
* spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +208,32 @@ static void pv_wait_node(struct mcs_spinlock *node)
}

/*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 & lock acquired.
+ * Check if the vCPU has been halted. If so, set the _Q_SLOW_VAL flag
+ * and put an entry into the lock hash table to be woken up at unlock time.
*/
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_scan_next(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;

/*
- * Note that because node->locked is already set, this actual
- * mcs_spinlock entry could be re-used already.
- *
- * This should be fine however, kicking people for no reason is
- * harmless.
+ * Transition vCPU state: halted => hashed
+ * Quit if the transition failed.
*
- * See the comment in pv_wait_node().
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
- pv_kick(pn->cpu);
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table & set the _Q_SLOW_VAL in the lock.
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
+ */
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
}

/*
@@ -229,19 +244,42 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
- struct qspinlock **lp = NULL;
+ struct qspinlock **lp;
int loop;

+ /*
+ * Initialize lp to a non-NULL value if the node is already in the
+ * vcpu_hashed state so that pv_hash() won't be called again.
+ */
+ lp = (READ_ONCE(pn->state) == vcpu_hashed) ? (struct qspinlock **)1
+ : NULL;
for (;;) {
+ WRITE_ONCE(pn->state, vcpu_running);
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
return;
cpu_relax();
}

- WRITE_ONCE(pn->state, vcpu_halted);
+ /*
+ * Recheck lock value after setting vcpu_hashed state
+ *
+ * [S] state = vcpu_hashed [S] l->locked = 0
+ * MB MB
+ * [L] l->locked [L] state == vcpu_hashed
+ *
+ * Matches smp_store_mb() in __pv_queued_spin_unlock()
+ */
+ smp_store_mb(pn->state, vcpu_hashed);
+
+ if (!READ_ONCE(l->locked)) {
+ WRITE_ONCE(pn->state, vcpu_running);
+ return;
+ }
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
+
/*
* lp must be set before setting _Q_SLOW_VAL
*
@@ -305,13 +343,16 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* Now that we have a reference to the (likely) blocked pv_node,
* release the lock.
*/
- smp_store_release(&l->locked, 0);
+ smp_store_mb(l->locked, 0);

/*
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
*/
- if (READ_ONCE(node->state) == vcpu_halted)
+ if (READ_ONCE(node->state) == vcpu_hashed)
pv_kick(node->cpu);
}
/*
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/