Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to pv_wait()

From: Peter Zijlstra
Date: Wed Apr 20 2016 - 08:08:22 EST


On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
> to help the porting of pvqspinlock to PPC. The new argument will can
> potentially help hypervisor expediate the execution of the critical
> section so that the lock holder vCPU can release the lock sooner.
>
> This patch does just that by storing the previous node vCPU number.
> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
> number as it is likely to be the lock holder.
>
> In pv_wait_node(), the newly added pv_lookup_hash() function will
> be called to look up the queue head and pass in the lock holder vCPU
> number stored there.
>
> This patch introduces negligible overhead to the current pvqspinlock
> code. The extra lockcpu argument isn't currently used in x86
> architecture.

This Changelog is completely useless; it does not explain how this
works at all.


> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ce2f75e..99f31e4 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
> */
>
> static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
> + struct mcs_spinlock *node,
> struct mcs_spinlock *prev) { }
> static __always_inline void __pv_kick_node(struct qspinlock *lock,
> struct mcs_spinlock *node) { }
> @@ -407,7 +408,7 @@ queue:
> prev = decode_tail(old);
> WRITE_ONCE(prev->next, node);
>
> - pv_wait_node(node, prev);
> + pv_wait_node(lock, node, prev);
> arch_mcs_spin_lock_contended(&node->locked);
>
> /*
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 21ede57..895224e 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -51,6 +51,7 @@ struct pv_node {
> struct mcs_spinlock __res[3];
>
> int cpu;
> + int prev_cpu; /* Previous node cpu */

That is a horrible name; what is a 'node cpu'.

> u8 state;
> };
>
> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
> * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
> *
> * Since we should not be holding locks from NMI context (very rare indeed) the
> - * max load factor is 0.75, which is around the point where open addressing
> - * breaks down.
> + * max load factor is 0.75.

Why? Isn't that true anymore?

> *
> */
> struct pv_hash_entry {
> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
> }
>
> /*
> + * Look up the given lock in the hash table
> + * Return the pv_node if found, NULL otherwise
> + */
> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
> +{
> + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> + struct pv_hash_entry *he;
> +
> + for_each_hash_entry(he, offset, hash) {
> + struct qspinlock *l = READ_ONCE(he->lock);
> +
> + if (l == lock)

The other loop writes:

if (READ_ONCE(he->lock) == lock)

> + return READ_ONCE(he->node);
> + /*
> + * Presence of an empty slot signal the end of search. We
> + * may miss the entry, but that will limit the amount of
> + * time doing the search when the desired entry isn't there.
> + */
> + else if (!l)
> + break;

That 'else' is entirely pointless. Also, why isn't this: return NULL;

> + }
> + return NULL;

and this BUG() ?

> +}
> +
> +/*
> * Return true if when it is time to check the previous node which is not
> * in a running state.
> */
> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>
> pn->cpu = smp_processor_id();
> pn->state = vcpu_running;
> + pn->prev_cpu = -1;

This does not match the struct element order.

> }
>
> /*
> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
> * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
> * behalf.
> */
> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
> + struct mcs_spinlock *prev)
> {
> struct pv_node *pn = (struct pv_node *)node;
> struct pv_node *pp = (struct pv_node *)prev;
> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> int loop;
> bool wait_early;
>
> + pn->prev_cpu = pp->cpu; /* Save previous node vCPU */

again a useless comment.

> +
> /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
> for (;; waitcnt++) {
> for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> smp_store_mb(pn->state, vcpu_halted);
>
> if (!READ_ONCE(node->locked)) {
> + struct pv_node *hn;
> +
> qstat_inc(qstat_pv_wait_node, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> qstat_inc(qstat_pv_wait_early, wait_early);
> - pv_wait(&pn->state, vcpu_halted);
> +
> + /*
> + * We try to locate the queue head pv_node by looking
> + * up the hash table. If it is not found, use the
> + * CPU in the previous node instead.
> + */
> + hn = pv_lookup_hash(lock);
> + if (!hn)
> + hn = pn;

This is potentially expensive... it does not explain why this lookup can
fail etc.. nor mentioned that lock stealing caveat.

> + pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
> }
>
> /*
> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
> WRITE_ONCE(pn->state, vcpu_halted);
> qstat_inc(qstat_pv_wait_head, true);
> qstat_inc(qstat_pv_wait_again, waitcnt);
> - pv_wait(&l->locked, _Q_SLOW_VAL);
> +
> + /*
> + * Pass in the previous node vCPU nmber which is likely to be
> + * the lock holder vCPU. This additional information may help
> + * the hypervisor to give more resource to that vCPU so that
> + * it can release the lock faster. With lock stealing,
> + * however, that vCPU may not be the actual lock holder.
> + */
> + pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);


urgh..


With all the holes in, does it really still matter?

In any case, I would really only want to see this together with the
patches that make use of it, and then still have it have numbers with
and without this thing.