Re: [PATCH] locking/osq: Drop the overload of osq lock

From: Peter Zijlstra
Date: Mon Jun 27 2016 - 04:10:19 EST


On Mon, Jun 27, 2016 at 02:45:06PM +0800, Boqun Feng wrote:
> +++ b/include/linux/vcpu_preempt.h
> @@ -0,0 +1,82 @@
> +/*
> + * Primitives for checking the vcpu preemption from the guest.
> + */
> +
> +static long __vcpu_preempt_count(void)
> +{
> + return 0;
> +}
> +
> +static bool __vcpu_has_preempted(long vpc)
> +{
> + return false;
> +}
> +
> +static bool __vcpu_is_preempted(int cpu)
> +{
> + return false;
> +}
> +
> +struct vcpu_preempt_ops {
> + /*
> + * Get the current vcpu's "preempt count", which is going to use for
> + * checking whether the current vcpu has ever been preempted
> + */
> + long (*preempt_count)(void);
> +
> + /*
> + * Return whether a vcpu is preempted
> + */
> + bool (*is_preempted)(int cpu);
> +
> + /*
> + * Given a "vcpu preempt count", Return whether a vcpu preemption ever
> + * happened after the .preempt_count() was called.
> + */
> + bool (*has_preempted)(long vpc);
> +};
> +
> +extern struct vcpu_preempt_ops vcpu_preempt_ops;
> +
> +/* Default boilerplate */
> +#define DEFAULT_VCPU_PREEMPT_OPS \
> + { \
> + .preempt_count = __vcpu_preempt_count, \
> + .is_preempted = __vcpu_is_preempted, \
> + .has_preempted = __vcpu_has_preempted \
> + }
> +
> +#ifdef CONFIG_HAS_VCPU_PREEMPTION_DETECTION
> +/*
> + * vcpu_preempt_count: Get the current cpu's "vcpu preempt count"(vpc).
> + *
> + * The vpc is used for checking whether the current vcpu has ever been
> + * preempted via vcpu_has_preempted().
> + *
> + * This function and vcpu_has_preepmted() should be called in the same
> + * preemption disabled critical section.
> + */
> +#define vcpu_preempt_count() vcpu_preempt_ops.preempt_count()
> +
> +/*
> + * vcpu_is_preempted: Check whether @cpu's vcpu is preempted.
> + */
> +#define vcpu_is_preempted(cpu) vcpu_preempt_ops.is_preempted(cpu)
> +
> +/*
> + * vcpu_has_preepmted: Check whether the current cpu's vcpu has ever been
> + * preempted.
> + *
> + * The checked duration is between the vcpu_preempt_count() which returns @vpc
> + * is called and this function called.
> + *
> + * This function and corresponding vcpu_preempt_count() should be in the same
> + * preemption disabled cirtial section.
> + */
> +#define vcpu_has_preempted(vpc) vcpu_preempt_ops.has_preempted(vpc)
> +
> +#else /* CONFIG_HAS_VCPU_PREEMPTION_DETECTION */
> +#define vcpu_preempt_count() __vcpu_preempt_count()
> +#define vcpu_is_preempted(cpu) __vcpu_is_preempted(cpu)
> +#define vcpu_has_preempted(vpc) __vcpu_has_preempted(vpc)
> +#endif /* CONFIG_HAS_VCPU_PREEPMTION_DETECTION */

No, this is entirely insane, also broken.

No vectors, no actual function calls, nothing like that. You want the
below to completely compile away and generate the exact 100% same code
it does today.

> +++ b/kernel/locking/osq_lock.c
> @@ -1,6 +1,7 @@
> #include <linux/percpu.h>
> #include <linux/sched.h>
> #include <linux/osq_lock.h>
> +#include <linux/vcpu_preempt.h>
>
> /*
> * An MCS like lock especially tailored for optimistic spinning for sleeping
> @@ -87,6 +88,8 @@ bool osq_lock(struct optimistic_spin_queue *lock)
> struct optimistic_spin_node *prev, *next;
> int curr = encode_cpu(smp_processor_id());
> int old;
> + int loops;
> + long vpc;
>
> node->locked = 0;
> node->next = NULL;
> @@ -106,6 +109,9 @@ bool osq_lock(struct optimistic_spin_queue *lock)
> node->prev = prev;
> WRITE_ONCE(prev->next, node);
>
> + old = old - 1;

That's just nasty, and could result in an unconditional decrement being
issues, even though its never used.

> + vpc = vcpu_preempt_count();
> +
> /*
> * Normally @prev is untouchable after the above store; because at that
> * moment unlock can proceed and wipe the node element from stack.
> @@ -118,8 +124,14 @@ bool osq_lock(struct optimistic_spin_queue *lock)
> while (!READ_ONCE(node->locked)) {
> /*
> * If we need to reschedule bail... so we can block.
> + * An over-committed guest with more vCPUs than pCPUs
> + * might fall in this loop and cause a huge overload.
> + * This is because vCPU A(prev) hold the osq lock and yield out,
> + * vCPU B(node) wait ->locked to be set, IOW, wait till
> + * vCPU A run and unlock the osq lock.
> + * NOTE that vCPU A and vCPU B might run on same physical cpu.
> */
> - if (need_resched())
> + if (need_resched() || vcpu_is_preempted(old) || vcpu_has_preempted(vpc))
> goto unqueue;
>
> cpu_relax_lowlatency();
>