I was thinking of something simpler:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9b8e30dd5b9b..455c648f9adc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3198,10 +3198,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
int yielded = 0;
int try = 3;
- int pass;
+ int pass, num_passes = 1;
int i;
kvm_vcpu_set_in_spin_loop(me, true);
@@ -3212,13 +3211,14 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
* VCPU is holding the lock that we need and will release it.
* We approximate round-robin by starting at the last boosted VCPU.
*/
- for (pass = 0; pass < 2 && !yielded && try; pass++) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!pass && i <= last_boosted_vcpu) {
- i = last_boosted_vcpu;
- continue;
- } else if (pass && i > last_boosted_vcpu)
- break;
+ for (pass = 0; pass < num_passes; pass++) {
+ int idx = me->kvm->last_boosted_vcpu;
+ int n = atomic_read(&kvm->online_vcpus);
+ for (i = 0; i < n; i++, idx++) {
+ if (idx == n)
+ idx = 0;
+
+ vcpu = kvm_get_vcpu(kvm, idx);
if (!READ_ONCE(vcpu->ready))
continue;
if (vcpu == me)
@@ -3226,23 +3226,36 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
if (rcuwait_active(&vcpu->wait) &&
!vcpu_dy_runnable(vcpu))
continue;
- if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
- !kvm_arch_vcpu_in_kernel(vcpu))
- continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
+ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+ !kvm_arch_vcpu_in_kernel(vcpu)) {
+ /*
+ * A vCPU running in userspace can get to kernel mode via
+ * an interrupt. That's a worse choice than a CPU already
+ * in kernel mode so only do it on a second pass.
+ */
+ if (!vcpu_dy_runnable(vcpu))
+ continue;
+ if (pass == 0) {
+ num_passes = 2;
+ continue;
+ }
+ }
+
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
kvm->last_boosted_vcpu = i;
- break;
+ goto done;
} else if (yielded < 0) {
try--;
if (!try)
- break;
+ goto done;
}
}
}
+done:
We just tested the above post against 96 vCPUs VM in an over-subscribe
scenario, the score of pbzip2 fluctuated drastically. Sometimes it is
worse than vanilla, but the average improvement is around 2.2%. The
new version of my post is around 9.3%,the origial posted patch is
around 10% which is totally as expected since now both IPI receivers
in user-mode and lock-waiters are second class citizens.