[PATCH 3/4] KVM host implementation

From: Tianqiang Xu
Date: Mon Aug 30 2021 - 22:01:11 EST


Host OS sets the 'is_idle' field of kvm_steal_time to 1 if
cpu_rq(this_cpu)->nr_running is no greater than 1 right before a vCPU
is scheduled out. At that point the vCPU's task is the only runnable
task on the pCPU, so is_idle == 1 means that the pCPU where the
preempted vCPU most recently ran is idle. The guest consumes this hint
through pcpu_is_idle() (see the sketch below).

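For reference, the guest-side consumer (the pcpu_is_idle() hook in the
qspinlock.h hunk below) could look roughly like the following sketch.
The per-cpu steal_time area and the KVM_PCPU_IS_IDLE flag are
introduced by earlier patches in this series, so the exact names here
are assumptions:

    static bool kvm_pcpu_is_idle(long cpu)
    {
            /* Guest's per-cpu kvm_steal_time area, shared with the host. */
            struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

            return !!(READ_ONCE(src->is_idle) & KVM_PCPU_IS_IDLE);
    }
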
Host OS invokes get_cpu_nr_running() to read
cpu_rq(this_cpu)->nr_running. A new per-CPU pointer,
this_cpu_pre_run_vcpu, tracks the vCPU that most recently entered the
guest on each pCPU, so that a stale is_idle hint can be cleared when a
different vCPU is loaded on that pCPU.
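
get_cpu_nr_running() itself is not part of this patch; it is expected
to be a small scheduler helper added elsewhere in this series, along
the lines of the following sketch (the name is taken from the extern
declaration below; placement and exact form are assumed):

    /* kernel/sched/core.c (sketch) */
    int get_cpu_nr_running(int cpu)
    {
            return cpu_rq(cpu)->nr_running;
    }

Such a helper is needed because cpu_rq() lives in the scheduler's
private kernel/sched/sched.h and is not visible to KVM.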

--
Authors: Tianqiang Xu, Dingji Li, Zeyu Mi
Shanghai Jiao Tong University

Signed-off-by: Tianqiang Xu <skyele@xxxxxxxxxxx>
---
arch/x86/include/asm/qspinlock.h | 1 -
arch/x86/kvm/x86.c | 88 +++++++++++++++++++++++++++++++-
2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index c32f2eb6186c..1832dd8308ca 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -61,7 +61,6 @@ static inline bool vcpu_is_preempted(long cpu)
{
return pv_vcpu_is_preempted(cpu);
}
-#endif

#define pcpu_is_idle pcpu_is_idle
static inline bool pcpu_is_idle(long cpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5d5c5ed7dd4..1fb1ab3d6fca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3181,6 +3181,72 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}

+static void kvm_steal_time_set_is_idle(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.is_idle)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->is_idle = vcpu->arch.st.is_idle = KVM_PCPU_IS_IDLE;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+}
+
+static void kvm_steal_time_clear_is_idle(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (!vcpu->arch.st.is_idle)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, false))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH))
+ xchg(&st->is_idle, 0);
+ else
+ st->is_idle = 0;
+
+ vcpu->arch.st.is_idle = 0;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu *, this_cpu_pre_run_vcpu);
+
+static void vcpu_load_update_pre_vcpu_callback(struct kvm_vcpu *new_vcpu,
+ struct kvm_steal_time *st)
+{
+ struct kvm_vcpu *old_vcpu = __this_cpu_read(this_cpu_pre_run_vcpu);
+
+ if (!old_vcpu)
+ return;
+ if (old_vcpu != new_vcpu)
+ kvm_steal_time_clear_is_idle(old_vcpu);
+ else
+ st->is_idle = new_vcpu->arch.st.is_idle = KVM_PCPU_IS_IDLE;
+}
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
@@ -3219,6 +3285,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)

vcpu->arch.st.preempted = 0;

+ vcpu_load_update_pre_vcpu_callback(vcpu, st);
+
if (st->version & 1)
st->version += 1; /* first time write, random junk */

@@ -4290,6 +4358,8 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
}

+extern int get_cpu_nr_running(int cpu);
+
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
@@ -4304,8 +4374,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvm_xen_msr_enabled(vcpu->kvm))
kvm_xen_runstate_set_preempted(vcpu);
- else
+ else {
kvm_steal_time_set_preempted(vcpu);
+
+ if (get_cpu_nr_running(smp_processor_id()) <= 1)
+ kvm_steal_time_set_is_idle(vcpu);
+ else
+ kvm_steal_time_clear_is_idle(vcpu);
+ }
srcu_read_unlock(&vcpu->kvm->srcu, idx);

static_call(kvm_x86_vcpu_put)(vcpu);
@@ -9693,6 +9769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+ __this_cpu_write(this_cpu_pre_run_vcpu, vcpu);
+
local_irq_enable();
preempt_enable();

vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

/*
@@ -11253,6 +11331,14 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)

void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ int cpu;
+ struct kvm_vcpu *vcpu;
+
+ for_each_possible_cpu(cpu) {
+ vcpu = per_cpu(this_cpu_pre_run_vcpu, cpu);
+ if (vcpu && vcpu->kvm == kvm)
+ per_cpu(this_cpu_pre_run_vcpu, cpu) = NULL;
+ }
+
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
--
2.26.0