[PATCH RFC 3/7] KVM: timer: synchronize tsc-deadline timestamp for guest

From: Quan Xu
Date: Fri Dec 08 2017 - 03:40:53 EST


From: Ben Luo <bn0418@xxxxxxxxx>

In general, a KVM guest programs the tsc-deadline timestamp into the
MSR_IA32_TSC_DEADLINE MSR. This causes a VM-exit, and KVM then
handles the timer on behalf of the guest.

With this patch, the tsc-deadline timestamp is instead mostly recorded
in a shared page, incurring fewer VM-exits. We introduce a periodically
running kthread that scans the shared page and synchronizes the timer
settings for the guest, pinned to a dedicated CPU.

Signed-off-by: Yang Zhang <yang.zhang.wz@xxxxxxxxx>
Signed-off-by: Quan Xu <quan.xu0@xxxxxxxxx>
Signed-off-by: Ben Luo <bn0418@xxxxxxxxx>
---
arch/x86/kvm/lapic.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/lapic.h | 5 ++
2 files changed, 143 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 55c9ba3..20a23bb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -36,6 +36,10 @@
#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
+#include <linux/ktime.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
@@ -70,6 +74,12 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul

+static struct hrtimer pv_sync_timer;
+static long pv_timer_period_ns = PVTIMER_PERIOD_NS;
+static struct task_struct *pv_timer_polling_worker;
+
+module_param(pv_timer_period_ns, long, 0644);
+
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -2542,8 +2552,130 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
}
}

+static enum hrtimer_restart pv_sync_timer_callback(struct hrtimer *timer)
+{
+ hrtimer_forward_now(timer, ns_to_ktime(pv_timer_period_ns));
+ wake_up_process(pv_timer_polling_worker);
+
+ return HRTIMER_RESTART;
+}
+
+void kvm_apic_sync_pv_timer(void *data)
+{
+ struct kvm_vcpu *vcpu = data;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ unsigned long flags, this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ u64 guest_tsc, expire_tsc;
+ long rem_tsc;
+
+ if (!lapic_in_kernel(vcpu) || !pv_timer_enabled(vcpu))
+ return;
+
+ local_irq_save(flags);
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ rem_tsc = ktime_to_ns(hrtimer_get_remaining(&pv_sync_timer))
+ * this_tsc_khz;
+ if (rem_tsc <= 0)
+ rem_tsc += pv_timer_period_ns * this_tsc_khz;
+ do_div(rem_tsc, 1000000L);
+
+ /*
+ * make sure guest_tsc and rem_tsc are assigned before to update
+ * next_sync_tsc.
+ */
+ smp_wmb();
+ kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+ offsetof(struct pvtimer_vcpu_event_info, next_sync_tsc),
+ guest_tsc + rem_tsc, 8);
+
+ /* make sure next_sync_tsc is visible */
+ smp_wmb();
+
+ expire_tsc = kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+ offsetof(struct pvtimer_vcpu_event_info, expire_tsc),
+ 0UL, 8);
+
+ /* make sure expire_tsc is visible */
+ smp_wmb();
+
+ if (expire_tsc) {
+ if (expire_tsc > guest_tsc)
+ /*
+ * As we bind this thread to a dedicated CPU through
+ * IPI, the timer is registered on that dedicated
+ * CPU here.
+ */
+ kvm_set_lapic_tscdeadline_msr(apic->vcpu, expire_tsc);
+ else
+ /* deliver immediately if expired */
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ }
+ local_irq_restore(flags);
+}
+
+static int pv_timer_polling(void *arg)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ mm_segment_t oldfs = get_fs();
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ spin_lock(&kvm_lock);
+ __set_current_state(TASK_RUNNING);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ set_fs(USER_DS);
+ use_mm(kvm->mm);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_apic_sync_pv_timer(vcpu);
+ }
+ unuse_mm(kvm->mm);
+ set_fs(oldfs);
+ }
+
+ spin_unlock(&kvm_lock);
+
+ schedule();
+ }
+
+ return 0;
+}
+
+static void kvm_pv_timer_init(void)
+{
+ ktime_t ktime = ktime_set(0, pv_timer_period_ns);
+
+ hrtimer_init(&pv_sync_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ pv_sync_timer.function = &pv_sync_timer_callback;
+
+ /* kthread for pv_timer sync buffer */
+ pv_timer_polling_worker = kthread_create(pv_timer_polling, NULL,
+ "pv_timer_polling_worker/%d",
+ PVTIMER_SYNC_CPU);
+ if (IS_ERR(pv_timer_polling_worker)) {
+ pr_warn_once("kvm: failed to create thread for pv_timer\n");
+ pv_timer_polling_worker = NULL;
+ hrtimer_cancel(&pv_sync_timer);
+
+ return;
+ }
+
+ kthread_bind(pv_timer_polling_worker, PVTIMER_SYNC_CPU);
+ wake_up_process(pv_timer_polling_worker);
+ hrtimer_start(&pv_sync_timer, ktime, HRTIMER_MODE_REL);
+}
+
void kvm_lapic_init(void)
{
+ kvm_pv_timer_init();
+
/* do not patch jump label more than once per second */
jump_label_rate_limit(&apic_hw_disabled, HZ);
jump_label_rate_limit(&apic_sw_disabled, HZ);
@@ -2551,6 +2683,12 @@ void kvm_lapic_init(void)

void kvm_lapic_exit(void)
{
+ if (pv_timer_polling_worker) {
+ hrtimer_cancel(&pv_sync_timer);
+ kthread_stop(pv_timer_polling_worker);
+ pv_timer_polling_worker = NULL;
+ }
+
static_key_deferred_flush(&apic_hw_disabled);
static_key_deferred_flush(&apic_sw_disabled);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 539a738..4588d59 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,6 +16,9 @@
#define APIC_BUS_CYCLE_NS 1
#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)

+#define PVTIMER_SYNC_CPU (NR_CPUS - 1) /* dedicated CPU */
+#define PVTIMER_PERIOD_NS 250000L /* pvtimer default period */
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
@@ -213,6 +216,8 @@ static inline bool pv_timer_enabled(struct kvm_vcpu *vcpu)
return vcpu->arch.pv_timer.msr_val & KVM_MSR_ENABLED;
}

+void kvm_apic_sync_pv_timer(void *data);
+
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);

void wait_lapic_expire(struct kvm_vcpu *vcpu);
--
1.7.1