[RFC PATCH] x86, kvm: use kvmclock to compute TSC deadline value

From: Paolo Bonzini
Date: Tue Jul 05 2016 - 13:37:14 EST


Bad things happen if a guest using the TSC deadline timer is migrated.
The guest doesn't re-calibrate the TSC after migration, and the
TSC frequency can and will change unless your processor supports TSC
scaling (on Intel this is only Skylake) or your data center is perfectly
homogeneous.

The solution in this patch is to skip tsc_khz, and instead derive the
frequency from kvmclock's (mult, shift) pair. Because kvmclock
parameters convert from tsc to nanoseconds, this needs a division
but that's the least of our problems when the TSC_DEADLINE_MSR write
costs 2000 clock cycles. Luckily, very little code uses tsc_khz
outside the tsc clocksource (which kvmclock replaces) and the TSC
deadline timer.

This patch does not handle the very first deadline, hoping that it
is masked by the migration downtime (i.e. that the timer fires late
anyway). I'd welcome remarks on the approach in general, and ideas on
how to handle the first deadline. Of course it's also possible to just
blacklist the TSC deadline timer, and that is probably the best thing to
do for stable kernels. Doing so, however, would require extending the
preemption-timer-based APIC timer implementation to the other LAPIC
timer modes, and it would be a pity to lose the nice latency boost that
the preemption timer offers.

The patch is also quite ugly in the way it arranges for kvmclock to
replace only a small part of setup_apic_timer; better ideas are welcome.

Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
arch/x86/include/asm/apic.h | 2 ++
arch/x86/include/asm/x86_init.h | 5 +++
arch/x86/kernel/apic/apic.c | 15 +++++---
arch/x86/kernel/kvmclock.c | 78 +++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/x86_init.c | 1 +
5 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bc27611fa58f..cc4f45f66218 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -135,6 +135,7 @@ extern void init_apic_mappings(void);
void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
+extern void setup_APIC_clockev(struct clock_event_device *levt);
extern int APIC_init_uniprocessor(void);

#ifdef CONFIG_X86_64
@@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
# define setup_boot_APIC_clock x86_init_noop
# define setup_secondary_APIC_clock x86_init_noop
+# define setup_APIC_clockev NULL
#endif /* !CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_X2APIC
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 4dcdf74dfed8..d0f099ab4dba 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -7,6 +7,7 @@ struct mpc_bus;
struct mpc_cpu;
struct mpc_table;
struct cpuinfo_x86;
+struct clock_event_device;

/**
* struct x86_init_mpparse - platform specific mpparse ops
@@ -84,11 +85,15 @@ struct x86_init_paging {
* boot cpu
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
+ * @setup_APIC_clockev: tweak the clock_event_device for the LAPIC timer,
+ * if setup_boot_APIC_clock and/or
+ * setup_secondary_APIC_clock are in use
*/
struct x86_init_timers {
void (*setup_percpu_clockev)(void);
void (*timer_init)(void);
void (*wallclock_init)(void);
+ void (*setup_APIC_clockev)(struct clock_event_device *levt);
};

/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 60078a67d7e3..b7a331f329d0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -558,15 +558,20 @@ static void setup_APIC_timer(void)
memcpy(levt, &lapic_clockevent, sizeof(*levt));
levt->cpumask = cpumask_of(smp_processor_id());

+ x86_init.timers.setup_APIC_clockev(levt);
+ clockevents_register_device(levt);
+}
+
+void setup_APIC_clockev(struct clock_event_device *levt)
+{
if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_DUMMY);
+ levt->min_delta_ticks = 0xF;
+ levt->max_delta_ticks = ~0UL;
levt->set_next_event = lapic_next_deadline;
- clockevents_config_and_register(levt,
- (tsc_khz / TSC_DIVISOR) * 1000,
- 0xF, ~0UL);
- } else
- clockevents_register_device(levt);
+ clockevents_config(levt, (tsc_khz / TSC_DIVISOR) * 1000);
+ }
}

/*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1d39bfbd26bb..61e094207339 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -17,6 +17,7 @@
*/

#include <linux/clocksource.h>
+#include <linux/clockchips.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/msr.h>
@@ -245,6 +246,82 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}

+/*
+ * We already have the inverse of the (mult,shift) pair, though this means
+ * we need a division. To avoid it we could compute a multiplicative inverse
+ * every time src->version changes.
+ */
+#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38
+#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1)
+
+static int lapic_next_ktime(ktime_t expires,
+ struct clock_event_device *evt)
+{
+ u64 ns, tsc;
+ u32 version;
+ int cpu;
+ struct pvclock_vcpu_time_info *src;
+
+ cpu = smp_processor_id();
+ src = &hv_clock[cpu].pvti;
+ ns = ktime_to_ns(expires);
+ do {
+ u64 delta_ns;
+ int shift;
+
+ version = src->version;
+ virt_rmb();
+ if (unlikely(ns < src->system_time)) {
+ tsc = src->tsc_timestamp;
+ virt_rmb();
+ continue;
+ }
+
+ delta_ns = ns - src->system_time;
+
+ /* Cap the wait to avoid overflow. */
+ if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX))
+ delta_ns = KVMCLOCK_TSC_DEADLINE_MAX;
+
+ /*
+ * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul.
+ * The shift is split in two steps so that a 38 bits (275 s)
+ * deadline fits into the 64-bit dividend.
+ */
+ shift = 32 - src->tsc_shift;
+
+ /* First shift step... */
+ delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
+ shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
+
+ /* ... division... */
+ tsc = div_u64(delta_ns, src->tsc_to_system_mul);
+
+ /* ... and second shift step for the remaining bits. */
+ if (shift >= 0)
+ tsc <<= shift;
+ else
+ tsc >>= -shift;
+
+ tsc += src->tsc_timestamp;
+ virt_rmb();
+ } while((src->version & 1) || version != src->version);
+
+ wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
+ return 0;
+}
+
+
+static void kvm_setup_tsc_deadline_timer(struct clock_event_device *levt)
+{
+ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_DUMMY);
+ levt->features |= CLOCK_EVT_FEAT_KTIME;
+ levt->set_next_ktime = lapic_next_ktime;
+ }
+}
+
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
@@ -288,6 +365,7 @@ void __init kvmclock_init(void)
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
put_cpu();

+ x86_init.timers.setup_APIC_clockev = kvm_setup_tsc_deadline_timer;
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index dad5fe9633a3..b85a323208dc 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
.setup_percpu_clockev = setup_boot_APIC_clock,
.timer_init = hpet_time_init,
.wallclock_init = x86_init_noop,
+ .setup_APIC_clockev = setup_APIC_clockev,
},

.iommu = {
--
1.8.3.1