Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value

From: David Matlack
Date: Thu Sep 08 2016 - 18:14:19 EST


Hi Paolo,

On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
> Bad things happen if a guest using the TSC deadline timer is migrated.
> The guest doesn't re-calibrate the TSC after migration, and the
> TSC frequency can and will change unless your processor supports TSC
> scaling (on Intel this is only Skylake) or your data center is perfectly
> homogeneous.

Sorry, I forgot to follow up on our discussion in v1. One thing we
discussed there was using the APIC Timer to workaround a changing TSC
rate. You pointed out KVM's TSC deadline timer got a nice performance
boost recently, which makes it preferable. Does it makes sense to
apply the same optimization (using the VMX preemption timer) to the
APIC Timer, instead of creating a new dependency on kvmclock?

>
> The solution in this patch is to skip tsc_khz, and instead derive the
> frequency from kvmclock's (mult, shift) pair. Because kvmclock
> parameters convert from tsc to nanoseconds, this needs a division
> but that's the least of our problems when the TSC_DEADLINE_MSR write
> costs 2000 clock cycles. Luckily tsc_khz is really used by very little
> outside the tsc clocksource (which kvmclock replaces) and the TSC
> deadline timer. Because KVM's local APIC doesn't need quirks, we
> provide a paravirt clockevent that still uses the deadline timer
> under the hood (as suggested by Andy Lutomirski).
>
> This patch does not handle the very first deadline, hoping that it
> is masked by the migration downtime (i.e. that the timer fires late
> anyway).
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> arch/x86/include/asm/apic.h | 1 +
> arch/x86/kernel/apic/apic.c | 2 +-
> arch/x86/kernel/kvmclock.c | 156 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 158 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index f6e0bad1cde2..c88b0dcfdf3a 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -53,6 +53,7 @@ extern unsigned int apic_verbosity;
> extern int local_apic_timer_c2_ok;
>
> extern int disable_apic;
> +extern int disable_apic_timer;
> extern unsigned int lapic_timer_frequency;
>
> #ifdef CONFIG_SMP
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 5b63bec7d0af..d0c6d1e3d627 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -169,7 +169,7 @@ __setup("apicpmtimer", setup_apicpmtimer);
> unsigned long mp_lapic_addr;
> int disable_apic;
> /* Disable local APIC timer from the kernel commandline or via dmi quirk */
> -static int disable_apic_timer __initdata;
> +int disable_apic_timer __initdata;
> /* Local APIC timer works in C2 */
> int local_apic_timer_c2_ok;
> EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1d39bfbd26bb..365fa6494dd3 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -17,6 +17,7 @@
> */
>
> #include <linux/clocksource.h>
> +#include <linux/clockchips.h>
> #include <linux/kvm_para.h>
> #include <asm/pvclock.h>
> #include <asm/msr.h>
> @@ -245,6 +246,155 @@ static void kvm_shutdown(void)
> native_machine_shutdown();
> }
>
> +#ifdef CONFIG_X86_LOCAL_APIC
> +/*
> + * kvmclock-based clock event implementation, used only together with the
> + * TSC deadline timer. A subset of the normal LAPIC clockevent, but it
> + * uses kvmclock to convert nanoseconds to TSC. This is necessary to
> + * handle changes to the TSC frequency, e.g. from live migration.
> + */
> +
> +static void kvmclock_lapic_timer_setup(unsigned lvtt_value)
> +{
> + lvtt_value |= LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_TSCDEADLINE;
> + apic_write(APIC_LVTT, lvtt_value);
> +}
> +
> +static int kvmclock_lapic_timer_set_oneshot(struct clock_event_device *evt)
> +{
> + kvmclock_lapic_timer_setup(0);
> + printk_once(KERN_DEBUG "kvmclock: TSC deadline timer enabled\n");
> +
> + /*
> + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
> + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
> + * According to Intel, MFENCE can do the serialization here.
> + */
> + asm volatile("mfence" : : : "memory");
> + return 0;
> +}
> +
> +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt)
> +{
> + kvmclock_lapic_timer_setup(APIC_LVT_MASKED);
> + wrmsrl(MSR_IA32_TSC_DEADLINE, -1);
> + return 0;
> +}
> +
> +/*
> + * We already have the inverse of the (mult,shift) pair, though this means
> + * we need a division. To avoid it we could compute a multiplicative inverse
> + * every time src->version changes.
> + */
> +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38
> +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1)
> +
> +static int kvmclock_lapic_next_ktime(ktime_t expires,
> + struct clock_event_device *evt)
> +{
> + u64 ns, tsc;
> + u32 version;
> + int cpu;
> + struct pvclock_vcpu_time_info *src;
> +
> + cpu = smp_processor_id();
> + src = &hv_clock[cpu].pvti;
> + ns = ktime_to_ns(expires);
> +
> + do {
> + u64 delta_ns;
> + int shift;
> +
> + version = pvclock_read_begin(src);
> + if (unlikely(ns < src->system_time)) {
> + tsc = src->tsc_timestamp;
> + virt_rmb();
> + continue;
> + }
> +
> + delta_ns = ns - src->system_time;
> +
> + /* Cap the wait to avoid overflow. */
> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX))
> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX;
> +
> + /*
> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul.
> + * The shift is split in two steps so that a 38 bits (275 s)
> + * deadline fits into the 64-bit dividend.
> + */
> + shift = 32 - src->tsc_shift;
> +
> + /* First shift step... */
> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> +
> + /* ... division... */
> + tsc = div_u64(delta_ns, src->tsc_to_system_mul);
> +
> + /* ... and second shift step for the remaining bits. */
> + if (shift >= 0)
> + tsc <<= shift;
> + else
> + tsc >>= -shift;
> +
> + tsc += src->tsc_timestamp;
> + } while (pvclock_read_retry(src, version));
> +
> + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
> + return 0;
> +}
> +
> +/*
> + * The local apic timer can be used for any function which is CPU local.
> + */
> +static struct clock_event_device kvm_clockevent = {
> + .name = "lapic",

Should we encode "kvm" or "kvmclock" in the name? I'm not sure how
this name gets used, but it might be nice to distinguish it from the
native TSC deadline timer clock_event_device.

> + /* Under KVM the LAPIC timer always runs in deep C-states. */
> + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME,
> + .set_state_shutdown = kvmclock_lapic_timer_stop,
> + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot,
> + .set_next_ktime = kvmclock_lapic_next_ktime,
> + .mult = 1,
> + /* Make LAPIC timer preferrable over percpu HPET */
> + .rating = 150,
> + .irq = -1,
> +};
> +static DEFINE_PER_CPU(struct clock_event_device, kvm_events);
> +
> +static void kvmclock_local_apic_timer_interrupt(void)
> +{
> + int cpu = smp_processor_id();
> + struct clock_event_device *evt = &per_cpu(kvm_events, cpu);
> +
> + /*
> + * Defer to the native clockevent if ours hasn't been setup yet.
> + */
> + if (!evt->event_handler) {
> + native_local_apic_timer_interrupt();
> + return;
> + }
> +
> + inc_irq_stat(apic_timer_irqs);
> + evt->event_handler(evt);
> +}
> +
> +/*
> + * Setup the local APIC timer for this CPU. Copy the initialized values
> + * of the boot CPU and register the clock event in the framework.
> + */
> +static void setup_kvmclock_timer(void)
> +{
> + struct clock_event_device *evt = this_cpu_ptr(&kvm_events);
> +
> + kvmclock_lapic_timer_stop(evt);
> +
> + memcpy(evt, &kvm_clockevent, sizeof(*evt));
> + evt->cpumask = cpumask_of(smp_processor_id());
> + clockevents_register_device(evt);
> +}
> +#endif
> +
> void __init kvmclock_init(void)
> {
> struct pvclock_vcpu_time_info *vcpu_time;
> @@ -292,6 +442,12 @@ void __init kvmclock_init(void)
> x86_platform.get_wallclock = kvm_get_wallclock;
> x86_platform.set_wallclock = kvm_set_wallclock;
> #ifdef CONFIG_X86_LOCAL_APIC
> + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) &&
> + !disable_apic && !disable_apic_timer) {

Request that this be a hypervisor-controllable feature. e.g. we could
add a new bit to KVM's CPUID leaf to indicate kvmclock is the
definitive source of TSC rate.

> + pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt;
> + x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer;
> + x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer;
> + }
> x86_cpuinit.early_percpu_clock_init =
> kvm_setup_secondary_clock;
> #endif
> --
> 1.8.3.1
>