Re: [PATCH 3/6] x86/hyper-v: reenlightenment notifications support

From: Vitaly Kuznetsov
Date: Mon Dec 11 2017 - 04:56:50 EST


Roman Kagan <rkagan@xxxxxxxxxxxxx> writes:

> On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote:
>> Hyper-V supports Live Migration notification. This is supposed to be used
>> in conjunction with TSC emulation: when we are migrated to a host with
>> different TSC frequency for some short period host emulates our accesses
>> to TSC and sends us an interrupt to notify about the event. When we're
>> done updating everything we can disable TSC emulation and everything will
>> start working fast again.
>>
>> We didn't need these notifications before as Hyper-V guests are not
>> supposed to use TSC as a clocksource: in Linux we even mark it as unstable
>> on boot. Guests normally use 'tsc page' clocksouce and host updates its
>> values on migrations automatically.
>>
>> Things change when we want to run nested virtualization: even when we pass
>> through PV clocksources (kvm-clock or tsc page) to our guests we need to
>> know TSC frequency and when it changes.
>>
>> Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies
>> EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The
>> right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that
>> the fix in on the way.
>>
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
>> ---
>> RFC -> v1:
>> - #include <asm/apic.h> [kbuild test robot]
>> - use div64_u64() [kbuild test robot]
>> - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug
>> in Hyper-V hypervisor and disabling emulation after receiving interrupt
>> may screw TSC counters.
>
> Looks kinda fragile...

I believe this is temporary and Microsoft will fix things up host side.

>
>> ---
>> arch/x86/entry/entry_64.S | 4 +++
>> arch/x86/hyperv/hv_init.c | 71 ++++++++++++++++++++++++++++++++++++++
>> arch/x86/include/asm/entry_arch.h | 4 +++
>> arch/x86/include/asm/hw_irq.h | 1 +
>> arch/x86/include/asm/irq_vectors.h | 7 +++-
>> arch/x86/include/asm/mshyperv.h | 8 +++++
>> arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++
>> arch/x86/kernel/idt.c | 3 ++
>> 8 files changed, 124 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index f81d50d7ceac..a32730b260bc 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
>> apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
>> #endif
>>
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR hyperv_reenlightenment_intr smp_hyperv_reenlightenment_intr
>> +#endif
>> +
>> /*
>> * Exception entry points.
>> */
>> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
>> index 1a6c63f721bc..bb62839ede81 100644
>> --- a/arch/x86/hyperv/hv_init.c
>> +++ b/arch/x86/hyperv/hv_init.c
>> @@ -18,6 +18,7 @@
>> */
>>
>> #include <linux/types.h>
>> +#include <asm/apic.h>
>> #include <asm/hypervisor.h>
>> #include <asm/hyperv.h>
>> #include <asm/mshyperv.h>
>> @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu)
>> return 0;
>> }
>>
>> +static void (*hv_reenlightenment_cb)(void);
>> +
>> +static void hv_reenlightenment_notify(struct work_struct *dummy)
>> +{
>> + if (hv_reenlightenment_cb)
>> + hv_reenlightenment_cb();
>> +}
>> +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
>> +
>> +void hyperv_stop_tsc_emulation(void)
>> +{
>> + u64 freq;
>> + struct hv_tsc_emulation_status emu_status;
>> +
>> + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
>> + emu_status.inprogress = 0;
>> + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
>> +
>> + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
>
> IIRC the availability of this msr is not guaranteed (I guess in reality
> it's present if the reenlightenment is supported, but I'd rather check).
>

Will do.

>> + tsc_khz = div64_u64(freq, 1000);
>> +}
>> +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
>> +
>> +void register_hv_tsc_update(void (*cb)(void))
>> +{
>
> The function name seems unfortunate. IMHO such a name suggests
> registering a callback on a notifier chain (rather than unconditionally
> replacing the old callback), and having no other side effects.

I see, any suggestion? register_hv_reenlightenment_cb? register_hv_tscchange_cb?

>
>> + struct hv_reenlightenment_control re_ctrl = {
>> + .vector = HYPERV_REENLIGHTENMENT_VECTOR,
>> + .enabled = 1,
>> + .target_vp = hv_vp_index[smp_processor_id()]
>> + };
>> + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
>> +
>> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
>> + return;
>
> What happens then? L2 guests keep running with their clocks ticking at
> a different speed?
>

In reallity this never happens -- in case nested virtualization is
supported reenlightenment is also available. In theory, L0 can emulate
TSC acceess for forever after migration.

>> +
>> + hv_reenlightenment_cb = cb;
>> +
>> + /* Make sure callback is registered before we write to MSRs */
>> + wmb();
>> +
>> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
>> + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
>> +}
>> +EXPORT_SYMBOL_GPL(register_hv_tsc_update);
>> +
>> +void unregister_hv_tsc_update(void)
>> +{
>> + struct hv_reenlightenment_control re_ctrl;
>> +
>> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
>> + return;
>> +
>> + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
>> + re_ctrl.enabled = 0;
>> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
>> +
>> + hv_reenlightenment_cb = NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update);
>> +
>> +asmlinkage __visible void
>> +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs)
>> +{
>> + entering_ack_irq();
>> +
>> + schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
>> +
>> + exiting_irq();
>> +}
>> +
>> /*
>> * This function is to be invoked early in the boot sequence after the
>> * hypervisor has been detected.
>> diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
>> index 416422762845..eb936cc49b62 100644
>> --- a/arch/x86/include/asm/entry_arch.h
>> +++ b/arch/x86/include/asm/entry_arch.h
>> @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
>> BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
>> #endif
>> #endif
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR)
>> +#endif
>> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
>> index 2851077b6051..c65193dac7d9 100644
>> --- a/arch/x86/include/asm/hw_irq.h
>> +++ b/arch/x86/include/asm/hw_irq.h
>> @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
>> extern asmlinkage void kvm_posted_intr_nested_ipi(void);
>> extern asmlinkage void error_interrupt(void);
>> extern asmlinkage void irq_work_interrupt(void);
>> +extern asmlinkage void hyperv_reenlightenment_intr(void);
>>
>> extern asmlinkage void spurious_interrupt(void);
>> extern asmlinkage void thermal_interrupt(void);
>> diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
>> index 67421f649cfa..e71c1120426b 100644
>> --- a/arch/x86/include/asm/irq_vectors.h
>> +++ b/arch/x86/include/asm/irq_vectors.h
>> @@ -103,7 +103,12 @@
>> #endif
>>
>> #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
>> -#define LOCAL_TIMER_VECTOR 0xee
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee
>> +#endif
>> +
>> +#define LOCAL_TIMER_VECTOR 0xed
>>
>> #define NR_VECTORS 256
>>
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index a0b34994f453..43164b097585 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void);
>> void hyperv_report_panic(struct pt_regs *regs, long err);
>> bool hv_is_hypercall_page_setup(void);
>> void hyperv_cleanup(void);
>> +
>> +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs);
>> +void register_hv_tsc_update(void (*cb)(void));
>> +void unregister_hv_tsc_update(void);
>> +void hyperv_stop_tsc_emulation(void);
>> #else /* CONFIG_HYPERV */
>> static inline void hyperv_init(void) {}
>> static inline bool hv_is_hypercall_page_setup(void) { return false; }
>> static inline void hyperv_cleanup(void) {}
>> static inline void hyperv_setup_mmu_ops(void) {}
>> +static inline void register_hv_tsc_update(void (*cb)(void)) {}
>> +static inline void unregister_hv_tsc_update(void) {}
>> +static inline void hyperv_stop_tsc_emulation(void) {};
>> #endif /* CONFIG_HYPERV */
>>
>> #ifdef CONFIG_HYPERV_TSCPAGE
>> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
>> index 1a5bfead93b4..197c2e6c7376 100644
>> --- a/arch/x86/include/uapi/asm/hyperv.h
>> +++ b/arch/x86/include/uapi/asm/hyperv.h
>> @@ -40,6 +40,9 @@
>> */
>> #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
>>
>> +/* AccessReenlightenmentControls privilege */
>> +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
>> +
>> /*
>> * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
>> * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
>> @@ -234,6 +237,30 @@
>> #define HV_X64_MSR_CRASH_PARAMS \
>> (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
>>
>> +/* TSC emulation after migration */
>> +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
>> +
>> +struct hv_reenlightenment_control {
>> + u64 vector:8;
>> + u64 reserved1:8;
>> + u64 enabled:1;
>> + u64 reserved2:15;
>> + u64 target_vp:32;
>> +};
>> +
>> +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
>> +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
>> +
>> +struct hv_tsc_emulation_control {
>> + u64 enabled:1;
>> + u64 reserved:63;
>> +};
>> +
>> +struct hv_tsc_emulation_status {
>> + u64 inprogress:1;
>> + u64 reserved:63;
>> +};
>> +
>> #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
>> #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
>> #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
>> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
>> index d985cef3984f..5b8512d48aa3 100644
>> --- a/arch/x86/kernel/idt.c
>> +++ b/arch/x86/kernel/idt.c
>> @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = {
>> # ifdef CONFIG_IRQ_WORK
>> INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
>> # endif
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> + INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr),
>> +#endif
>> INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
>> INTG(ERROR_APIC_VECTOR, error_interrupt),
>> #endif
>
> Roman.

--
Vitaly