Re: [PATCH char-misc 1/1] Drivers: hv: vmbus: Implement Direct Mode for stimer0
From: Vitaly Kuznetsov
Date: Wed Nov 01 2017 - 09:41:21 EST
Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> writes:
> mikelley@xxxxxxxxxxxxxxxxxxxxxx writes:
>
>> From: Michael Kelley <mikelley@xxxxxxxxxxxxx>
>>
>> The 2016 version of Hyper-V offers the option to operate the guest VM
>> per-vcpu stimer's in Direct Mode, which means the timer interupts on its
>> own vector rather than queueing a VMbus message. Direct Mode reduces
>> timer processing overhead in both the hypervisor and the guest, and
>> avoids having timer interrupts pollute the VMbus interrupt stream for
>> the synthetic NIC and storage. This patch enables Direct Mode by
>> default on stimer0 (which is the only stimer used in Linux on Hyper-V)
>> when running on a version of Hyper-V that supports it, with a hv_vmbus
>> module parameter for disabling Direct Mode and reverting to the old
>> behavior.
>>
>> Signed-off-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>
>> ---
>> arch/x86/include/asm/mshyperv.h | 14 ++++++++
>> arch/x86/include/uapi/asm/hyperv.h | 26 ++++++++++++++
>> arch/x86/kernel/cpu/mshyperv.c | 64 +++++++++++++++++++++++++++++++++-
>> drivers/hv/hv.c | 71 ++++++++++++++++++++++++++++++++++++--
>> drivers/hv/hyperv_vmbus.h | 4 ++-
>> 5 files changed, 175 insertions(+), 4 deletions(-)
>>
(in addition to my previous comment)
This patch is also x86-heavy so it probably makes sense to make x86
maintainers (Thomas, Peter, Ingo) aware no matter which tree it will go
through.
CC: x86@xxxxxxxxxx
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index 740dc97..1bba1d7 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -4,6 +4,8 @@
>> #include <linux/types.h>
>> #include <linux/atomic.h>
>> #include <linux/nmi.h>
>> +#include <linux/irq.h>
>> +#include <linux/irqdesc.h>
>> #include <asm/io.h>
>> #include <asm/hyperv.h>
>>
>> @@ -374,4 +376,16 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
>> return NULL;
>> }
>> #endif
>> +
>> +/* Per architecture routines for stimer0 Direct Mode handling. On x86/x64
>> + * there are no percpu actions to take.
>> + */
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +static inline void hv_enable_stimer0_percpu_irq(int irq) { }
>> +static inline void hv_disable_stimer0_percpu_irq(int irq) { }
>> +extern int hv_allocate_stimer0_irq(int *irq, int *vector);
>> +extern void hv_deallocate_stimer0_irq(int irq);
>> +extern void hv_ack_stimer0_interrupt(struct irq_desc *desc);
>> +#endif
>> +
>> #endif
>> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
>> index f65d125..408cf3e 100644
>> --- a/arch/x86/include/uapi/asm/hyperv.h
>> +++ b/arch/x86/include/uapi/asm/hyperv.h
>> @@ -112,6 +112,22 @@
>> #define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
>> /* Guest crash data handler available */
>> #define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
>> +/* Debug MSRs available */
>> +#define HV_X64_DEBUG_MSR_AVAILABLE (1 << 11)
>> +/* Support for Non-Privileged Instruction Execution Prevention is available */
>> +#define HV_X64_NPIEP_AVAILABLE (1 << 12)
>> +/* Support for DisableHypervisor is available */
>> +#define HV_X64_DISABLE_HYPERVISOR_AVAILABLE (1 << 13)
>> +/* Extended GVA Ranges for Flush Virtual Address list is available */
>> +#define HV_X64_EXTENDED_GVA_RANGE_AVAILABLE (1 << 14)
>> +/* Return Hypercall output via XMM registers is available */
>> +#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE (1 << 15)
>> +/* SINT polling mode available */
>> +#define HV_X64_SINT_POLLING_MODE_AVAILABLE (1 << 17)
>> +/* Hypercall MSR lock is available */
>> +#define HV_X64_HYPERCALL_MSR_LOCK_AVAILABLE (1 << 18)
>> +/* stimer direct mode is available */
>> +#define HV_X64_STIMER_DIRECT_MODE_AVAILABLE (1 << 19)
>>
>> /*
>> * Implementation recommendations. Indicates which behaviors the hypervisor
>> @@ -300,6 +316,16 @@ enum HV_GENERIC_SET_FORMAT {
>>
>> #define HV_SYNIC_STIMER_COUNT (4)
>>
>> +/* Hardware IRQ number to use for stimer0 in Direct Mode. This IRQ is a fake
>> + * because stimer's in Direct Mode simply interrupt on the specified vector,
>> + * without using a particular IOAPIC pin. But we use the IRQ allocation
>> + * machinery, so we need a hardware IRQ #. This value is somewhat arbitrary,
>> + * but it should not be a legacy IRQ (0 to 15), and should fit within the
>> + * single IOAPIC (0 to 23) that Hyper-V provides to a guest VM. So any value
>> + * between 16 and 23 should be good.
>> + */
>
> (nitpick): all comments in the patch are in 'net' style:
>
> /* This is a
> * comment.
> */
>
> While in Hyper-V files (and x86 in general) the 'normal' style is used:
>
> /*
> * This is a
> * comment.
> */
>
> But checkpatch.pl doesn't complain.
>
>> +#define HV_STIMER0_IRQNR 18
>> +
>> /* Define synthetic interrupt controller message constants. */
>> #define HV_MESSAGE_SIZE (256)
>> #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
>> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
>> index 236324e8..88dc243 100644
>> --- a/arch/x86/kernel/cpu/mshyperv.c
>> +++ b/arch/x86/kernel/cpu/mshyperv.c
>> @@ -19,7 +19,10 @@
>> #include <linux/efi.h>
>> #include <linux/interrupt.h>
>> #include <linux/irq.h>
>> +#include <linux/irqdomain.h>
>> +#include <linux/irqdesc.h>
>> #include <linux/kexec.h>
>> +#include <linux/acpi.h>
>> #include <asm/processor.h>
>> #include <asm/hypervisor.h>
>> #include <asm/hyperv.h>
>> @@ -27,6 +30,7 @@
>> #include <asm/desc.h>
>> #include <asm/irq_regs.h>
>> #include <asm/i8259.h>
>> +#include <asm/irqdomain.h>
>> #include <asm/apic.h>
>> #include <asm/timer.h>
>> #include <asm/reboot.h>
>> @@ -69,6 +73,64 @@ void hv_remove_vmbus_irq(void)
>> EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
>> EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
>>
>> +
>> +/* Routines to do per-architecture handling of stimer0 when in Direct Mode */
>> +
>> +void hv_ack_stimer0_interrupt(struct irq_desc *desc)
>> +{
>> + ack_APIC_irq();
>> +}
>> +
>> +static void allonline_vector_allocation_domain(int cpu, struct cpumask *retmask,
>> + const struct cpumask *mask)
>> +{
>> + cpumask_copy(retmask, cpu_online_mask);
>> +}
>> +
>> +int hv_allocate_stimer0_irq(int *irq, int *vector)
>> +{
>> + int localirq;
>> + int result;
>> + struct irq_data *irq_data;
>> +
>> + /* The normal APIC vector allocation domain allows allocation of vectors
>> + * only for the calling CPU. So we change the allocation domain to one
>> + * that allows vectors to be allocated in all online CPUs. This
>> + * change is fine in a Hyper-V VM because VMs don't have the usual
>> + * complement of interrupting devices.
>> + */
>> + apic->vector_allocation_domain = allonline_vector_allocation_domain;
>> + localirq = acpi_register_gsi(NULL, HV_STIMER0_IRQNR,
>> + ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH);
>> + if (localirq < 0) {
>> + pr_err("Cannot register stimer0 gsi. Error %d", localirq);
>> + return -1;
>> + }
>> +
>> + /* We pass in a dummy IRQ handler because architecture independent code
>> + * will later override the IRQ domain interrupt handler and set it to a
>> + * Hyper-V specific handler.
>> + */
>> + result = request_irq(localirq, (irq_handler_t)(-1), 0,
>> + "Hyper-V stimer0", NULL);
>
> I remember K. Y. told me he has patches in his stash to implement
> Hyper-V-specific APIC. Is this still in works or there is a reason to
> not do it? Just curious.
>
>> + if (result) {
>> + pr_err("Cannot request stimer0 irq. Error %d", result);
>> + acpi_unregister_gsi(localirq);
>> + return -1;
>> + }
>> + irq_data = irq_domain_get_irq_data(x86_vector_domain, localirq);
>
> In theory irq_domain_get_irq_data() can return NULL.
>
>> + *vector = irqd_cfg(irq_data)->vector;
>> + *irq = localirq;
>> + return 0;
>> +}
>> +
>> +void hv_deallocate_stimer0_irq(int irq)
>> +{
>> + free_irq(irq, NULL);
>> + acpi_unregister_gsi(irq);
>> +}
>> +
>> +
>> void hv_setup_kexec_handler(void (*handler)(void))
>> {
>> hv_kexec_handler = handler;
>> @@ -195,7 +257,7 @@ static void __init ms_hyperv_init_platform(void)
>> hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
>> hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
>>
>> - pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
>> + pr_info("Hyper-V: Host Build %d-%d.%d-%d-%d.%d\n",
>
> While this definitely makes logs nicer uhe change is probably not needed
> in this particular patch.
>
>> hv_host_info_eax, hv_host_info_ebx >> 16,
>> hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
>> hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
>> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
>> index fe96aab..68ac474 100644
>> --- a/drivers/hv/hv.c
>> +++ b/drivers/hv/hv.c
>> @@ -27,8 +27,12 @@
>> #include <linux/vmalloc.h>
>> #include <linux/hyperv.h>
>> #include <linux/version.h>
>> -#include <linux/interrupt.h>
>> +#include <linux/irq.h>
>> +#include <linux/irqdesc.h>
>> +#include <linux/random.h>
>> +#include <linux/kernel_stat.h>
>> #include <linux/clockchips.h>
>> +#include <linux/module.h>
>> #include <asm/hyperv.h>
>> #include <asm/mshyperv.h>
>> #include "hyperv_vmbus.h"
>> @@ -38,6 +42,21 @@ struct hv_context hv_context = {
>> .synic_initialized = false,
>> };
>>
>> +/* If true, we're using Direct Mode for stimer0, and the timer will do it own
>> + * interrupt when it expires. If false, stimer0 is not using Direct Mode and
>> + * will send a VMbus message when it expires. We prefer to use Direct Mode,
>> + * but not all versions of Hyper-V support Direct Mode.
>> + *
>> + * While Hyper-V provides a total of four stimer's per CPU, Linux use only
>> + * stimer0.
>> + */
>> +static bool stimer_direct_mode;
>> +static int stimer0_irq;
>> +static int stimer0_vector;
>> +static bool direct_mode_disable;
>> +module_param(direct_mode_disable, bool, 0444);
>> +MODULE_PARM_DESC(direct_mode_disable, "Set to Y to disable Direct Mode.");
>> +
>> #define HV_TIMER_FREQUENCY (10 * 1000 * 1000) /* 100ns period */
>> #define HV_MAX_MAX_DELTA_TICKS 0xffffffff
>> #define HV_MIN_DELTA_TICKS 1
>> @@ -52,7 +71,12 @@ int hv_init(void)
>> hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
>> if (!hv_context.cpu_context)
>> return -ENOMEM;
>> + stimer_direct_mode = (ms_hyperv.misc_features &
>> + HV_X64_STIMER_DIRECT_MODE_AVAILABLE) ? true : false;
>>
>> + /* Apply boot command line override to the Direct Mode setting */
>> + if (direct_mode_disable)
>> + stimer_direct_mode = false;
>
> Having both direct_mode_disable and stimer_direct_mode seems redundant.
>
> we may write something like
>
> if (!direct_mode_disable)
> direct_mode_disable = (ms_hyperv.misc_features & HV_X64_STIMER_DIRECT_MODE_AVAILABLE) ? false : true;
>
> and use it everywhere.
>
>> return 0;
>> }
>>
>> @@ -91,6 +115,23 @@ int hv_post_message(union hv_connection_id connection_id,
>> return status & 0xFFFF;
>> }
>>
>> +/* ISR for when stimer0 is operating in Direct Mode. Direct Mode does
>> + * not use VMBus or any VMBus messages, so process here and not in the
>> + * VMBus driver code.
>> + */
>> +
>> +static void hv_stimer0_isr(struct irq_desc *desc)
>> +{
>> + struct hv_per_cpu_context *hv_cpu;
>> +
>> + __this_cpu_inc(*desc->kstat_irqs);
>> + __this_cpu_inc(kstat.irqs_sum);
>> + hv_ack_stimer0_interrupt(desc);
>> + hv_cpu = this_cpu_ptr(hv_context.cpu_context);
>> + hv_cpu->clk_evt->event_handler(hv_cpu->clk_evt);
>> + add_interrupt_randomness(desc->irq_data.irq, 0);
>
> AFAIU implementing Hyper-V specific IRQ chip would allow us to drop this
> as we would do this on 'normal' Linux IRQ path.
>
>> +}
>> +
>> static int hv_ce_set_next_event(unsigned long delta,
>> struct clock_event_device *evt)
>> {
>> @@ -108,6 +149,8 @@ static int hv_ce_shutdown(struct clock_event_device *evt)
>> {
>> hv_init_timer(HV_X64_MSR_STIMER0_COUNT, 0);
>> hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, 0);
>> + if (stimer_direct_mode)
>> + hv_disable_stimer0_percpu_irq(stimer0_irq);
>>
>
> Both hv_disable_stimer0_percpu_irq() and hv_enable_stimer0_percpu_irq()
> are no-ops, right? Why don't we mask/unmask the IRQ? I may have missed
> something...
>
>> return 0;
>> }
>> @@ -116,9 +159,24 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt)
>> {
>> union hv_timer_config timer_cfg;
>>
>> + timer_cfg.as_uint64 = 0; /* Zero everything */
>> timer_cfg.enable = 1;
>> timer_cfg.auto_enable = 1;
>> - timer_cfg.sintx = VMBUS_MESSAGE_SINT;
>> + if (stimer_direct_mode) {
>> +
>> + /* When it expires, the timer will directly interrupt
>> + * on the specific hardware vector.
>> + */
>> + timer_cfg.direct_mode = 1;
>> + timer_cfg.apic_vector = stimer0_vector;
>> + hv_enable_stimer0_percpu_irq(stimer0_irq);
>> + } else {
>> + /* When it expires, the timer will generate a VMbus message,
>> + * to be handled by the normal VMbus interrupt handler.
>> + */
>> + timer_cfg.direct_mode = 0;
>> + timer_cfg.sintx = VMBUS_MESSAGE_SINT;
>> + }
>> hv_init_timer_config(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
>>
>> return 0;
>> @@ -191,6 +249,12 @@ int hv_synic_alloc(void)
>> INIT_LIST_HEAD(&hv_cpu->chan_list);
>> }
>>
>> + if (stimer_direct_mode) {
>> + if (hv_allocate_stimer0_irq(&stimer0_irq, &stimer0_vector))
>> + goto err;
>> + irq_set_handler(stimer0_irq, hv_stimer0_isr);
>> + }
>> +
>> return 0;
>> err:
>> return -ENOMEM;
>> @@ -292,6 +356,9 @@ void hv_synic_clockevents_cleanup(void)
>> if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE))
>> return;
>>
>> + if (stimer_direct_mode)
>> + hv_deallocate_stimer0_irq(stimer0_irq);
>> +
>> for_each_present_cpu(cpu) {
>> struct hv_per_cpu_context *hv_cpu
>> = per_cpu_ptr(hv_context.cpu_context, cpu);
>> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
>> index de6f01d..ee8c89b 100644
>> --- a/drivers/hv/hyperv_vmbus.h
>> +++ b/drivers/hv/hyperv_vmbus.h
>> @@ -55,7 +55,9 @@
>> u64 periodic:1;
>> u64 lazy:1;
>> u64 auto_enable:1;
>> - u64 reserved_z0:12;
>> + u64 apic_vector:8;
>> + u64 direct_mode:1;
>> + u64 reserved_z0:3;
>> u64 sintx:4;
>> u64 reserved_z1:44;
>> };
--
Vitaly