Re: [PATCH v3 7/7] x86/kvm: use Enlightened VMCS when running on Hyper-V

From: Vitaly Kuznetsov
Date: Thu Mar 15 2018 - 05:56:52 EST


Paolo Bonzini <pbonzini@xxxxxxxxxx> writes:

> On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
>> Enlightened VMCS is just a structure in memory, the main benefit
>> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
>> mask: we tell the underlying hypervisor which fields were modified
>> since VMEXIT so there's no need to inspect them all.
>>
>> Tight CPUID loop test shows significant speedup:
>> Before: 18890 cycles
>> After: 8304 cycles
>>
>> Static key is being used to avoid performance penalty for non-Hyper-V
>> deployments. Tests show we add around 3 (three) CPU cycles on each
>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> but I don't see a clean way to use static key in assembly.
>
> If you want to live dangerously, you can use text_poke_early to change
> the vmwrite to mov. It's just a single instruction, so it's probably
> not too hard.
>

I'd say it's not worth it ...

>> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
>> ---
>> Changes since v2:
>> - define KVM_EVMCS_VERSION [Radim KrÄmÃÅ]
>> - WARN_ONCE in get_evmcs_offset[,_cf] [Radim KrÄmÃÅ]
>> - add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
>> dump_vmcs() [Radim KrÄmÃÅ]
>> ---
>> arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 615 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 051dab74e4e9..44b6efa7d54e 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -53,6 +53,7 @@
>> #include <asm/mmu_context.h>
>> #include <asm/microcode.h>
>> #include <asm/nospec-branch.h>
>> +#include <asm/mshyperv.h>
>>
>> #include "trace.h"
>> #include "pmu.h"
>> @@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = {
>> MSR_EFER, MSR_TSC_AUX, MSR_STAR,
>> };
>>
>> +DEFINE_STATIC_KEY_FALSE(enable_evmcs);
>> +
>> +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +static bool __read_mostly enlightened_vmcs = true;
>> +module_param(enlightened_vmcs, bool, 0444);
>> +
>> +#define KVM_EVMCS_VERSION 1
>> +
>> +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
>> +#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \
>> + (u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16)
>> +
>> +/*
>> + * Lower 16 bits encode offset of the field in struct hv_enlightened_vmcs,
>> + * upped 16 bits hold clean field mask.
>> + */
>> +static const u32 vmcs_field_to_evmcs_1[] = {
>> + /* 64 bit rw */
>> + EVMCS1_FIELD(GUEST_RIP, guest_rip,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
>
> Maybe we should use a single "#include"d file (like vmx_shadow_fields.h)
> and share it between HV-on-KVM and KVM-on-HV.
>
> ...

Actually, yes, looking at 13k+ lines of code in vmx.c makes me think
it's time we start doing something about it :-)

>
>> + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>
> We shouldn't use these on Hyper-V, should we (that is, shouldn't the
> WARN below fire if you try---and so why include them at all)?
>

True, these shouldn't be used and that's why there's no clean field
assigned to them. They, however, do have a corresponding eVMCS field.
I will try removing them in next version.

>> +
>> +static inline u16 get_evmcs_offset(unsigned long field)
>> +{
>> + unsigned int index = ROL16(field, 6);
>> +
>> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> + WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n",
>> + field);
>> + return 0;
>> + }
>> +
>> + return (u16)vmcs_field_to_evmcs_1[index];
>> +}
>> +
>> +static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field)
>> +{
>> + unsigned int index = ROL16(field, 6);
>> + u32 evmcs_field;
>> +
>> + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> + WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n",
>> + field);
>> + return 0;
>> + }
>> +
>> + evmcs_field = vmcs_field_to_evmcs_1[index];
>> +
>> + *clean_field = evmcs_field >> 16;
>> +
>> + return (u16)evmcs_field;
>> +}
>
> You can mark this __always_inline, and make it
>
> if (clean_field)
> *clean_field = evmcs_field >> 16;
>
> or alternatively, use a two-element struct and do
>
> evmcs_field = &vmcs_field_to_evmcs_1[index];
> if (clean_field)
> *clean_field = evmcs_field->clean_field;
> return evmcs_field->offset;
>
> Also, if you return int and make the WARN_ONCE case return -ENOENT, GCC
> should be able to optimize out the "if (!offset)" (which becomes "if
> (offset < 0)") in the callers. Nitpicking, but...
>

Ok, good suggestion, I'll try.

>> +static void vmcs_load_enlightened(u64 phys_addr)
>> +{
>> + struct hv_vp_assist_page *vp_ap =
>> + hv_get_vp_assist_page(smp_processor_id());
>> +
>> + vp_ap->current_nested_vmcs = phys_addr;
>> + vp_ap->enlighten_vmentry = 1;
>> +}
>
> evmcs_load?
>

Works for me,

>> +static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl,
>> + u32 *pin_based_exec_ctrl)
>> +{
>> + *pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
>> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>> + *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
>
> How can these be set?
>

They can not if Hyper-V behaves but Radim didn't want to trust it -- so
the suggestion was to forcefully disable unsupported controls.

>> @@ -3596,6 +4104,14 @@ static int hardware_enable(void)
>> if (cr4_read_shadow() & X86_CR4_VMXE)
>> return -EBUSY;
>>
>> + /*
>> + * This can happen if we hot-added a CPU but failed to allocate
>> + * VP assist page for it.
>> + */
>> + if (static_branch_unlikely(&enable_evmcs) &&
>> + !hv_get_vp_assist_page(cpu))
>> + return -EFAULT;
>
> -ENODEV? Maybe add a printk, because this is really rare.
>

Ok,

>> INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
>> INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
>> spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
>> @@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>> vmcs_conf->size = vmx_msr_high & 0x1fff;
>> vmcs_conf->order = get_order(vmcs_conf->size);
>> vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
>> - vmcs_conf->revision_id = vmx_msr_low;
>> +
>> + /* KVM supports Enlightened VMCS v1 only */
>> + if (static_branch_unlikely(&enable_evmcs))
>> + vmcs_conf->revision_id = KVM_EVMCS_VERSION;
>> + else
>> + vmcs_conf->revision_id = vmx_msr_low;
>>
>> vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
>> vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
>> @@ -6990,6 +7511,17 @@ static __init int hardware_setup(void)
>> goto out;
>> }
>>
>> + if (static_branch_unlikely(&enable_evmcs)) {
>> + evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl,
>> + &vmcs_config.pin_based_exec_ctrl);
>
> Why not do it in setup_vmcs_config after the vmcs_conf->vmentry_ctrl
> assignment (and pass &vmcs_config, which there is "vmcs_conf", directly
> to the function)? And if sanitizing clears the bits in vmentry_ctl and
> vmexit_ctl, there's no need to clear cpu_has_load_perf_global_ctrl.
>

Ok, if we decide to keep 'sanitization' in place.

>> + /*
>> + * Enlightened VMCSv1 doesn't support these:
>> + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
>> + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
>> + */
>> + cpu_has_load_perf_global_ctrl = false;> + }
>> +
>> if (boot_cpu_has(X86_FEATURE_NX))
>> kvm_enable_efer_bits(EFER_NX);
>>
>> @@ -8745,6 +9277,10 @@ static void dump_vmcs(void)
>> if (cpu_has_secondary_exec_ctrls())
>> secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>>
>> + if (static_branch_unlikely(&enable_evmcs))
>> + evmcs_sanitize_exec_ctrls(&secondary_exec_control,
>> + &pin_based_exec_ctrl);
>
> This is wrong, we're reading the VMCS so the values must already be
> sanitized (and if not, that's the bug and we want dump_vmcs to print the
> "wrong" values).

The problem is that we vmcs_read these fields later in the function and
this will now WARN(). Initally, there was no WARN() for non-existent
fields so this could work (we would just print zeroes for unsupported
fields). Maybe, additional WARN_ON() is not a big deal here.

In reality, these controls should never be set.

>
>> pr_err("*** Guest State ***\n");
>> pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
>> vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
>> @@ -8784,7 +9320,8 @@ static void dump_vmcs(void)
>> pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
>> vmcs_read64(GUEST_IA32_DEBUGCTL),
>> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
>> - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>> + if (cpu_has_load_perf_global_ctrl &&
>> + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>> pr_err("PerfGlobCtl = 0x%016llx\n",
>> vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
>> if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
>> @@ -8820,7 +9357,8 @@ static void dump_vmcs(void)
>> pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
>> vmcs_read64(HOST_IA32_EFER),
>> vmcs_read64(HOST_IA32_PAT));
>> - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>> + if (cpu_has_load_perf_global_ctrl &&
>> + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>> pr_err("PerfGlobCtl = 0x%016llx\n",
>> vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
>>
>> @@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
>> static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> {
>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>> - unsigned long cr3, cr4;
>> + unsigned long cr3, cr4, evmcs_rsp;
>>
>> /* Record the guest's net vcpu time for enforced NMI injections. */
>> if (unlikely(!enable_vnmi &&
>> @@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>>
>> vmx->__launched = vmx->loaded_vmcs->launched;
>> +
>> + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
>> + (unsigned long)&current_evmcs->host_rsp : 0;
>
> (If you use text_poke_early, you can do this assignment unconditionally,
> since it's just a single lea instruction).
>

Something to take a look at)

>> @@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>> /* Eliminate branch target predictions from guest mode */
>> vmexit_fill_RSB();
>>
>> + /* All fields are clean at this point */
>> + if (static_branch_unlikely(&enable_evmcs))
>> + current_evmcs->hv_clean_fields |=
>> + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
>> +
>> /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
>> if (vmx->host_debugctlmsr)
>> update_debugctlmsr(vmx->host_debugctlmsr);
>> @@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>>
>> static int __init vmx_init(void)
>> {
>> - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>> + int r;
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> + /*
>> + * Enlightened VMCS usage should be recommended and the host needs
>> + * to support eVMCS v1 or above. We can also disable eVMCS support
>> + * with module parameter.
>> + */
>> + if (enlightened_vmcs &&
>> + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
>> + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
>> + KVM_EVMCS_VERSION) {
>> + int cpu;
>> +
>> + /* Check that we have assist pages on all online CPUs */
>> + for_each_online_cpu(cpu) {
>> + if (!hv_get_vp_assist_page(cpu)) {
>> + enlightened_vmcs = false;
>> + break;
>> + }
>> + }
>> + if (enlightened_vmcs) {
>> + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
>> + static_branch_enable(&enable_evmcs);
>> + }
>> + }
>
> A bit nicer to clear enlightened_vmcs in the "else" branch?

Yes, as a precaution, why not. (But we should solely rely on
'enable_evmcs' later on).

>
> That's it. Nice work!
>
> Paolo
>
>> +#endif
>> +
>> + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>> __alignof__(struct vcpu_vmx), THIS_MODULE);
>> if (r)
>> return r;
>> @@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void)
>> #endif
>>
>> kvm_exit();
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> + if (static_branch_unlikely(&enable_evmcs)) {
>> + int cpu;
>> + struct hv_vp_assist_page *vp_ap;
>> + /*
>> + * Reset everything to support using non-enlightened VMCS
>> + * access later (e.g. when we reload the module with
>> + * enlightened_vmcs=0)
>> + */
>> + for_each_online_cpu(cpu) {
>> + vp_ap = hv_get_vp_assist_page(cpu);
>> +
>> + if (!vp_ap)
>> + continue;
>> +
>> + vp_ap->current_nested_vmcs = 0;
>> + vp_ap->enlighten_vmentry = 0;
>> + }
>> +
>> + static_branch_disable(&enable_evmcs);
>> + }
>> +#endif
>> }
>>
>> module_init(vmx_init)
>>

--
Vitaly