Re: [PATCH v2 2/2] KVM: nSVM: improve SYSENTER emulation on AMD

From: Vitaly Kuznetsov
Date: Thu Apr 01 2021 - 14:31:45 EST


Maxim Levitsky <mlevitsk@xxxxxxxxxx> writes:

> Currently to support Intel->AMD migration, if CPU vendor is GenuineIntel,
> we emulate the full 64 value for MSR_IA32_SYSENTER_{EIP|ESP}
> msrs, and we also emulate the sysenter/sysexit instruction in long mode.
>
> (Emulator does still refuse to emulate sysenter in 64 bit mode, on the
> ground that the code for that wasn't tested and likely has no users)
>
> However when virtual vmload/vmsave is enabled, the vmload instruction will
> update these 32 bit msrs without triggering their msr intercept,
> which will lead to having stale values in kvm's shadow copy of these msrs,
> which relies on the intercept to be up to date.
>
> Fix/optimize this by doing the following:
>
> 1. Enable the MSR intercepts for SYSENTER MSRs iff vendor=GenuineIntel
> (This is both a tiny optimization and also ensures that in case
> the guest cpu vendor is AMD, the msrs will be 32 bit wide as
> AMD defined).
>
> 2. Store only high 32 bit part of these msrs on interception and combine
> it with hardware msr value on intercepted read/writes
> iff vendor=GenuineIntel.
>
> 3. Disable vmload/vmsave virtualization if vendor=GenuineIntel.
> (It is somewhat insane to set vendor=GenuineIntel and still enable
> SVM for the guest but well whatever).
> Then zero the high 32 bit parts when kvm intercepts and emulates vmload.
>
> Thanks a lot to Paulo Bonzini for helping me with fixing this in the most

s/Paulo/Paolo/ :-)

> correct way.
>
> This patch fixes nested migration of 32 bit nested guests, that was
> broken because incorrect cached values of SYSENTER msrs were stored in
> the migration stream if L1 changed these msrs with
> vmload prior to L2 entry.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> ---
> arch/x86/kvm/svm/svm.c | 99 +++++++++++++++++++++++++++---------------
> arch/x86/kvm/svm/svm.h | 6 +--
> 2 files changed, 68 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 271196400495..6c39b0cd6ec6 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -95,6 +95,8 @@ static const struct svm_direct_access_msrs {
> } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
> { .index = MSR_STAR, .always = true },
> { .index = MSR_IA32_SYSENTER_CS, .always = true },
> + { .index = MSR_IA32_SYSENTER_EIP, .always = false },
> + { .index = MSR_IA32_SYSENTER_ESP, .always = false },
> #ifdef CONFIG_X86_64
> { .index = MSR_GS_BASE, .always = true },
> { .index = MSR_FS_BASE, .always = true },
> @@ -1258,16 +1260,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
> if (kvm_vcpu_apicv_active(vcpu))
> avic_init_vmcb(svm);
>
> - /*
> - * If hardware supports Virtual VMLOAD VMSAVE then enable it
> - * in VMCB and clear intercepts to avoid #VMEXIT.
> - */
> - if (vls) {
> - svm_clr_intercept(svm, INTERCEPT_VMLOAD);
> - svm_clr_intercept(svm, INTERCEPT_VMSAVE);
> - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> - }
> -
> if (vgif) {
> svm_clr_intercept(svm, INTERCEPT_STGI);
> svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -2133,9 +2125,11 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
>
> ret = kvm_skip_emulated_instruction(vcpu);
>
> - if (vmload)
> + if (vmload) {
> nested_svm_vmloadsave(vmcb12, svm->vmcb);
> - else
> + svm->sysenter_eip_hi = 0;
> + svm->sysenter_esp_hi = 0;
> + } else
> nested_svm_vmloadsave(svm->vmcb, vmcb12);

Nitpicking: {} are now needed for both branches here.

>
> kvm_vcpu_unmap(vcpu, &map, true);
> @@ -2677,10 +2671,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
> break;
> case MSR_IA32_SYSENTER_EIP:
> - msr_info->data = svm->sysenter_eip;
> + msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
> + if (guest_cpuid_is_intel(vcpu))
> + msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
> break;
> case MSR_IA32_SYSENTER_ESP:
> - msr_info->data = svm->sysenter_esp;
> + msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
> + if (guest_cpuid_is_intel(vcpu))
> + msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
> break;
> case MSR_TSC_AUX:
> if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> @@ -2885,12 +2883,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
> svm->vmcb01.ptr->save.sysenter_cs = data;
> break;
> case MSR_IA32_SYSENTER_EIP:
> - svm->sysenter_eip = data;
> - svm->vmcb01.ptr->save.sysenter_eip = data;
> + svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
> + /*
> + * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
> + * when we spoof an Intel vendor ID (for cross vendor migration).
> + * In this case we use this intercept to track the high
> + * 32 bit part of these msrs to support Intel's
> + * implementation of SYSENTER/SYSEXIT.
> + */
> + svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;

(Personal taste) I'd suggest we keep the whole 'sysenter_eip'/'sysenter_esp'
even if we only use the upper 32 bits of it. That would reduce the code
churn a little bit (no need to change 'struct vcpu_svm').

> break;
> case MSR_IA32_SYSENTER_ESP:
> - svm->sysenter_esp = data;
> - svm->vmcb01.ptr->save.sysenter_esp = data;
> + svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
> + svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
> break;
> case MSR_TSC_AUX:
> if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> @@ -4009,24 +4014,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
> }
>
> - if (!kvm_vcpu_apicv_active(vcpu))
> - return;
> + if (kvm_vcpu_apicv_active(vcpu)) {
> + /*
> + * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
> + * is exposed to the guest, disable AVIC.
> + */
> + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
> + kvm_request_apicv_update(vcpu->kvm, false,
> + APICV_INHIBIT_REASON_X2APIC);
>
> - /*
> - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
> - * is exposed to the guest, disable AVIC.
> - */
> - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
> - kvm_request_apicv_update(vcpu->kvm, false,
> - APICV_INHIBIT_REASON_X2APIC);
> + /*
> + * Currently, AVIC does not work with nested virtualization.
> + * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
> + */
> + if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
> + kvm_request_apicv_update(vcpu->kvm, false,
> + APICV_INHIBIT_REASON_NESTED);
> + }
>
> - /*
> - * Currently, AVIC does not work with nested virtualization.
> - * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
> - */
> - if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
> - kvm_request_apicv_update(vcpu->kvm, false,
> - APICV_INHIBIT_REASON_NESTED);
> + if (guest_cpuid_is_intel(vcpu)) {
> + /*
> + * We must intercept SYSENTER_EIP and SYSENTER_ESP
> + * accesses because the processor only stores 32 bits.
> + * For the same reason we cannot use virtual VMLOAD/VMSAVE.
> + */
> + svm_set_intercept(svm, INTERCEPT_VMLOAD);
> + svm_set_intercept(svm, INTERCEPT_VMSAVE);
> + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> +
> + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
> + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
> + } else {
> + /*
> + * If hardware supports Virtual VMLOAD VMSAVE then enable it
> + * in VMCB and clear intercepts to avoid #VMEXIT.
> + */
> + if (vls) {
> + svm_clr_intercept(svm, INTERCEPT_VMLOAD);
> + svm_clr_intercept(svm, INTERCEPT_VMSAVE);
> + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> + }
> + /* No need to intercept these MSRs */
> + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
> + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
> + }
> }
>
> static bool svm_has_wbinvd_exit(void)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 8e276c4fb33d..fffdd5fb446d 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -28,7 +28,7 @@ static const u32 host_save_user_msrs[] = {
> };
> #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
>
> -#define MAX_DIRECT_ACCESS_MSRS 18
> +#define MAX_DIRECT_ACCESS_MSRS 20
> #define MSRPM_OFFSETS 16
> extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
> extern bool npt_enabled;
> @@ -116,8 +116,8 @@ struct vcpu_svm {
> struct kvm_vmcb_info *current_vmcb;
> struct svm_cpu_data *svm_data;
> u32 asid;
> - uint64_t sysenter_esp;
> - uint64_t sysenter_eip;
> + u32 sysenter_esp_hi;
> + u32 sysenter_eip_hi;
> uint64_t tsc_aux;
>
> u64 msr_decfg;

--
Vitaly