Re: [PATCH] x86: vmx: Allow direct access to MSR_IA32_SPEC_CTRL

From: Jim Mattson
Date: Mon Jan 29 2018 - 14:27:15 EST


On Mon, Jan 29, 2018 at 11:16 AM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
> On Mon, Jan 29, 2018 at 10:43:22AM -0800, Jim Mattson wrote:
>> On Sun, Jan 28, 2018 at 11:29 AM, KarimAllah Ahmed <karahmed@xxxxxxxxx> wrote:
>> > Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests
>> > that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a
>> > retpoline+IBPB based approach.
>> >
>> > To avoid the overhead of atomically saving and restoring the MSR_IA32_SPEC_CTRL
>> > for guests that do not actually use the MSR, only add_atomic_switch_msr when a
>> > non-zero value is written to it.
>> >
>> > Cc: Asit Mallick <asit.k.mallick@xxxxxxxxx>
>> > Cc: Arjan Van De Ven <arjan.van.de.ven@xxxxxxxxx>
>> > Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
>> > Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
>> > Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
>> > Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
>> > Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
>> > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>> > Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
>> > Cc: Jun Nakajima <jun.nakajima@xxxxxxxxx>
>> > Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>> > Cc: David Woodhouse <dwmw@xxxxxxxxxxxx>
>> > Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
>> > Cc: Andy Lutomirski <luto@xxxxxxxxxx>
>> > Signed-off-by: KarimAllah Ahmed <karahmed@xxxxxxxxx>
>> > Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
>> > ---
>> > arch/x86/kvm/cpuid.c | 4 +++-
>> > arch/x86/kvm/cpuid.h | 1 +
>> > arch/x86/kvm/vmx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> > 3 files changed, 67 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> > index 0099e10..dc78095 100644
>> > --- a/arch/x86/kvm/cpuid.c
>> > +++ b/arch/x86/kvm/cpuid.c
>> > @@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void)
>> > /* These are scattered features in cpufeatures.h. */
>> > #define KVM_CPUID_BIT_AVX512_4VNNIW 2
>> > #define KVM_CPUID_BIT_AVX512_4FMAPS 3
>> > +#define KVM_CPUID_BIT_SPEC_CTRL 26
>> > #define KF(x) bit(KVM_CPUID_BIT_##x)
>> >
>> > int kvm_update_cpuid(struct kvm_vcpu *vcpu)
>> > @@ -392,7 +393,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>> >
>> > /* cpuid 7.0.edx*/
>> > const u32 kvm_cpuid_7_0_edx_x86_features =
>> > - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
>> > + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | \
>> > + (boot_cpu_has(X86_FEATURE_SPEC_CTRL) ? KF(SPEC_CTRL) : 0);
>>
>> Isn't 'boot_cpu_has()' superfluous here? And aren't there two bits to
>> pass through for existing CPUs (26 and 27)?
>>
>> >
>> > /* all calls to cpuid_count() should be made on the same cpu */
>> > get_cpu();
>> > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
>> > index cdc70a3..dcfe227 100644
>> > --- a/arch/x86/kvm/cpuid.h
>> > +++ b/arch/x86/kvm/cpuid.h
>> > @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
>> > [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
>> > [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
>> > [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
>> > + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
>> > };
>> >
>> > static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > index aa8638a..1b743a0 100644
>> > --- a/arch/x86/kvm/vmx.c
>> > +++ b/arch/x86/kvm/vmx.c
>> > @@ -920,6 +920,9 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
>> > static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
>> > u16 error_code);
>> > static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
>> > +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
>> > + u32 msr, int type);
>> > +
>> >
>> > static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>> > static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>> > @@ -2007,6 +2010,28 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > m->host[i].value = host_val;
>> > }
>> >
>> > +/* do not touch guest_val and host_val if the msr is not found */
>> > +static int read_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
>> > + u64 *guest_val, u64 *host_val)
>> > +{
>> > + unsigned i;
>> > + struct msr_autoload *m = &vmx->msr_autoload;
>> > +
>> > + for (i = 0; i < m->nr; ++i)
>> > + if (m->guest[i].index == msr)
>> > + break;
>> > +
>> > + if (i == m->nr)
>> > + return 1;
>> > +
>> > + if (guest_val)
>> > + *guest_val = m->guest[i].value;
>> > + if (host_val)
>> > + *host_val = m->host[i].value;
>> > +
>> > + return 0;
>> > +}
>> > +
>> > static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
>> > {
>> > u64 guest_efer = vmx->vcpu.arch.efer;
>> > @@ -3203,7 +3228,9 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
>> > */
>> > static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > {
>> > + u64 spec_ctrl = 0;
>> > struct shared_msr_entry *msr;
>> > + struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >
>> > switch (msr_info->index) {
>> > #ifdef CONFIG_X86_64
>> > @@ -3223,6 +3250,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > msr_info->data = guest_read_tsc(vcpu);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>>
>> Shouldn't this conjunct be:
>> !(guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
>> guest_cpuid_has(vcpu, X86_FEATURE_STIBP))?
>>
>> > + return 1;
>>
>> What if !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)? That should also return 1, I think.
>>
>> > +
>> > + /*
>> > + * If the MSR is not in the atomic list yet, then it was never
>> > + * written to. So the MSR value will be '0'.
>> > + */
>> > + read_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, &spec_ctrl, NULL);
>>
>> Why not just add msr_ia32_spec_ctrl to struct vcpu_vmx, so that you
>> don't have to search the atomic switch list?
>>
>> > +
>> > + msr_info->data = spec_ctrl;
>> > + break;
>> > case MSR_IA32_SYSENTER_CS:
>> > msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
>> > break;
>> > @@ -3289,6 +3329,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > int ret = 0;
>> > u32 msr_index = msr_info->index;
>> > u64 data = msr_info->data;
>> > + unsigned long *msr_bitmap;
>> > +
>> > + /*
>> > + * IBRS is not used (yet) to protect the host. Once it does, this
>> > + * variable needs to be a bit smarter.
>> > + */
>> > + u64 host_spec_ctrl = 0;
>> >
>> > switch (msr_index) {
>> > case MSR_EFER:
>> > @@ -3330,6 +3377,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> > case MSR_IA32_TSC:
>> > kvm_write_tsc(vcpu, msr_info);
>> > break;
>> > + case MSR_IA32_SPEC_CTRL:
>> > + if (!msr_info->host_initiated &&
>> > + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>> > + return 1;
>>
>> This looks incomplete. As above, what if
>> !boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
>> !boot_cpu_has(X86_FEATURE_STIBP)?
>> If the host doesn't support MSR_IA32_SPEC_CTRL, you'll get a VMX-abort
>> on loading the host MSRs from the VM-exit MSR load list.
>
> Yikes, right it will #GP.

Worse; it will VMX-abort, which shuts down the logical CPU.

>>
>> Also, what if the value being written is illegal?
>
> You can write garbage and it won't #GP. Granted it should only read
> correct values (0,1,2,or 3).

That may depend on the processor. On HSX processors with ucode 0x3b, I
find that you can write bits 0, 1, and 2 without a #GP, but bits 63:3
do raise #GP. Nonetheless, the virtual CPU implemented by kvm only
supports bits 0 and 1, regardless of the underlying host support, so
it should raise #GP if bits 63:2 are set.

>
> Albeit the spec says nothing about it (except calling those regions reserved,
> which would imply - rdmsr first and then 'or' it with what you are wrmsr).
> That of course would not be the best choice :-(