Re: [PATCH v2 19/25] KVM: nVMX: Add FRED VMCS fields

From: Sean Christopherson
Date: Thu Jun 13 2024 - 14:29:52 EST


On Wed, Feb 07, 2024, Xin Li wrote:
> Add FRED VMCS fields to nested VMX context management.
>
> Todo: change VMCS12_REVISION, as struct vmcs12 is changed.

It actually doesn't, the comment is just stale. At this point, KVM must _never_
change VMCS12_REVISION as doing so will break backwards compatibility.

I'll post this once I've written a changelog:

---
arch/x86/kvm/vmx/vmcs12.h | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index edf7fcef8ccf..d67bebb9f1c2 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -207,11 +207,9 @@ struct __packed vmcs12 {
};

/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary id for the layout of struct vmcs12.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
+ * DO NOT change this value, as it will break save/restore compatibility with
* older kvm releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -225,7 +223,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE

/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)

base-commit: 878fe4c2f7eead383f2b306cbafd300006dd518c
--

> Signed-off-by: Xin Li <xin3.li@xxxxxxxxx>
> Tested-by: Shan Kang <shan.kang@xxxxxxxxx>
> ---
>
> Change since v1:
> * Remove hyperv TLFS related changes (Jeremi Piotrowski).
> * Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao).
> ---
> Documentation/virt/kvm/x86/nested-vmx.rst | 18 +++++
> arch/x86/kvm/vmx/nested.c | 91 +++++++++++++++++++----
> arch/x86/kvm/vmx/vmcs12.c | 18 +++++
> arch/x86/kvm/vmx/vmcs12.h | 36 +++++++++
> arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 +
> 5 files changed, 152 insertions(+), 15 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst
> index e64ef231f310..87fa9f3877ab 100644
> --- a/Documentation/virt/kvm/x86/nested-vmx.rst
> +++ b/Documentation/virt/kvm/x86/nested-vmx.rst
> @@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed.
> u16 host_gs_selector;
> u16 host_tr_selector;
> u64 secondary_vm_exit_controls;
> + u64 guest_ia32_fred_config;
> + u64 guest_ia32_fred_rsp1;
> + u64 guest_ia32_fred_rsp2;
> + u64 guest_ia32_fred_rsp3;
> + u64 guest_ia32_fred_stklvls;
> + u64 guest_ia32_fred_ssp1;
> + u64 guest_ia32_fred_ssp2;
> + u64 guest_ia32_fred_ssp3;
> + u64 host_ia32_fred_config;
> + u64 host_ia32_fred_rsp1;
> + u64 host_ia32_fred_rsp2;
> + u64 host_ia32_fred_rsp3;
> + u64 host_ia32_fred_stklvls;
> + u64 host_ia32_fred_ssp1;
> + u64 host_ia32_fred_ssp2;
> + u64 host_ia32_fred_ssp3;
> + u64 injected_event_data;
> + u64 original_event_data;
> };
>
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 94da6a0a2f81..f9c1fbeac302 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -686,6 +686,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>
> nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_IA32_FRED_RSP0, MSR_TYPE_RW);
> #endif
> nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
> @@ -2498,6 +2501,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
> vmcs12->vm_entry_instruction_len);
> vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> vmcs12->guest_interruptibility_info);
> + if (kvm_cpu_cap_has(X86_FEATURE_FRED))

This is wrong, vmcs02 should be set from vmcs12 if and only if the field is enabled
in L1's VMX configuration, i.e. iff nested_cpu_has(vmcs12, ???).

Note, the ??? should be tied to whatever VMX MSR feature flag enumerates
INJECTED_EVENT_DATA. KVM's clearing of X86_FEATURE_FRED when one or more pieces
is missing is a software decision, i.e. not archictectural.

> + vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data);
> vmx->loaded_vmcs->nmi_known_unmasked =
> !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
> } else {
> @@ -2548,6 +2553,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
> vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
>
> + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {

Same thing here.

> + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config);
> + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1);
> + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2);
> + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3);
> + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls);
> + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1);
> + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2);
> + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3);
> + }
> +
> vmx->segment_cache.bitmask = 0;
> }
>
> @@ -3835,6 +3851,22 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> vcpu->arch.cr4_guest_owned_bits));
> }
>
> +static inline unsigned long
> +nested_vmx_get_event_data(struct kvm_vcpu *vcpu, bool for_ex_vmexit)

Heh, two form letters for the price of one:

#1
Do not use "inline" for functions that are visible only to the local compilation
unit. "inline" is just a hint, and modern compilers are smart enough to inline
functions when appropriate without a hint.

A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@xxxxxxxxxx

#2
Do not wrap before the function name. Linus has a nice explanation/rant on this[*].

[*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@xxxxxxxxxxxxxx

> +{
> + struct kvm_queued_exception *ex = for_ex_vmexit ?
> + &vcpu->arch.exception_vmexit : &vcpu->arch.exception;
> +
> + if (ex->has_payload)
> + return ex->payload;
> + else if (ex->vector == PF_VECTOR)
> + return vcpu->arch.cr2;
> + else if (ex->vector == DB_VECTOR)
> + return (vcpu->arch.dr6 & ~DR6_BT) ^ DR6_ACTIVE_LOW;
> + else
> + return 0;

I'll circle back to this on the next version, i.e. after it's reworked to account
for the suggested payload changes. I highly doubt it's correct as-is.

> static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12,
> u32 vm_exit_reason, u32 exit_intr_info)
> @@ -3842,6 +3874,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
> u32 idt_vectoring;
> unsigned int nr;
>
> + vmcs12->original_event_data = 0;
> +
> /*
> * Per the SDM, VM-Exits due to double and triple faults are never
> * considered to occur during event delivery, even if the double/triple
> @@ -3880,6 +3914,12 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
> vcpu->arch.exception.error_code;
> }
>
> + idt_vectoring |= vcpu->arch.exception.nested ?
> + INTR_INFO_NESTED_EXCEPTION_MASK : 0;

Please stop using ternary operators this way. It's less readable and the same
number of lines as:

if (vcpu->arch.exception.nested)
idt_vectoring |= INTR_INFO_NESTED_EXCEPTION_MASK;

> +
> + vmcs12->original_event_data =
> + nested_vmx_get_event_data(vcpu, false);
> +
> vmcs12->idt_vectoring_info_field = idt_vectoring;
> } else if (vcpu->arch.nmi_injected) {
> vmcs12->idt_vectoring_info_field =
> @@ -3970,19 +4010,7 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
> struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
> u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
> struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> - unsigned long exit_qual;
> -
> - if (ex->has_payload) {
> - exit_qual = ex->payload;
> - } else if (ex->vector == PF_VECTOR) {
> - exit_qual = vcpu->arch.cr2;
> - } else if (ex->vector == DB_VECTOR) {
> - exit_qual = vcpu->arch.dr6;
> - exit_qual &= ~DR6_BT;
> - exit_qual ^= DR6_ACTIVE_LOW;
> - } else {
> - exit_qual = 0;
> - }
> + unsigned long exit_qual = nested_vmx_get_event_data(vcpu, true);

This can't possibly be correct, EXIT_QUAL and EVENT_DATA aren't equivalent, e.g.
the former doesn't have XFD_ERR, but the latter does.

> /*
> * Unlike AMD's Paged Real Mode, which reports an error code on #PF
> @@ -4003,10 +4031,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
> intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> }
>
> - if (kvm_exception_is_soft(ex->vector))
> + if (kvm_exception_is_soft(ex->vector)) {
> intr_info |= INTR_TYPE_SOFT_EXCEPTION;
> - else
> + } else {
> intr_info |= INTR_TYPE_HARD_EXCEPTION;
> + intr_info |= ex->nested ? INTR_INFO_NESTED_EXCEPTION_MASK : 0;

Again,

if (ex->nested)
intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK;

> + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {

And here

> + vmcs12->guest_ia32_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG);
> + vmcs12->guest_ia32_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1);
> + vmcs12->guest_ia32_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2);
> + vmcs12->guest_ia32_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3);
> + vmcs12->guest_ia32_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS);
> + vmcs12->guest_ia32_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1);
> + vmcs12->guest_ia32_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2);
> + vmcs12->guest_ia32_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3);
> + }
> +
> vmcs12->guest_pending_dbg_exceptions =
> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>
> @@ -4625,6 +4675,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
> vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
>
> + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {

And here