Re: [PATCH v6 8/8] KVM: VMX: Preserve host's DEBUGCTLMSR_FREEZE_IN_SMM while running the guest

From: mlevitsk
Date: Tue Jun 24 2025 - 16:00:08 EST


On Tue, 2025-06-10 at 16:20 -0700, Sean Christopherson wrote:
> From: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
>
> Set/clear DEBUGCTLMSR_FREEZE_IN_SMM in GUEST_IA32_DEBUGCTL based on the
> host's pre-VM-Enter value, i.e. preserve the host's FREEZE_IN_SMM setting
> while running the guest.  When running with the "default treatment of SMIs"
> in effect (the only mode KVM supports), SMIs do not generate a VM-Exit that
> is visible to host (non-SMM) software, and instead transitions directly
> from VMX non-root to SMM.  And critically, DEBUGCTL isn't context switched
> by hardware on SMI or RSM, i.e. SMM will run with whatever value was
> resident in hardware at the time of the SMI.
>
> Failure to preserve FREEZE_IN_SMM results in the PMU unexpectedly counting
> events while the CPU is executing in SMM, which can pollute profiling and
> potentially leak information into the guest.
>
> Check for changes in FREEZE_IN_SMM prior to every entry into KVM's inner
> run loop, as the bit can be toggled in IRQ context via IPI callback (SMP
> function call), by way of /sys/devices/cpu/freeze_on_smi.
>
> Add a field in kvm_x86_ops to communicate which DEBUGCTL bits need to be
> preserved, as FREEZE_IN_SMM is only supported and defined for Intel CPUs,
> i.e. explicitly checking FREEZE_IN_SMM in common x86 is at best weird, and
> at worst could lead to undesirable behavior in the future if AMD CPUs ever
> happened to pick up a collision with the bit.
>
> Exempt TDX vCPUs, i.e. protected guests, from the check, as the TDX Module
> owns and controls GUEST_IA32_DEBUGCTL.
>
> WARN in SVM if KVM_RUN_LOAD_DEBUGCTL is set, mostly to document that the
> lack of handling isn't a KVM bug (TDX already WARNs on any run_flag).
>
> Lastly, explicitly reload GUEST_IA32_DEBUGCTL on a VM-Fail that is missed
> by KVM but detected by hardware, i.e. in nested_vmx_restore_host_state().
> Doing so avoids the need to track host_debugctl on a per-VMCS basis, as
> GUEST_IA32_DEBUGCTL is unconditionally written by prepare_vmcs02() and
> load_vmcs12_host_state().  For the VM-Fail case, even though KVM won't
> have actually entered the guest, vcpu_enter_guest() will have run with
> vmcs02 active and thus could result in vmcs01 being run with a stale value.
>
> Cc: stable@xxxxxxxxxxxxxxx
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> Co-developed-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 +++++++
>  arch/x86/kvm/vmx/main.c         |  2 ++
>  arch/x86/kvm/vmx/nested.c       |  3 +++
>  arch/x86/kvm/vmx/vmx.c          |  3 +++
>  arch/x86/kvm/vmx/vmx.h          | 15 ++++++++++++++-
>  arch/x86/kvm/x86.c              | 14 ++++++++++++--
>  6 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6325369a4b..e59527dd5a0b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1676,6 +1676,7 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
>  enum kvm_x86_run_flags {
>   KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
>   KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
> + KVM_RUN_LOAD_DEBUGCTL = BIT(2),
>  };
>  
>  struct kvm_x86_ops {
> @@ -1706,6 +1707,12 @@ struct kvm_x86_ops {
>   void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
>   void (*vcpu_put)(struct kvm_vcpu *vcpu);
>  
> + /*
> + * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
> + * match the host's value even while the guest is active.
> + */
> + const u64 HOST_OWNED_DEBUGCTL;
> +
>   void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
>   int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
>   int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index c85cbce6d2f6..4a6d4460f947 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -915,6 +915,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   .vcpu_load = vt_op(vcpu_load),
>   .vcpu_put = vt_op(vcpu_put),
>  
> + .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
> +
>   .update_exception_bitmap = vt_op(update_exception_bitmap),
>   .get_feature_msr = vmx_get_feature_msr,
>   .get_msr = vt_op(get_msr),
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 9edce9f411a3..756c42e2d038 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -4860,6 +4860,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
>   WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
>   }
>  
> + /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
> + vmx_reload_guest_debugctl(vcpu);
> +
>   /*
>   * Note that calling vmx_set_{efer,cr0,cr4} is important as they
>   * handle a variety of side effects to KVM's software model.
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 196f33d934d3..70a115d99530 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7371,6 +7371,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
>   if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
>   set_debugreg(vcpu->arch.dr6, 6);
>  
> + if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
> + vmx_reload_guest_debugctl(vcpu);
> +
>   /*
>   * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
>   * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index c20a4185d10a..076af78af151 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -419,12 +419,25 @@ bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
>  
>  static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
>  {
> + WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
> +
> + val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
>   vmcs_write64(GUEST_IA32_DEBUGCTL, val);
>  }
>  
>  static inline u64 vmx_guest_debugctl_read(void)
>  {
> - return vmcs_read64(GUEST_IA32_DEBUGCTL);
> + return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
> +}
> +
> +static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
> +{
> + u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +
> + if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
> + return;
> +
> + vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
>  }


Wouldn't it be better to use kvm_x86_ops.HOST_OWNED_DEBUGCTL here as well
to avoid logic duplication?

Besides this, everything else looks fine to me.


Best regards,
Maxim Levitsky

>  
>  /*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6742eb556d91..811f4db824ab 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10779,7 +10779,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   dm_request_for_irq_injection(vcpu) &&
>   kvm_cpu_accept_dm_intr(vcpu);
>   fastpath_t exit_fastpath;
> - u64 run_flags;
> + u64 run_flags, debug_ctl;
>  
>   bool req_immediate_exit = false;
>  
> @@ -11051,7 +11051,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   set_debugreg(0, 7);
>   }
>  
> - vcpu->arch.host_debugctl = get_debugctlmsr();
> + /*
> + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
> + * can be modified in IRQ context, e.g. via SMP function calls.  Inform
> + * vendor code if any host-owned bits were changed, e.g. so that the
> + * value loaded into hardware while running the guest can be updated.
> + */
> + debug_ctl = get_debugctlmsr();
> + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
> +     !vcpu->arch.guest_state_protected)
> + run_flags |= KVM_RUN_LOAD_DEBUGCTL;
> + vcpu->arch.host_debugctl = debug_ctl;
>  
>   guest_timing_enter_irqoff();
>