Re: [PATCH v5 06/26] KVM: nSVM: Triple fault if mapping VMCB12 fails on nested #VMEXIT

From: Sean Christopherson

Date: Mon Feb 23 2026 - 19:36:17 EST


On Fri, Feb 06, 2026, Yosry Ahmed wrote:
> KVM currently injects a #GP and hopes for the best if mapping VMCB12
> fails on nested #VMEXIT, but only if the failure mode is -EINVAL.
> Mapping the VMCB12 could also fail if creating host mappings fails.
>
> After the #GP is injected, nested_svm_vmexit() bails early, without
> cleaning up (e.g. KVM_REQ_GET_NESTED_STATE_PAGES is set, is_guest_mode()
> is true, etc.). Move mapping VMCB12 a bit later, after leaving guest mode
> and clearing KVM_REQ_GET_NESTED_STATE_PAGES, right before the VMCB12 is
> actually used.
>
> Instead of optionally injecting a #GP, triple fault the guest if mapping
> VMCB12 fails since KVM cannot make a sane recovery. The APM states that
> a #VMEXIT will triple fault if host state is illegal or an exception
> occurs while loading host state, so the behavior is not entirely made
> up.
>
> Also update the WARN_ON() in svm_get_nested_state_pages() to
> WARN_ON_ONCE() to avoid future user-triggerable bugs spamming kernel logs
> and potentially causing issues.
>
> Fixes: cf74a78b229d ("KVM: SVM: Add VMEXIT handler and intercepts")
> CC: stable@xxxxxxxxxxxxxxx
> Co-developed-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Yosry Ahmed <yosry.ahmed@xxxxxxxxx>
> ---
> arch/x86/kvm/svm/nested.c | 25 +++++++++++--------------
> 1 file changed, 11 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index fab0d3d5baa2..830341b0e1f8 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -1121,24 +1121,14 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
> int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> struct kvm_vcpu *vcpu = &svm->vcpu;
> + gpa_t vmcb12_gpa = svm->nested.vmcb12_gpa;
> struct vmcb *vmcb01 = svm->vmcb01.ptr;
> struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
> struct vmcb *vmcb12;
> struct kvm_host_map map;
> - int rc;
> -
> - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
> - if (rc) {
> - if (rc == -EINVAL)
> - kvm_inject_gp(vcpu, 0);
> - return 1;
> - }
> -
> - vmcb12 = map.hva;
>
> /* Exit Guest-Mode */
> leave_guest_mode(vcpu);
> - svm->nested.vmcb12_gpa = 0;
> WARN_ON_ONCE(svm->nested.nested_run_pending);
>
> kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> @@ -1146,8 +1136,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> /* in case we halted in L2 */
> kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
>
> + svm->nested.vmcb12_gpa = 0;
> +
> + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) {
> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> + return 1;

Returning early isn't entirely correct. In fact, I think it's worse than the
current behavior in many respects.

By doing leave_guest_mode() without switching back to vmcb01 and without
putting vcpu->arch.mmu back to root_mmu, the vCPU will be in L1, but with
vmcb02 and L2's MMU still active.

The best idea I can come up with is to isolate the vmcb12 writes (which is
surprisingly straightforward) and then simply skip the vmcb12 updates when
the map fails. E.g.

---
arch/x86/kvm/svm/nested.c | 95 ++++++++++++++++++++++-----------------
1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fab0d3d5baa2..e8c163d95364 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -639,6 +639,12 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
}

+static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu)
+{
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (to_svm(vcpu)->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+}
+
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
bool new_vmcb12 = false;
@@ -703,8 +709,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DR);
}

- if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (nested_vmcb12_has_lbrv(vcpu)) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
* svm_set_msr's definition of reserved bits.
@@ -1118,35 +1123,14 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

-int nested_svm_vmexit(struct vcpu_svm *svm)
+static void nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu,
+ struct vmcb *vmcb12,
+ struct vmcb *vmcb02)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- struct vmcb *vmcb01 = svm->vmcb01.ptr;
- struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
- struct vmcb *vmcb12;
- struct kvm_host_map map;
- int rc;
+ struct vcpu_svm *svm = to_svm(vcpu);

- rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
- if (rc) {
- if (rc == -EINVAL)
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- vmcb12 = map.hva;
-
- /* Exit Guest-Mode */
- leave_guest_mode(vcpu);
- svm->nested.vmcb12_gpa = 0;
- WARN_ON_ONCE(svm->nested.nested_run_pending);
-
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
-
- /* in case we halted in L2 */
- kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
-
- /* Give the current vmcb to the guest */
+ if (!vmcb12)
+ return;

vmcb12->save.es = vmcb02->save.es;
vmcb12->save.cs = vmcb02->save.cs;
@@ -1184,14 +1168,53 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb12->control.next_rip = vmcb02->control.next_rip;

+ if (nested_vmcb12_has_lbrv(vcpu))
+ svm_copy_lbrs(&vmcb12->save, &vmcb02->save);
+
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;

+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int rc;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map)) {
+ vmcb12 = map.hva;
+ } else {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ vmcb12 = NULL;
+ }
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+ WARN_ON_ONCE(svm->nested.nested_run_pending);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ /* Give the current vmcb to the guest */
+ nested_svm_vmexit_update_vmcb12(vcpu, vmcb12, vmcb02);
+
if (!kvm_pause_in_guest(vcpu->kvm)) {
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
-
}

/*
@@ -1232,10 +1255,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_exit_on_intr(svm))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);

- if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
- svm_copy_lbrs(&vmcb12->save, &vmcb02->save);
- } else {
+ if (!nested_vmcb12_has_lbrv(vcpu)) {
svm_copy_lbrs(&vmcb01->save, &vmcb02->save);
vmcb_mark_dirty(vmcb01, VMCB_LBR);
}
@@ -1291,13 +1311,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vcpu.arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(&svm->vcpu);

- trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
- vmcb12->control.exit_info_1,
- vmcb12->control.exit_info_2,
- vmcb12->control.exit_int_info,
- vmcb12->control.exit_int_info_err,
- KVM_ISA_SVM);
-
kvm_vcpu_unmap(vcpu, &map);

nested_svm_transition_tlb_flush(vcpu);

base-commit: 2125912d022f4740238a950469da505783945be6
--