Re: [PATCH] KVM: nSVM: Use vcpu->arch.cr2 when updating vmcb12 on nested #VMEXIT

From: Yosry Ahmed

Date: Wed Feb 11 2026 - 15:42:15 EST



> So, with all of that in mind, I believe the best we can do is fully defer delivery
> of the exception until it's actually injected, and then apply the quirk to the
> relevant GET APIs.
>
> ---
> arch/x86/kvm/x86.c | 62 +++++++++++++++++++++++++++++-----------------
> 1 file changed, 39 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b0112c515584..e000521dfc8b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -864,9 +864,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
> vcpu->arch.exception.error_code = error_code;
> vcpu->arch.exception.has_payload = has_payload;
> vcpu->arch.exception.payload = payload;
> - if (!is_guest_mode(vcpu))
> - kvm_deliver_exception_payload(vcpu,
> - &vcpu->arch.exception);
> return;
> }
>
> @@ -5532,18 +5529,8 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> -static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> - struct kvm_vcpu_events *events)
> +static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *vcpu)
> {
> - struct kvm_queued_exception *ex;
> -
> - process_nmi(vcpu);
> -
> -#ifdef CONFIG_KVM_SMM
> - if (kvm_check_request(KVM_REQ_SMI, vcpu))
> - process_smi(vcpu);
> -#endif
> -
> /*
> * KVM's ABI only allows for one exception to be migrated. Luckily,
> * the only time there can be two queued exceptions is if there's a
> @@ -5554,21 +5541,46 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> if (vcpu->arch.exception_vmexit.pending &&
> !vcpu->arch.exception.pending &&
> !vcpu->arch.exception.injected)
> - ex = &vcpu->arch.exception_vmexit;
> - else
> - ex = &vcpu->arch.exception;
> + return &vcpu->arch.exception_vmexit;
> +
> + return &vcpu->arch.exception;
> +}
> +
> +static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu);
>
> /*
> - * In guest mode, payload delivery should be deferred if the exception
> - * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
> - * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
> - * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
> - * propagate the payload and so it cannot be safely deferred. Deliver
> - * the payload if the capability hasn't been requested.
> + * If KVM_CAP_EXCEPTION_PAYLOAD is disabled, then (prematurely) deliver
> + * the pending exception payload when userspace saves *any* vCPU state
> + * that interacts with exception payloads to avoid breaking userspace.
> + *
> + * Architecturally, KVM must not deliver an exception payload until the
> + * exception is actually injected, e.g. to avoid losing pending #DB
> + * information (which VMX tracks in the VMCS), and to avoid clobbering
> + * state if the exception is never injected for whatever reason. But
> + * if KVM_CAP_EXCEPTION_PAYLOAD isn't enabled, then userspace may or
> + * may not propagate the payload across save+restore, and so KVM can't
> + * safely defer delivery of the payload.
> */
> if (!vcpu->kvm->arch.exception_payload_enabled &&
> ex->pending && ex->has_payload)
> kvm_deliver_exception_payload(vcpu, ex);
> +}
> +
> +static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> + struct kvm_vcpu_events *events)
> +{
> + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu);
> +
> + process_nmi(vcpu);
> +
> +#ifdef CONFIG_KVM_SMM
> + if (kvm_check_request(KVM_REQ_SMI, vcpu))
> + process_smi(vcpu);
> +#endif
> +
> + kvm_handle_exception_payload_quirk(vcpu);
>
> memset(events, 0, sizeof(*events));
>
> @@ -5747,6 +5759,8 @@ static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
> vcpu->arch.guest_state_protected)
> return -EINVAL;
>
> + kvm_handle_exception_payload_quirk(vcpu);
> +
> memset(dbgregs, 0, sizeof(*dbgregs));
>
> BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
> @@ -12123,6 +12137,8 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
> if (vcpu->arch.guest_state_protected)
> goto skip_protected_regs;
>
> + kvm_handle_exception_payload_quirk(vcpu);
> +

Hmm, looking at this again, I realized it also affects the code path from
store_regs(); I think we don't want to prematurely deliver exception
payloads in that path. So maybe it's best to move this to
kvm_arch_vcpu_ioctl_get_sregs() and kvm_arch_vcpu_ioctl()?

The other option is to plumb a boolean that is only set to true in the
ioctl code path.

> kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
> kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
> kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
>
> base-commit: 55671237401edd1ec59276b852b9361cc170915b
> --