Re: [PATCH V2 3/4] KVM: VMX: Don't consult original exit qualification for nested EPT violation injection
From: Sean Christopherson
Date: Tue Feb 24 2026 - 12:33:12 EST
On Tue, Feb 24, 2026, Kevin Cheng wrote:
> ---
> arch/x86/kvm/mmu/paging_tmpl.h | 16 +++++++++++++++-
> arch/x86/kvm/vmx/nested.c | 3 ---
> 2 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index f148c92b606ba..a084b5e50effc 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -386,8 +386,19 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> nested_access, &walker->fault);
>
> if (unlikely(real_gpa == INVALID_GPA)) {
> + /*
> + * Unconditionally set the NPF error_code bits and
> + * EPT exit_qualification bits for nested page
> + * faults. The walker doesn't know whether L1 uses
> + * NPT or EPT, and each injection handler consumes
> + * only the field it cares about (error_code for
> + * NPF, exit_qualification for EPT violations), so
> + * setting both is harmless.
> + */
> #if PTTYPE != PTTYPE_EPT
> walker->fault.error_code |= PFERR_GUEST_PAGE_MASK;
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_GVA_IS_VALID;
This looks all kinds of wrong. Shouldn't it be?
#if PTTYPE == PTTYPE_EPT
walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID;
#else
walker->fault.error_code |= PFERR_GUEST_PAGE_MASK;
#endif
Ooooh, right, never mind, this is the case where KVM detects a fault on the
L2 GPA => L1 GPA translation when walking L2 GVA=>GPA.
This is very counter-intuitive, and unconditionally setting both is rather ugly.
To address both, what if we peek at the guest_mmu to determine whether the fault
is EPT vs. NPT? E.g.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3911ac9bddfd..db43560ba6f8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5263,6 +5263,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
+static bool kvm_nested_fault_is_ept(struct kvm_vcpu *vcpu,
+ struct x86_exception *exception);
+
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
@@ -5276,6 +5279,13 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
#include "paging_tmpl.h"
#undef PTTYPE
+static bool kvm_nested_fault_is_ept(struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
+{
+ WARN_ON_ONCE(!exception->nested_page_fault);
+ return vcpu->arch.guest_mmu.page_fault == ept_page_fault;
+}
+
static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
u64 pa_bits_rsvd, int level, bool nx,
bool gbpages, bool pse, bool amd)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index a084b5e50eff..0c9ce7a4815b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -387,19 +387,14 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
if (unlikely(real_gpa == INVALID_GPA)) {
/*
- * Unconditionally set the NPF error_code bits and
- * EPT exit_qualification bits for nested page
- * faults. The walker doesn't know whether L1 uses
- * NPT or EPT, and each injection handler consumes
- * only the field it cares about (error_code for
- * NPF, exit_qualification for EPT violations), so
- * setting both is harmless.
+ * Set EPT Violation flags even if the fault is an
+ * EPT Misconfig, fault.exit_qualification is ignored
+ * for EPT Misconfigs.
*/
-#if PTTYPE != PTTYPE_EPT
- walker->fault.error_code |= PFERR_GUEST_PAGE_MASK;
- walker->fault.exit_qualification |=
- EPT_VIOLATION_GVA_IS_VALID;
-#endif
+ if (kvm_nested_fault_is_ept(vcpu, &walker->fault))
+ walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID;
+ else
+ walker->fault.error_code |= PFERR_GUEST_PAGE_MASK;
return 0;
}
@@ -458,12 +453,11 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == INVALID_GPA) {
-#if PTTYPE != PTTYPE_EPT
- walker->fault.error_code |= PFERR_GUEST_FINAL_MASK;
- walker->fault.exit_qualification |=
- EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED;
-#endif
+ if (kvm_nested_fault_is_ept(vcpu, &walker->fault))
+ walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED;
+ else
+ walker->fault.error_code |= PFERR_GUEST_FINAL_MASK;
return 0;
}
> #endif
> return 0;
> }
> @@ -449,6 +460,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> if (real_gpa == INVALID_GPA) {
> #if PTTYPE != PTTYPE_EPT
> walker->fault.error_code |= PFERR_GUEST_FINAL_MASK;
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_GVA_IS_VALID |
> + EPT_VIOLATION_GVA_TRANSLATED;
> #endif
> return 0;
> }
> @@ -496,7 +510,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> * [2:0] - Derive from the access bits. The exit_qualification might be
> * out of date if it is serving an EPT misconfiguration.
> * [5:3] - Calculated by the page walk of the guest EPT page tables
> - * [7:8] - Derived from [7:8] of real exit_qualification
> + * [7:8] - Set at the kvm_translate_gpa() call sites above
> *
> * The other bits are set to 0.
> */
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 248635da67661..6a167b1d51595 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -444,9 +444,6 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
> exit_qualification = 0;
> } else {
> exit_qualification = fault->exit_qualification;
> - exit_qualification |= vmx_get_exit_qual(vcpu) &
> - (EPT_VIOLATION_GVA_IS_VALID |
> - EPT_VIOLATION_GVA_TRANSLATED);
Hmm, this isn't quite correct. If KVM injects an EPT Violation (or a #NPF) when
handling an EPT Violation (or #NPF) from L2, then KVM _should_ follow hardware.
Aha! I think the easiest way to deal with that is to flag nested page faults
that were the result of walking L1's TDP when handling an L2 TDP page fault, and
then let vendor code extract the fault information out of hardware.
Alternatively, we could plumb in VMX's EPT_VIOLATION_GVA_IS_VALID as a synthetic
error code, but I think that would be harder to follow overall, especially for VMX.
@@ -799,8 +793,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- if (!fault->prefetch)
+ if (!fault->prefetch) {
+ walker.fault.hardware_nested_page_fault = walker.fault.nested_page_fault;
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
+ }
return RET_PF_RETRY;
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6a167b1d5159..9e7d541d256b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -443,8 +443,13 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
exit_qualification = 0;
} else {
- exit_qualification = fault->exit_qualification;
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ exit_qualification = fault->exit_qualification;
+ if (fault->hardware_nested_page_fault)
+ exit_qualification |= vmx_get_exit_qual(vcpu) &
+ (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
}
/*
> vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
> }
>
> --
> 2.53.0.414.gf7e9f6c205-goog
>