Re: [PATCH 18/28] KVM: x86/mmu: add support for MBEC to EPT page table walks

From: mlevitsk

Date: Tue Jun 02 2026 - 10:39:56 EST


On Tue, 2026-05-05 at 21:52 +0200, Paolo Bonzini wrote:
> Extend the page walker to support moving bit 10 of the PTEs
> into ACC_USER_EXEC_MASK and bit 6 of the exit qualification of
> EPT violation VM exits.
>
> Note that while mmu_has_mbec()/cr4_smep affect the interpretation of
> ACC_USER_EXEC_MASK and add bit 10 as a "present bit" in guest EPT page
> table entries, they do not affect how KVM operates on SPTEs.  That's
> because the MMU uses explicit ACC_USER_EXEC_MASK/shadow_xu_mask even for
> the non-nested EPT; the only difference is that ACC_USER_EXEC_MASK and
> ACC_EXEC_MASK will always be set in tandem outside the nested scenario.
>
> Tested-by: David Riley <d.riley@xxxxxxxxxxx>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c         | 13 +++++++++++--
>  arch/x86/kvm/mmu/paging_tmpl.h | 27 +++++++++++++++++++++------
>  arch/x86/kvm/mmu/spte.h        |  2 ++
>  arch/x86/kvm/vmx/nested.c      |  9 +++++++++
>  4 files changed, 43 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 156050e22329..a5b68f18b220 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5570,7 +5570,6 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
>  {
>   unsigned index;
>  
> - const u16 x = ACC_BITS_MASK(ACC_EXEC_MASK);
>   const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK);
>   const u16 r = ACC_BITS_MASK(ACC_READ_MASK);
>  
> @@ -5611,8 +5610,18 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
>   u16 smapf = 0;
>  
>   if (ept) {
> - ff = (pfec & PFERR_FETCH_MASK) ? (u16)~x : 0;
> + const u16 xs = ACC_BITS_MASK(ACC_EXEC_MASK);
> + const u16 xu = ACC_BITS_MASK(ACC_USER_EXEC_MASK);
> +
> + if (pfec & PFERR_FETCH_MASK) {
> + /* Ignore XU unless MBEC is enabled.  */
> + if (cr4_smep)

Hi!

If we go with my suggestion to call GMET/MBE with vendor neutral name, we won't need to use cr4_smep here
which is confusing.
What do you think?


> + ff = pfec & PFERR_USER_MASK ? (u16)~xu : (u16)~xs;
> + else
> + ff = (u16)~xs;
> + }
>   } else {
> + const u16 x = ACC_BITS_MASK(ACC_EXEC_MASK);
>   const u16 u = ACC_BITS_MASK(ACC_USER_MASK);
>  
>   /* Faults from kernel mode accesses to user pages */
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index d4ce55195a7c..f741f7d4cc2d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
>   *access &= mask;
>  }
>  
> -static inline int FNAME(is_present_gpte)(unsigned long pte)
> +static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu,
> + unsigned long pte)
>  {
>  #if PTTYPE != PTTYPE_EPT
>   return pte & PT_PRESENT_MASK;
>  #else
> - return pte & 7;
> + /*
> + * For EPT, an entry is present if any of bits 2:0 are set.
> + * With mode-based execute control, bit 10 also indicates presence.
> + */
> + return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0));
>  #endif
>  }
>  
> @@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>     struct kvm_mmu_page *sp, u64 *spte,
>     u64 gpte)
>  {
> - if (!FNAME(is_present_gpte)(gpte))
> + if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte))
>   goto no_present;
>  
>   /* Prefetch only accessed entries (unless A/D bits are disabled). */
> @@ -173,10 +178,17 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  static inline unsigned FNAME(gpte_access)(u64 gpte)
>  {
>   unsigned access;
> + /*
> + * Set bits in ACC_*_MASK even if they might not be used in the
> + * actual checks.  For example, if EFER.NX is clear permission_fault()
> + * will ignore ACC_EXEC_MASK, and if MBEC is disabled it will
> + * ignore ACC_USER_EXEC_MASK.
> + */
>  #if PTTYPE == PTTYPE_EPT
>   access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
>   ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
> - ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0);
> + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0) |
> + ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0);
>  #else
>   /*
>   * P is set here, so the page is always readable and W/U/!NX represent
> @@ -331,7 +343,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>   if (walker->level == PT32E_ROOT_LEVEL) {
>   pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
>   trace_kvm_mmu_paging_element(pte, walker->level);
> - if (!FNAME(is_present_gpte)(pte))
> + if (!FNAME(is_present_gpte)(mmu, pte))
>   goto error;
>   --walker->level;
>   }
> @@ -414,7 +426,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>   */
>   pte_access = pt_access & (pte ^ walk_nx_mask);
>  
> - if (unlikely(!FNAME(is_present_gpte)(pte)))
> + if (unlikely(!FNAME(is_present_gpte)(mmu, pte)))
>   goto error;
>  
>   if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
> @@ -521,6 +533,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>   * ACC_*_MASK flags!
>   */
>   walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
> + if (mmu_has_mbec(mmu))
> + walker->fault.exit_qualification |=
> + EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
>   }
>  #endif
>   walker->fault.address = addr;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index f5261d993eac..fe9571837fee 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -395,6 +395,8 @@ static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
>  static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
>      u64 pte)
>  {
> + if (pte & VMX_EPT_USER_EXECUTABLE_MASK)
> + pte |= VMX_EPT_EXECUTABLE_MASK;
>   return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
>  }
>  
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 46b65475765d..84f5c25a1f12 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -7452,6 +7452,15 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
>   struct kvm_mmu *mmu = vcpu->arch.mmu;
>  
>   BUG_ON(!mmu_is_nested(vcpu));
> +
> + /*
> + * MBEC differentiates based on the effective U/S bit of
> + * the guest page tables; not the processor CPL.
> + */
> + access &= ~PFERR_USER_MASK;
> + if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
> + access |= PFERR_USER_MASK;

I think that this hunk should be in 'KVM: nVMX: pass advanced EPT violation vmexit info to guest'
or if you adopt my suggestion then in the combined patch.

My case is that this is the second half of extraction of 'PFERR_USER_MASK' for MBE, and it would be nice to have
both in a same patch to see that they share the same MBE imposed weirdness of utilizing the U bit from the guest x86 walk.

It might even be worth it to extract this hunk and the '__vmx_handle_ept_violation' hunk to a separate patch.
This can strengthen the explanation of why this had to be done this way.

> +
>   return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
>  }
>  

Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>

Best regards,
Maxim Leivtsky