[PATCH 50/54] KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic

From: Sean Christopherson
Date: Tue Jun 22 2021 - 14:04:50 EST

Next message: Sean Christopherson: "[PATCH 51/54] KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT"
Previous message: Sean Christopherson: "[PATCH 49/54] KVM: x86: Enhance comments for MMU roles and nested transition trickiness"
In reply to: Sean Christopherson: "[PATCH 49/54] KVM: x86: Enhance comments for MMU roles and nested transition trickiness"
Next in thread: Paolo Bonzini: "Re: [PATCH 50/54] KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Drop the pre-computed last_nonleaf_level, which is arguably wrong and at
best confusing. Per the comment:

Can have large pages at levels 2..last_nonleaf_level-1.

the intent of the variable would appear to be to track what levels can
_legally_ have large pages, but that intent doesn't align with reality.
The computed value will be wrong for 5-level paging, or if 1GB pages are
not supported.

The flawed code is not a problem in practice, because except for 32-bit
PSE paging, bit 7 is reserved if large pages aren't supported at the
level. Take advantage of this invariant and simply omit the level magic
math for 64-bit page tables (including PAE).

For 32-bit paging (non-PAE), the adjustments are needed purely because
bit 7 is ignored if PSE=0. Retain that logic as is, but make
is_last_gpte() unique per PTTYPE so that the PSE check is avoided for
PAE and EPT paging. In the spirit of avoiding branches, bump the "last
nonleaf level" for 32-bit PSE paging by adding the PSE bit itself.

Note, bit 7 is ignored or has other meaning in CR3/EPTP. Although
FNAME(walk_addr_generic) briefly holds CR3/EPTP in "pte", CR3/EPTP are
not PTEs and would blow up all the other gpte helpers.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 3 ---
arch/x86/kvm/mmu/mmu.c | 31 -------------------------------
arch/x86/kvm/mmu/paging_tmpl.h | 31 ++++++++++++++++++++++++++++++-
3 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2da8b5ddbd6a..c97b83cf8381 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -457,9 +457,6 @@ struct kvm_mmu {

struct rsvd_bits_validate guest_rsvd_check;

- /* Can have large pages at levels 2..last_nonleaf_level-1. */
- u8 last_nonleaf_level;
-
u64 pdptrs[4]; /* pae */
};

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 34e7a489e71b..7849f53fd874 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4071,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}

-static inline bool is_last_gpte(struct kvm_mmu *mmu,
- unsigned level, unsigned gpte)
-{
- /*
- * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
- * If it is clear, there are no large pages at this level, so clear
- * PT_PAGE_SIZE_MASK in gpte if that is the case.
- */
- gpte &= level - mmu->last_nonleaf_level;
-
- /*
- * PG_LEVEL_4K always terminates. The RHS has bit 7 set
- * iff level <= PG_LEVEL_4K, which for our purpose means
- * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PG_LEVEL_4K - 1;
-
- return gpte & PT_PAGE_SIZE_MASK;
-}
-
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
@@ -4491,15 +4471,6 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu)
}
}

-static void update_last_nonleaf_level(struct kvm_mmu *mmu)
-{
- unsigned root_level = mmu->root_level;
-
- mmu->last_nonleaf_level = root_level;
- if (root_level == PT32_ROOT_LEVEL && is_cr4_pse(mmu))
- mmu->last_nonleaf_level++;
-}
-
static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu)
{
@@ -4509,7 +4480,6 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, mmu);
update_permission_bitmask(mmu, false);
update_pkru_bitmask(mmu);
- update_last_nonleaf_level(mmu);
}

static void paging64_init_context(struct kvm_mmu *context)
@@ -4783,7 +4753,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;

update_permission_bitmask(context, true);
- update_last_nonleaf_level(context);
update_pkru_bitmask(context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index c92e712607b6..ec1de57f3572 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
return pkeys;
}

+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+ * all levels or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + !!mmu->mmu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
/*
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
@@ -421,7 +450,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,

/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
- } while (!is_last_gpte(mmu, walker->level, pte));
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));

pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
--
2.32.0.288.g62a8d224e6-goog

Next message: Sean Christopherson: "[PATCH 51/54] KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT"
Previous message: Sean Christopherson: "[PATCH 49/54] KVM: x86: Enhance comments for MMU roles and nested transition trickiness"
In reply to: Sean Christopherson: "[PATCH 49/54] KVM: x86: Enhance comments for MMU roles and nested transition trickiness"
Next in thread: Paolo Bonzini: "Re: [PATCH 50/54] KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]