Re: [PATCH RFC 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support.
From: Paolo Bonzini
Date: Thu Mar 09 2017 - 10:17:37 EST
On 29/12/2016 10:26, Liang Li wrote:
> The future Intel CPU will extend the max physical address to 52 bits.
> To support the new physical address width, EPT is extended to support
> 5 level page table.
> This patch add the 5 level EPT and extend shadow page to support
> 5 level paging guest. As the RFC version, this patch enables 5 level
> EPT once the hardware supports, and this is not a good choice because
> 5 level EPT requires more memory access comparing to use 4 level EPT.
> The right thing is to use 5 level EPT only when it's needed, will
> change in the future version.
>
> Signed-off-by: Liang Li <liang.z.li@xxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: "Radim Krčmář" <rkrcmar@xxxxxxxxxx>
> ---
> arch/x86/include/asm/kvm_host.h | 3 +-
> arch/x86/include/asm/vmx.h | 1 +
> arch/x86/kvm/cpuid.h | 8 ++
> arch/x86/kvm/mmu.c | 167 +++++++++++++++++++++++++++++++---------
> arch/x86/kvm/mmu_audit.c | 5 +-
> arch/x86/kvm/paging_tmpl.h | 19 ++++-
> arch/x86/kvm/vmx.c | 19 +++--
> arch/x86/kvm/x86.h | 10 +++
> 8 files changed, 184 insertions(+), 48 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a7066dc..e505dac 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
> #define KVM_NR_VAR_MTRR 8
>
> #define ASYNC_PF_PER_VCPU 64
> +#define PT64_ROOT_5LEVEL 5
>
> enum kvm_reg {
> VCPU_REGS_RAX = 0,
> @@ -310,7 +311,7 @@ struct kvm_pio_request {
> };
>
> struct rsvd_bits_validate {
> - u64 rsvd_bits_mask[2][4];
> + u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL];
> u64 bad_mt_xwr;
> };
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 2b5b2d4..bf2f178 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -442,6 +442,7 @@ enum vmcs_field {
>
> #define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
> #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
> +#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
> #define VMX_EPTP_UC_BIT (1ull << 8)
> #define VMX_EPTP_WB_BIT (1ull << 14)
> #define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index 35058c2..4bdf3dc 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
> return best && (best->ecx & bit(X86_FEATURE_PKU));
> }
>
> +static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_cpuid_entry2 *best;
> +
> + best = kvm_find_cpuid_entry(vcpu, 7, 0);
> + return best && (best->ecx & bit(X86_FEATURE_LA57));
> +}
> +
> static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
> {
> struct kvm_cpuid_entry2 *best;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4c40273..0a56f27 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
> }
>
> struct mmu_page_path {
> - struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL];
> - unsigned int idx[PT64_ROOT_4LEVEL];
> + struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL];
> + unsigned int idx[PT64_ROOT_5LEVEL];
> };
>
> #define for_each_sp(pvec, sp, parents, i) \
> @@ -2198,6 +2198,11 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
> !vcpu->arch.mmu.direct_map)
> --iterator->level;
>
> + if (iterator->level == PT64_ROOT_5LEVEL &&
> + vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL &&
> + !vcpu->arch.mmu.direct_map)
> + iterator->level -= 2;
This (and the "if" before it as well) might actually be dead code.
Please remove it in a separate patch.
> if (iterator->level == PT32E_ROOT_LEVEL) {
> iterator->shadow_addr
> = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
> @@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
> if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> return;
>
> - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
> - (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> - vcpu->arch.mmu.direct_map)) {
> + if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
> + (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.direct_map)) ||
> + (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL &&
> + (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL ||
> + vcpu->arch.mmu.direct_map))) {
Same here:
if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL)
should be enough. In general, checking >= PT64_ROOT_4LEVEL is better
IMHO than checking for == PT64_ROOT_4LEVEL || == PT64_ROOT_5LEVEL.
These "if"s basically need to single out PAE. A hypothetical 6-level
page table extension would in all likelihood behave just like 64-bit
LA48 and LA57 paging.
> hpa_t root = vcpu->arch.mmu.root_hpa;
>
> spin_lock(&vcpu->kvm->mmu_lock);
> @@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> struct kvm_mmu_page *sp;
> unsigned i;
>
> - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
> + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
Same here and everywhere else: please check for >= PT64_ROOT_4LEVEL.
> spin_lock(&vcpu->kvm->mmu_lock);
> make_mmu_pages_available(vcpu);
> - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
> + sp = kvm_mmu_get_page(vcpu, 0, 0,
> + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
> ++sp->root_count;
> spin_unlock(&vcpu->kvm->mmu_lock);
> vcpu->arch.mmu.root_hpa = __pa(sp->spt);
> @@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
> * Do we shadow a long mode page table? If so we need to
> * write-protect the guests page table root.
> */
> - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
> hpa_t root = vcpu->arch.mmu.root_hpa;
>
> MMU_WARN_ON(VALID_PAGE(root));
>
> spin_lock(&vcpu->kvm->mmu_lock);
> make_mmu_pages_available(vcpu);
> - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
> - 0, ACC_ALL);
> + sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
> + vcpu->arch.mmu.root_level, 0, ACC_ALL);
> root = __pa(sp->spt);
> ++sp->root_count;
> spin_unlock(&vcpu->kvm->mmu_lock);
> @@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
> * the shadow page table may be a PAE or a long mode page table.
> */
> pm_mask = PT_PRESENT_MASK;
> - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
> + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL)
> pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
>
> for (i = 0; i < 4; ++i) {
> @@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
> * If we shadow a 32 bit page table with a long mode page
> * table we enter this path.
> */
> - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
> + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
> if (vcpu->arch.mmu.lm_root == NULL) {
> /*
> * The additional page necessary for this is only
> @@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> return;
>
> vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
> hpa_t root = vcpu->arch.mmu.root_hpa;
> sp = page_header(root);
> mmu_sync_children(vcpu, sp);
> @@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
> walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
> {
> struct kvm_shadow_walk_iterator iterator;
> - u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull;
> + u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull;
> int root, leaf;
> bool reserved = false;
>
> @@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
> }
>
> #define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE_LA57 57
> +
> #define PTTYPE PTTYPE_EPT
> #include "paging_tmpl.h"
> #undef PTTYPE
>
> +#define PTTYPE PTTYPE_LA57
> +#include "paging_tmpl.h"
> +#undef PTTYPE
This is not needed. The format for LA57 page tables is the same as for
LA48.
> #define PTTYPE 64
> #include "paging_tmpl.h"
> #undef PTTYPE
> @@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
> rsvd_check->rsvd_bits_mask[1][0] =
> rsvd_check->rsvd_bits_mask[0][0];
> break;
> + case PT64_ROOT_5LEVEL:
> + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
> + nonleaf_bit8_rsvd | rsvd_bits(7, 7);
> + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
> + nonleaf_bit8_rsvd | rsvd_bits(7, 7);
I think the code for this and PT64_ROOT_4LEVEL should be the same
(setting rsvd_bits_mask[x][4] for PT64_ROOT_4LEVEL is okay).
You are assuming that MAXPHYADDR=52, but the Intel whitepaper doesn't
say this is always going to be the case. rsvd_bits in
arch/x86/kvm/mmu.h is not a hot path, so feel free to add an
	if (e < s)
		return 0;
there.
> + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
> + nonleaf_bit8_rsvd | gbpages_bit_rsvd;
> + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd;
> + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd;
> + rsvd_check->rsvd_bits_mask[1][4] =
> + rsvd_check->rsvd_bits_mask[0][4];
> + rsvd_check->rsvd_bits_mask[1][3] =
> + rsvd_check->rsvd_bits_mask[0][3];
> + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
> + gbpages_bit_rsvd | rsvd_bits(13, 29);
> + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
> + rsvd_bits(13, 20); /* large page */
> + rsvd_check->rsvd_bits_mask[1][0] =
> + rsvd_check->rsvd_bits_mask[0][0];
> + break;
> }
> }
>
> @@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
>
> static void
> __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
> - int maxphyaddr, bool execonly)
> + int maxphyaddr, bool execonly, int ept_level)
> {
> u64 bad_mt_xwr;
>
> - rsvd_check->rsvd_bits_mask[0][3] =
> - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
> - rsvd_check->rsvd_bits_mask[0][2] =
> - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> - rsvd_check->rsvd_bits_mask[0][1] =
> - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
> -
> - /* large page */
> - rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
> - rsvd_check->rsvd_bits_mask[1][2] =
> - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
> - rsvd_check->rsvd_bits_mask[1][1] =
> - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
> - rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
> + if (ept_level == 5) {
> + rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7);
Same here, this "if" is not needed at all and the new ept_level argument
shouldn't be required either.
> + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7);
> + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6);
> + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6);
> + rsvd_check->rsvd_bits_mask[0][0] = 0;
> +
> + /* large page */
> + rsvd_check->rsvd_bits_mask[1][4] =
> + rsvd_check->rsvd_bits_mask[0][4];
> + rsvd_check->rsvd_bits_mask[1][3] =
> + rsvd_check->rsvd_bits_mask[0][3];
> + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29);
> + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20);
> + rsvd_check->rsvd_bits_mask[1][0] = 0;
> + } else {
> + rsvd_check->rsvd_bits_mask[0][3] =
> + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
> + rsvd_check->rsvd_bits_mask[0][2] =
> + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> + rsvd_check->rsvd_bits_mask[0][1] =
> + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
> + /* large page */
> + rsvd_check->rsvd_bits_mask[1][3] =
> + rsvd_check->rsvd_bits_mask[0][3];
> + rsvd_check->rsvd_bits_mask[1][2] =
> + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
> + rsvd_check->rsvd_bits_mask[1][1] =
> + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
> + rsvd_check->rsvd_bits_mask[1][0] =
> + rsvd_check->rsvd_bits_mask[0][0];
> + }
>
> bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
> bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
> @@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
> }
>
> static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
> - struct kvm_mmu *context, bool execonly)
> + struct kvm_mmu *context, bool execonly, int ept_level)
> {
> __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
> - cpuid_maxphyaddr(vcpu), execonly);
> + cpuid_maxphyaddr(vcpu), execonly, ept_level);
> }
>
> /*
> @@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void)
> true, true);
> else
> __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
> - boot_cpu_data.x86_phys_bits,
> - false);
> + boot_cpu_data.x86_phys_bits, false,
> + context->shadow_root_level);
>
> }
>
> @@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void)
> struct kvm_mmu *context, bool execonly)
> {
> __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
> - boot_cpu_data.x86_phys_bits, execonly);
> + boot_cpu_data.x86_phys_bits, execonly,
> + context->shadow_root_level);
> }
>
> static void update_permission_bitmask(struct kvm_vcpu *vcpu,
> @@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu,
> paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
> }
>
> +static void paging_la57_init_context(struct kvm_vcpu *vcpu,
> + struct kvm_mmu *context)
> +{
> + context->nx = is_nx(vcpu);
> + context->root_level = PT64_ROOT_5LEVEL;
> +
> + reset_rsvds_bits_mask(vcpu, context);
> + update_permission_bitmask(vcpu, context, false);
> + update_pkru_bitmask(vcpu, context, false);
> + update_last_nonleaf_level(vcpu, context);
> +
> + MMU_WARN_ON(!is_pae(vcpu));
> + context->page_fault = paging_la57_page_fault;
> + context->gva_to_gpa = paging_la57_gva_to_gpa;
> + context->sync_page = paging_la57_sync_page;
> + context->invlpg = paging_la57_invlpg;
> + context->update_pte = paging_la57_update_pte;
> + context->shadow_root_level = PT64_ROOT_5LEVEL;
> + context->root_hpa = INVALID_PAGE;
> + context->direct_map = false;
This should be using paging64_init_context_common.
Even better, paging64_init_context could do
int root_level =
is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
paging64_init_context_common(vcpu, context, root_level);
and then you can skip the change in kvm_init_shadow_mmu.
> +}
> +
> static void paging32_init_context(struct kvm_vcpu *vcpu,
> struct kvm_mmu *context)
> {
> @@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
> context->nx = false;
> context->gva_to_gpa = nonpaging_gva_to_gpa;
> context->root_level = 0;
> + } else if (is_la57_mode(vcpu)) {
> + context->nx = is_nx(vcpu);
> + context->root_level = PT64_ROOT_5LEVEL;
> + reset_rsvds_bits_mask(vcpu, context);
> + context->gva_to_gpa = paging_la57_gva_to_gpa;
Please put the
if (is_la57_mode(vcpu))
inside the is_long_mode branch below, since the only difference is
context->root_level.
> } else if (is_long_mode(vcpu)) {
> context->nx = is_nx(vcpu);
> context->root_level = PT64_ROOT_4LEVEL;
> @@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
>
> if (!is_paging(vcpu))
> nonpaging_init_context(vcpu, context);
> + else if (is_la57_mode(vcpu))
> + paging_la57_init_context(vcpu, context);
> else if (is_long_mode(vcpu))
> paging64_init_context(vcpu, context);
> else if (is_pae(vcpu))
> @@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
>
> update_permission_bitmask(vcpu, context, true);
> update_pkru_bitmask(vcpu, context, true);
> - reset_rsvds_bits_mask_ept(vcpu, context, execonly);
> + reset_rsvds_bits_mask_ept(vcpu, context, execonly,
> + context->shadow_root_level);
> reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
> }
> EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
> @@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
> g_context->nx = false;
> g_context->root_level = 0;
> g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
> + } else if (is_la57_mode(vcpu)) {
> + g_context->nx = is_nx(vcpu);
> + g_context->root_level = PT64_ROOT_5LEVEL;
> + reset_rsvds_bits_mask(vcpu, g_context);
> + g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested;
Same here: please put the is_la57_mode(vcpu) check inside the
is_long_mode branch below.
> } else if (is_long_mode(vcpu)) {
> g_context->nx = is_nx(vcpu);
> g_context->root_level = PT64_ROOT_4LEVEL;
> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
> index 2e6996d..bb40094 100644
> --- a/arch/x86/kvm/mmu_audit.c
> +++ b/arch/x86/kvm/mmu_audit.c
> @@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
> if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> return;
>
> - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
As above, please use >= PT64_ROOT_4LEVEL here.
> hpa_t root = vcpu->arch.mmu.root_hpa;
>
> sp = page_header(root);
> - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
> + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
> return;
> }
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index a011054..c126cd3 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
The changes to paging_tmpl.h are not needed, since the format for LA57
page tables is the same as for LA48.
> @@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
> #define CMPXCHG cmpxchg64
> #define PT_MAX_FULL_LEVELS 2
> #endif
> +#elif PTTYPE == PTTYPE_LA57
> + #define pt_element_t u64
> + #define guest_walker guest_walker_la57
> + #define FNAME(name) paging_la57_##name
> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> + #define PT_LEVEL_BITS PT64_LEVEL_BITS
> + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
> + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
> + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> + #define PT_MAX_FULL_LEVELS 5
> + #define CMPXCHG cmpxchg
> #elif PTTYPE == 32
> #define pt_element_t u32
> #define guest_walker guest_walker32
> @@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
> static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
> {
> unsigned pkeys = 0;
> -#if PTTYPE == 64
> +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
> pte_t pte = {.pte = gpte};
>
> pkeys = pte_flags_pkey(pte_flags(pte));
> @@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> walker->level = mmu->root_level;
> pte = mmu->get_cr3(vcpu);
>
> -#if PTTYPE == 64
> +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
> if (walker->level == PT32E_ROOT_LEVEL) {
> pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
> trace_kvm_mmu_paging_element(pte, walker->level);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 24db5fb..bfc9f0a 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
> return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
> }
>
> +static inline bool cpu_has_vmx_ept_5levels(void)
> +{
> + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
> +}
> +
> static inline bool cpu_has_vmx_ept_ad_bits(void)
> {
> return vmx_capability.ept & VMX_EPT_AD_BIT;
> @@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
> vmx->emulation_required = emulation_required(vcpu);
> }
>
> +static int get_ept_level(void)
> +{
> + if (cpu_has_vmx_ept_5levels())
> + return VMX_EPT_MAX_GAW + 1;
> + return VMX_EPT_DEFAULT_GAW + 1;
> +}
> +
> static u64 construct_eptp(unsigned long root_hpa)
> {
> u64 eptp;
>
> /* TODO write the value reading from MSR */
> eptp = VMX_EPT_DEFAULT_MT |
> - VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
> + (get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT;
> if (enable_ept_ad_bits)
> eptp |= VMX_EPT_AD_ENABLE_BIT;
> eptp |= (root_hpa & PAGE_MASK);
For nested virt you need to set the shift to what L1 uses, so I think
you need to add a root_level argument here and in kvm_init_shadow_ept_mmu.
Paolo
> @@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn)
> }
> }
>
> -static int get_ept_level(void)
> -{
> - return VMX_EPT_DEFAULT_GAW + 1;
> -}
> -
> static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
> {
> u8 cache;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index e8ff3e4..26627df 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
> return cs_l;
> }
>
> +static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
> +{
> +#ifdef CONFIG_X86_64
> + return (vcpu->arch.efer & EFER_LMA) &&
> + kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
> +#else
> + return 0;
> +#endif
> +}
> +
> static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
> {
> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
>