Re: [PATCH v19 062/130] KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU
From: Edgecombe, Rick P
Date: Sat Mar 23 2024 - 19:39:25 EST
On Mon, 2024-02-26 at 00:26 -0800, isaku.yamahata@xxxxxxxxx wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index efd3fda1c177..bc0767c884f7 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -468,6 +468,7 @@ struct kvm_mmu {
> int (*sync_spte)(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp, int i);
> struct kvm_mmu_root_info root;
> + hpa_t private_root_hpa;
Per the conversation about consistent naming between private, shared and mirror: I wonder if these
should be named with mirror instead of private. Like:
hpa_t mirror_root_hpa;
Since the actual private root is not tracked by KVM.
> union kvm_cpu_role cpu_role;
> union kvm_mmu_page_role root_role;
>
> @@ -1740,6 +1741,16 @@ struct kvm_x86_ops {
> void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
> int root_level);
>
> + int (*link_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + void *private_spt);
> + int (*free_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + void *private_spt);
> + int (*set_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + kvm_pfn_t pfn);
> + int (*remove_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + kvm_pfn_t pfn);
> + int (*zap_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level);
> +
> bool (*has_wbinvd_exit)(void);
>
> u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 30c86e858ae4..0e0321ad9ca2 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3717,7 +3717,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> goto out_unlock;
>
> if (tdp_mmu_enabled) {
> - root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> + if (kvm_gfn_shared_mask(vcpu->kvm) &&
> + !VALID_PAGE(mmu->private_root_hpa)) {
> + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
> + mmu->private_root_hpa = root;
> + }
> + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
> mmu->root.hpa = root;
This has changed now, due to rebase on,
https://lore.kernel.org/lkml/20240111020048.844847-1-seanjc@xxxxxxxxxx/
...to this:
- if (tdp_mmu_enabled)
- return kvm_tdp_mmu_alloc_root(vcpu);
+ if (tdp_mmu_enabled) {
+ if (kvm_gfn_shared_mask(vcpu->kvm) &&
+ !VALID_PAGE(mmu->private_root_hpa)) {
+ r = kvm_tdp_mmu_alloc_root(vcpu, true);
+ if (r)
+ return r;
+ }
+ return kvm_tdp_mmu_alloc_root(vcpu, false);
+ }
I don't see why the !VALID_PAGE(mmu->private_root_hpa) check is needed.
kvm_tdp_mmu_get_vcpu_root_hpa() already has logic to prevent allocating multiple roots with the same
role.
Also, kvm_tdp_mmu_alloc_root() never returns non-zero, even though mmu_alloc_direct_roots() does.
Today, with only one caller, it probably makes mmu_alloc_direct_roots() cleaner to just have it
return the always-zero value from kvm_tdp_mmu_alloc_root(). Now that there are two calls, I think we
should refactor kvm_tdp_mmu_alloc_root() to return void, and have mmu_alloc_direct_roots() return 0
manually in this case.
Or maybe instead change it back to returning an hpa_t and then kvm_tdp_mmu_alloc_root() can lose the
"if (private)" logic at the end too.
> } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
> @@ -4627,7 +4632,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
> for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
> int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
> - gfn_t base = gfn_round_for_level(fault->gfn,
> + gfn_t base = gfn_round_for_level(gpa_to_gfn(fault->addr),
> fault->max_level);
>
> if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
> @@ -4662,6 +4667,7 @@ int kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> };
>
> WARN_ON_ONCE(!vcpu->arch.mmu->root_role.direct);
> + fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_shared_mask(vcpu->kvm);
> fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
>
> r = mmu_topup_memory_caches(vcpu, false);
> @@ -6166,6 +6172,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>
> mmu->root.hpa = INVALID_PAGE;
> mmu->root.pgd = 0;
> + mmu->private_root_hpa = INVALID_PAGE;
> for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
> mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
>
> @@ -7211,6 +7218,12 @@ int kvm_mmu_vendor_module_init(void)
> void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
> {
> kvm_mmu_unload(vcpu);
> + if (tdp_mmu_enabled) {
> + write_lock(&vcpu->kvm->mmu_lock);
> + mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->private_root_hpa,
> + NULL);
> + write_unlock(&vcpu->kvm->mmu_lock);
What is the reason for the special treatment of private_root_hpa here? The rest of the roots are
freed in kvm_mmu_unload(). I think it is because we don't want the mirror to get freed during
kvm_mmu_reset_context()?
Oof. For the sake of trying to justify the code, I'm trying to keep track of the pros and cons of
treating the mirror/private root like a normal one with just a different role bit.
The whole “list of roots” thing seems to date from shadow paging, where it is critical to
keep multiple cached shadow roots of different CPU modes of the same shadowed page tables. Today
with non-nested TDP, AFAICT, the only different root is for SMM. I guess since the machinery for
managing multiple roots in a list already exists it makes sense to use it for both.
For TDX there are also only two, but the difference is, things need to be done in special ways for
the two roots. You end up with a bunch of loops (for_each_*tdp_mmu_root(), etc) that essentially
process a list of two different roots, but with inner logic tortured to work for the peculiarities
of both private and shared. An easier to read alternative could be to open code both cases.
I guess the major benefit is to keep one set of logic for shadow paging, normal TDP and TDX, but it
makes the logic a bit difficult to follow for TDX compared to looking at it from the normal guest
perspective. So I wonder if making special versions of the TDX root traversing operations might make
the code a little easier to follow. I’m not advocating for it at this point, just still working on
an opinion. Is there any history around this design point?
> + }
> free_mmu_pages(&vcpu->arch.root_mmu);
> free_mmu_pages(&vcpu->arch.guest_mmu);
> mmu_free_memory_caches(vcpu);
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 002f3f80bf3b..9e2c7c6d85bf 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -6,6 +6,8 @@
> #include <linux/kvm_host.h>
> #include <asm/kvm_host.h>
>
> +#include "mmu.h"
> +
> #ifdef CONFIG_KVM_PROVE_MMU
> #define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
> #else
> @@ -205,6 +207,15 @@ static inline void kvm_mmu_free_private_spt(struct kvm_mmu_page *sp)
> free_page((unsigned long)sp->private_spt);
> }
>
> +static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
> + gfn_t gfn)
> +{
> + if (is_private_sp(root))
> + return kvm_gfn_to_private(kvm, gfn);
> + else
> + return kvm_gfn_to_shared(kvm, gfn);
> +}
> +
It could be branchless, but might not be worth the readability:
static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn)
{
gfn_t gfn_for_root = kvm_gfn_to_private(kvm, gfn);
gfn_for_root |= -(gfn_t)!is_private_sp(root) & kvm_gfn_shared_mask(kvm);
return gfn_for_root;
}