Re: [PATCH v2 11/15] KVM: x86/tdp_mmu: Reflect tearing down mirror page tables
From: Paolo Bonzini
Date: Fri Jun 07 2024 - 07:38:33 EST
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 2 ++
> arch/x86/include/asm/kvm_host.h | 8 ++++++
> arch/x86/kvm/mmu/tdp_mmu.c | 45 ++++++++++++++++++++++++++++--
> 3 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 1877d6a77525..dae06afc6038 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -97,6 +97,8 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
> KVM_X86_OP(load_mmu_pgd)
> KVM_X86_OP_OPTIONAL(reflect_link_spt)
> KVM_X86_OP_OPTIONAL(reflect_set_spte)
> +KVM_X86_OP_OPTIONAL(reflect_free_spt)
> +KVM_X86_OP_OPTIONAL(reflect_remove_spte)
> KVM_X86_OP(has_wbinvd_exit)
> KVM_X86_OP(get_l2_tsc_offset)
> KVM_X86_OP(get_l2_tsc_multiplier)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 20bb10f22ca6..0df4a31a0df9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1755,6 +1755,14 @@ struct kvm_x86_ops {
> int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> kvm_pfn_t pfn);
>
> + /* Update mirrored page tables for page table about to be freed */
> + int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + void *mirrored_spt);
> +
> + /* Update mirrored page table from spte getting removed, and flush TLB */
> + int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> + kvm_pfn_t pfn);
Again, maybe free_external_spt and zap_external_spte?
Also, please rename the last argument to pfn_for_gfn. I'm not proud of
it, but it took me over 10 minutes to understand if the pfn referred
to the gfn itself, or to the external SP that holds the spte...
There's a possibility that it isn't just me. :)
(In general, this patch took me a _lot_ to review... there were a
couple of places that left me incomprehensibly puzzled, more on this
below).
> bool (*has_wbinvd_exit)(void);
>
> u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 41b1d3f26597..1245f6a48dbe 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> }
>
> +static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
> + u64 old_spte, u64 new_spte,
> + int level)
new_spte is not used and can be dropped. Also, tdp_mmu_zap_external_spte?
> +{
> + bool was_present = is_shadow_present_pte(old_spte);
> + bool was_leaf = was_present && is_last_spte(old_spte, level);
Just put it below:
if (!is_shadow_present_pte(old_spte))
return;
/* Here we only care about zapping the external leaf PTEs. */
if (!is_last_spte(old_spte, level))
> + kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> + int ret;
> +
> + /*
> + * Allow only leaf page to be zapped. Reclaim non-leaf page tables page
This comment left me confused, so I'll try to rephrase and see if I
can explain what happens. Correct me if I'm wrong.
The only paths to handle_removed_pt() are:
- kvm_tdp_mmu_zap_leafs()
- kvm_tdp_mmu_zap_invalidated_roots()
but because kvm_mmu_zap_all_fast() does not operate on mirror roots,
the latter can only happen at VM destruction time.
But it's not clear why it's worth mentioning it here, or even why it
is special at all. Isn't that just what handle_removed_pt() does at
the end? Why does it matter that it's only done at VM destruction
time?
In other words, it seems to me that this comment is TMI. And if I am
wrong (which may well be), the extra information should explain the
"why" in more detail, and it should be around the call to
reflect_free_spt, not here.
> + return;
> + /* Zapping leaf spte is allowed only when write lock is held. */
> + lockdep_assert_held_write(&kvm->mmu_lock);
> + /* Because write lock is held, operation should success. */
> + ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
> + KVM_BUG_ON(ret, kvm);
> +}
> +
> /**
> * handle_removed_pt() - handle a page table removed from the TDP structure
> *
> @@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
> }
> handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
> old_spte, REMOVED_SPTE, sp->role, shared);
> + if (is_mirror_sp(sp)) {
> + KVM_BUG_ON(shared, kvm);
> + reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> + }
> + }
> +
> + if (is_mirror_sp(sp) &&
> + WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
> + kvm_mmu_mirrored_spt(sp)))) {
Please use base_gfn and level here, instead of fishing them from sp.
> + /*
> + * Failed to free page table page in mirror page table and
> + * there is nothing to do further.
> + * Intentionally leak the page to prevent the kernel from
> + * accessing the encrypted page.
> + */
> + sp->mirrored_spt = NULL;
> }
>
> call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> @@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
> role.level = level;
> handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
>
> - /* Don't support setting for the non-atomic case */
> - if (is_mirror_sptep(sptep))
> + if (is_mirror_sptep(sptep)) {
> + /* Only support zapping for the non-atomic case */
Like for patch 10, this comment should point out why we never get here
for mirror SPs.
Paolo
> KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
> + reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
> + }
>
> return old_spte;
> }
> --
> 2.34.1
>