[PATCH v2 11/15] KVM: x86/tdp_mmu: Reflect tearing down mirror page tables

From: Rick Edgecombe
Date: Thu May 30 2024 - 17:10:46 EST


From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Integrate hooks for mirroring page table operations for cases where TDX
will zap PTEs or free page tables.

Like other Coco technologies, TDX has the concept of private and shared
memory. For TDX the private and shared mappings are managed on separate
EPT roots. The private half is managed indirectly through calls into a
protected runtime environment called the TDX module, whereas the shared half
is managed within KVM in normal page tables.

Since calls into the TDX module are relatively slow, walking private page
tables by making calls into the TDX module would not be efficient. Because
of this, previous changes have taught the TDP MMU to keep a mirror root,
which is a separate, unmapped TDP root that private operations can be
directed to. Currently this root is disconnected from the guest. Now add
plumbing to "reflect" changes to the mirror to the page tables being
mirrored. Just create the x86_ops for now, leave plumbing the operations
into the TDX module for future patches.

Add two operations for tearing down page tables, one for freeing page
tables (reflect_free_spt) and one for zapping PTEs (reflect_remove_spte).
Define them such that reflect_remove_spte will perform a TLB flush as well
(in TDX terms, "ensure there are no active translations").

TDX MMU support will exclude certain MMU operations, so only plug in the
mirroring x86 ops where they will be needed. For zapping/freeing, only
hook tdp_mmu_iter_set_spte() which is used for mapping and linking
PTs. Don't bother hooking tdp_mmu_set_spte_atomic() as it is only used for
zapping PTEs in operations unsupported by TDX: zapping collapsible PTEs and
kvm_mmu_zap_all_fast().

In previous changes to address races around concurrent populating using
tdp_mmu_set_spte_atomic(), a solution was introduced to temporarily set
REMOVED_SPTE in the mirrored page tables while performing the "reflect"
operations. Such a solution is not needed for the tear down paths in TDX
as these will always be performed with the mmu_lock held for write.
Sprinkle some KVM_BUG_ON()s to reflect this.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
Co-developed-by: Kai Huang <kai.huang@xxxxxxxxx>
Signed-off-by: Kai Huang <kai.huang@xxxxxxxxx>
Co-developed-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
---
TDX MMU Prep v2:
- Split from "KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU"
- Rename x86_ops from "private" to "reflect"
- In response to "sp->mirrored_spt" rename helpers to "mirrored"
- Remove unused present mirroring support in tdp_mmu_set_spte()
- Merge reflect_zap_spte() into reflect_remove_spte()
- Move mirror zapping logic out of handle_changed_spte()
- Add some KVM_BUG_ONs
---
arch/x86/include/asm/kvm-x86-ops.h | 2 ++
arch/x86/include/asm/kvm_host.h | 8 ++++++
arch/x86/kvm/mmu/tdp_mmu.c | 45 ++++++++++++++++++++++++++++--
3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 1877d6a77525..dae06afc6038 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -97,6 +97,8 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL(reflect_link_spt)
KVM_X86_OP_OPTIONAL(reflect_set_spte)
+KVM_X86_OP_OPTIONAL(reflect_free_spt)
+KVM_X86_OP_OPTIONAL(reflect_remove_spte)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 20bb10f22ca6..0df4a31a0df9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1755,6 +1755,14 @@ struct kvm_x86_ops {
int (*reflect_set_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn);

+ /* Update mirrored page tables for page table about to be freed */
+ int (*reflect_free_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *mirrored_spt);
+
+ /* Update mirrored page table from spte getting removed, and flush TLB */
+ int (*reflect_remove_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn);
+
bool (*has_wbinvd_exit)(void);

u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 41b1d3f26597..1245f6a48dbe 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -346,6 +346,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

+static void reflect_removed_spte(struct kvm *kvm, gfn_t gfn,
+ u64 old_spte, u64 new_spte,
+ int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool was_leaf = was_present && is_last_spte(old_spte, level);
+ kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+ int ret;
+
+ /*
+ * Reflect the removal of a mirror SPTE through the reflect_remove_spte
+ * x86 op. Only a present leaf SPTE may be zapped this way; for any
+ * other old_spte this function does nothing. Non-leaf page table
+ * pages are left to be reclaimed when the VM is destroyed.
+ *
+ * NOTE(review): new_spte is accepted but never read in this function.
+ */
+ if (!was_leaf)
+ return;
+
+ /* Zapping a leaf SPTE is allowed only when mmu_lock is held for write. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ /*
+ * Because the write lock is held, the operation is expected to succeed;
+ * a non-zero return value is flagged with KVM_BUG_ON().
+ */
+ ret = static_call(kvm_x86_reflect_remove_spte)(kvm, gfn, level, old_pfn);
+ KVM_BUG_ON(ret, kvm);
+}
+
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -441,6 +464,22 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_spte, REMOVED_SPTE, sp->role, shared);
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(static_call(kvm_x86_reflect_free_spt)(kvm, sp->gfn, sp->role.level,
+ kvm_mmu_mirrored_spt(sp)))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->mirrored_spt = NULL;
}

call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -778,9 +817,11 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
role.level = level;
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);

- /* Don't support setting for the non-atomic case */
- if (is_mirror_sptep(sptep))
+ if (is_mirror_sptep(sptep)) {
+ /* Only support zapping for the non-atomic case */
KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+ }

return old_spte;
}
--
2.34.1