Re: [PATCH 14/21] KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table

From: Huang, Kai
Date: Thu Sep 05 2024 - 22:10:37 EST



--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void)
* is KVM may allocate couple of more bytes than needed for
* each VM.
*/
- if (enable_tdx)
+ if (enable_tdx) {
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
sizeof(struct kvm_tdx));
+ /*
+ * Note, TDX may fail to initialize in a later time in
+ * vt_init(), in which case it is not necessary to setup
+ * those callbacks. But making them valid here even
+ * when TDX fails to init later is fine because those
+ * callbacks won't be called if the VM isn't TDX guest.
+ */
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;

Nit: The callbacks in 'struct kvm_x86_ops' have name "external", but TDX callbacks have name "private". Should we rename TDX callbacks to make them aligned?

+ }
return 0;
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 6feb3ab96926..b8cd5a629a80 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
+static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ hpa_t hpa = pfn_to_hpa(pfn);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state);
+ if (unlikely(err == TDX_ERROR_SEPT_BUSY)) {
+ tdx_unpin(kvm, pfn);
+ return -EAGAIN;
+ }

Nit: Here (and other non-fatal error cases) I think we should return -EBUSY to make it consistent with non-TDX case? E.g., the non-TDX case has:

if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;

And the comment of tdp_mmu_set_spte_atomic() currently says it can only return 0 or -EBUSY. It needs to be patched to reflect it can also return other non-0 errors like -EIO but those are fatal. In terms of non-fatal error I don't think we need another -EAGAIN.

/*
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically

[...]

* Return:
* * 0 - If the SPTE was set.
* * -EBUSY - If the SPTE cannot be set. In this case this function will
* have no side-effects other than setting iter->old_spte to
* the last known value of the spte.
*/

[...]

+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{

[...]

+
+ hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid);
+ do {
+ /*
+ * TDX_OPERAND_BUSY can happen on locking PAMT entry. Because
+ * this page was removed above, other thread shouldn't be
+ * repeatedly operating on this page. Just retry loop.
+ */
+ err = tdh_phymem_page_wbinvd(hpa_with_hkid);
+ } while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX)));

In what case(s) other threads can concurrently lock the PAMT entry, leading to the above BUSY?

[...]

+
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ int ret;
+
+ /*
+ * HKID is released when vm_free() which is after closing gmem_fd

From latest dev branch HKID is freed from vt_vm_destroy(), but not vm_free() (which should be tdx_vm_free() btw).

static void vt_vm_destroy(struct kvm *kvm)
{
if (is_td(kvm))
return tdx_mmu_release_hkid(kvm);

vmx_vm_destroy(kvm);
}

Btw, why not have a tdx_vm_destroy() wrapper? Seems all other vt_xx()s have a tdx_xx() but only this one calls tdx_mmu_release_hkid() directly.

+ * which causes gmem invalidation to zap all spte.
+ * Population is only allowed after KVM_TDX_INIT_VM.
+ */

What does the second sentence ("Population ...") meaning? Why is it relevant here?