Re: [PATCH v10 016/108] KVM: TDX: create/destroy VM structure

From: Ackerley Tng
Date: Fri Dec 09 2022 - 14:15:44 EST



In tdx_vm_init, it is possible to have a double-reclaim, which
eventually causes a host crash. I have a selftest that reliably
reproduces this, and I believe the problem is that withiin
tdx_vm_free(), we don't reset kvm->tdcs = NULL and kvm->tdr.added to
false.

+int tdx_vm_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ int ret, i;
+ u64 err;
+
+ ret = tdx_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+
+ ret = tdx_alloc_td_page(&kvm_tdx->tdr);
+ if (ret)
+ goto free_hkid;
+
+ kvm_tdx->tdcs = kcalloc(tdx_caps.tdcs_nr_pages, sizeof(*kvm_tdx->tdcs),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!kvm_tdx->tdcs)
+ goto free_tdr;
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ ret = tdx_alloc_td_page(&kvm_tdx->tdcs[i]);
+ if (ret)
+ goto free_tdcs;
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto free_tdcs;
+ }
+ cpus_read_lock();
+ /*
+ * Need at least one CPU of the package to be online in order to
+ * program all packages for host key id. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+ * Because it's hard for human operator to figure out the
+ * reason, warn it.
+ */
+ pr_warn("All packages need to have online CPU to create TD. Online CPU and retry.\n");
+ goto free_packages;
+ }
+
+ /*
+ * Acquire global lock to avoid TDX_OPERAND_BUSY:
+ * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+ * Table (KOT) to track the assigned TDX private HKID. It doesn't spin
+ * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
+ * caller to handle the contention. This is because of time limitation
+ * usable inside the TDX module and OS/VMM knows better about process
+ * scheduling.
+ *
+ * APIs to acquire the lock of KOT:
+ * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+ * TDH.PHYMEM.CACHE.WB.
+ */
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_create(kvm_tdx->tdr.pa, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+ ret = -EIO;
+ goto free_packages;
+ }
+ tdx_mark_td_page_added(&kvm_tdx->tdr);
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+ * encryption key associated to a TDX private host key id
+ * assigned to this TDR. Concurrent operations on same memory
+ * controller results in TDX_OPERAND_BUSY. Avoid this race by
+ * mutex.
+ */
+ mutex_lock(&tdx_mng_key_config_lock[pkg]);
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ &kvm_tdx->tdr.pa, true);
+ mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret)
+ goto teardown;
+
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(kvm_tdx->tdr.pa, kvm_tdx->tdcs[i].pa);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+ ret = -EIO;
+ goto teardown;
+ }
+ tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
+ }
+
+ /*
+ * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated
+ * ioctl() to define the configure CPUID values for the TD.
+ */
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ tdx_mmu_release_hkid(kvm);
+ tdx_vm_free(kvm);
+ return ret;

If there is some error that causes an exit through teardown,
tdx_vm_free() will be called, which causes the resources to be
freed. However, tdx_vm_free() is called a second time when the selftest
(or qemu) exits, which causes a second reclaim to be performed.

+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+free_tdcs:
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ if (!kvm_tdx->tdcs[i].va)
+ continue;
+ free_page(kvm_tdx->tdcs[i].va);
+ }
+ kfree(kvm_tdx->tdcs);
+ kvm_tdx->tdcs = NULL;
+free_tdr:
+ if (kvm_tdx->tdr.va) {
+ free_page(kvm_tdx->tdr.va);
+ kvm_tdx->tdr.added = false;
+ kvm_tdx->tdr.va = 0;
+ kvm_tdx->tdr.pa = 0;
+ }
+free_hkid:
+ if (kvm_tdx->hkid != -1)
+ tdx_hkid_free(kvm_tdx);
+ return ret;
+}

The second reclaim is performed because kvm_tdx->tdcs is still set, and
kvm_tdx->tdr.added is still set, so the second two if blocks in
tdx_vm_free() are executed.

+void tdx_vm_free(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ int i;
+
+ /* Can't reclaim or free TD pages if teardown failed. */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->tdcs) {
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
+ tdx_reclaim_td_page(&kvm_tdx->tdcs[i]);
+ kfree(kvm_tdx->tdcs);
+ }
+
+ /*
+ * TDX module maps TDR with TDX global HKID. TDX module may access TDR
+ * while operating on TD (Especially reclaiming TDCS). Cache flush with
+ * TDX global HKID is needed.
+ */
+ if (kvm_tdx->tdr.added &&
+ tdx_reclaim_page(kvm_tdx->tdr.va, kvm_tdx->tdr.pa, true,
+ tdx_global_keyid))
+ return;
+
+ free_page(kvm_tdx->tdr.va);
+}

Here's the fix that stopped the crash I was observing

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 2e7916fb72a7..41d1ff1510c3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -405,6 +405,7 @@ void tdx_vm_free(struct kvm *kvm)
for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
tdx_reclaim_td_page(&kvm_tdx->tdcs[i]);
kfree(kvm_tdx->tdcs);
+ kvm_tdx->tdcs = NULL;
}

/*
@@ -418,6 +419,9 @@ void tdx_vm_free(struct kvm *kvm)
return;

free_page(kvm_tdx->tdr.va);
+ kvm_tdx->tdr.added = false;
+ kvm_tdx->tdr.va = 0;
+ kvm_tdx->tdr.pa = 0;
}

static int tdx_do_tdh_mng_key_config(void *param)