[PATCH 4/7] KVM: TDX: Kick off vCPUs when SEAMCALL is busy during TD page removal

From: Yan Zhao
Date: Sun Jan 12 2025 - 21:13:50 EST


Kick off all vCPUs and prevent tdh_vp_enter() from executing whenever
tdh_mem_range_block()/tdh_mem_track()/tdh_mem_page_remove() encounters
contention, since the page removal path does not expect error and is less
sensitive to the performance penalty caused by kicking off vCPUs.

Although KVM has protected SEPT zap-related SEAMCALLs with kvm->mmu_lock,
KVM may still encounter TDX_OPERAND_BUSY due to the contention in the TDX
module.
- tdh_mem_track() may contend with tdh_vp_enter().
- tdh_mem_range_block()/tdh_mem_page_remove() may contend with
tdh_vp_enter() and TDCALLs.

Resources SHARED users EXCLUSIVE users
------------------------------------------------------------
TDCS epoch tdh_vp_enter tdh_mem_track
------------------------------------------------------------
SEPT tree tdh_mem_page_remove tdh_vp_enter (0-step mitigation)
tdh_mem_range_block
------------------------------------------------------------
SEPT entry tdh_mem_range_block (Host lock)
tdh_mem_page_remove (Host lock)
tdg_mem_page_accept (Guest lock)
tdg_mem_page_attr_rd (Guest lock)
tdg_mem_page_attr_wr (Guest lock)

Use a TDX specific per-VM flag wait_for_sept_zap along with
KVM_REQ_OUTSIDE_GUEST_MODE to kick off vCPUs and prevent them from entering
TD, thereby avoiding the potential contention. Apply the kick-off and no
vCPU entering only after each SEAMCALL busy error to minimize the window of
no TD entry, as the contention due to 0-step mitigation or TDCALLs is
expected to be rare.

Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
arch/x86/kvm/vmx/tdx.c | 62 ++++++++++++++++++++++++++++++++++++------
arch/x86/kvm/vmx/tdx.h | 7 +++++
2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index bb9d914765fc..09677a4cd605 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -312,6 +312,26 @@ static void tdx_clear_page(unsigned long page_pa)
__mb();
}

+static void tdx_no_vcpus_enter_start(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
+}
+
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(hpa_t pa)
{
@@ -979,6 +999,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
return EXIT_FASTPATH_NONE;
}

+ /*
+ * Wait until retry of SEPT-zap-related SEAMCALL completes before
+ * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+ * TDCALLs.
+ */
+ if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
trace_kvm_entry(vcpu, force_immediate_exit);

if (pi_test_on(&tdx->pi_desc)) {
@@ -1647,15 +1675,23 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
return -EINVAL;

- do {
+ /*
+ * When zapping private page, write lock is held. So no race condition
+ * with other vcpu sept operation.
+ * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ */
+ err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
+ &level_state);
+ if ((err & TDX_OPERAND_BUSY)) {
/*
- * When zapping private page, write lock is held. So no race
- * condition with other vcpu sept operation. Race only with
- * TDH.VP.ENTER.
+ * The second retry is expected to succeed after kicking off all
+ * other vCPUs and prevent them from invoking TDH.VP.ENTER.
*/
+ tdx_no_vcpus_enter_start(kvm);
err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
&level_state);
- } while (unlikely(err == TDX_ERROR_SEPT_BUSY));
+ tdx_no_vcpus_enter_stop(kvm);
+ }

if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
@@ -1726,8 +1762,12 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
WARN_ON_ONCE(level != PG_LEVEL_4K);

err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
- if (unlikely(err == TDX_ERROR_SEPT_BUSY))
- return -EAGAIN;
+ if (unlikely(err & TDX_OPERAND_BUSY)) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
return -EIO;
@@ -1770,9 +1810,13 @@ static void tdx_track(struct kvm *kvm)

lockdep_assert_held_write(&kvm->mmu_lock);

- do {
+ err = tdh_mem_track(kvm_tdx->tdr_pa);
+ if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
err = tdh_mem_track(kvm_tdx->tdr_pa);
- } while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
+ tdx_no_vcpus_enter_stop(kvm);
+ }

if (KVM_BUG_ON(err, kvm))
pr_tdx_error(TDH_MEM_TRACK, err);
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 0833d1084331..e369a6f8721b 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -48,6 +48,13 @@ struct kvm_tdx {

/* For KVM_TDX_INIT_MEM_REGION. */
atomic64_t nr_premapped;
+
+ /*
+ * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
+ * not contend with tdh_vp_enter() and TDCALLs.
+ * Set/unset is protected with kvm->mmu_lock.
+ */
+ bool wait_for_sept_zap;
};

/* TDX module vCPU states */
--
2.43.2