[PATCH v3] x86/bhi: use TSX abort for mitigation on RTM systems

From: Jon Kohler
Date: Fri Sep 13 2024 - 14:53:01 EST


Introduce the ability to mitigate BHI via TSX aborts on systems that
support the RTM feature. The assembly for this mitigation was provided
by Intel [1] as "Listing 3"; it starts and immediately aborts a TSX
transaction, which causes the processor to clear the BHB.

Vulnerable systems that do not support RTM or have TSX disabled will
still use the clear_bhb_loop mitigation by default.

Furthermore, on hardware that supports BHI_DIS_S/X86_FEATURE_BHI_CTRL,
do not use hardware mitigation when using BHI_MITIGATION_VMEXIT_ONLY,
as this causes the value of MSR_IA32_SPEC_CTRL to change, inflicting
measurable KVM overhead.

Example:
In a typical eIBRS enabled system, such as Intel SPR, the SPEC_CTRL may
be commonly set to val == 1 to reflect eIBRS enablement; however,
SPEC_CTRL_BHI_DIS_S causes val == 1025. If the guests that KVM is
virtualizing do not also set the guest side value == 1025, KVM will
constantly have to wrmsr toggle the guest vs host value on both entry
and exit, delaying both.

In fact, if the VMM (such as qemu) does not expose BHI_CTRL and the
guest kernel does not understand BHI_CTRL, or the VMM does expose it and
the guest understands BHI_CTRL *but* the guest does not reboot to
reinitialize SPEC_CTRL, the guest val will never equal 1025, making
this overhead both painful and unavoidable.

Testing:
On an Intel SPR 6442Y, the KVM unit test tscdeadline_immed shows a
~17-18% speedup vs the existing default.

[1] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/branch-history-injection.html

Signed-off-by: Jon Kohler <jon@xxxxxxxxxxx>
Cc: Chao Gao <chao.gao@xxxxxxxxx>
Cc: Daniel Sneddon <daniel.sneddon@xxxxxxxxxxxxxxx>
Cc: Pawan Gupta <pawan.kumar.gupta@xxxxxxxxxxxxxxx>
---
v1: https://lore.kernel.org/kvm/20240912141156.231429-1-jon@xxxxxxxxxxx/
v2: Switched approach to TSX abort, addressed comments from Chao/Pawan
v3: Added changelog here, fixed small issue in v2 in bugs.c

arch/x86/entry/entry_64.S | 24 ++++++++++++++++++++++++
arch/x86/include/asm/cpufeatures.h | 2 ++
arch/x86/include/asm/nospec-branch.h | 8 ++++++--
arch/x86/kernel/cpu/bugs.c | 26 +++++++++++++++++++++-----
4 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1b5be07f8669..64e83caec40b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1552,3 +1552,27 @@ SYM_FUNC_START(clear_bhb_loop)
SYM_FUNC_END(clear_bhb_loop)
EXPORT_SYMBOL_GPL(clear_bhb_loop)
STACK_FRAME_NON_STANDARD(clear_bhb_loop)
+
+/*
+ * Aborting a TSX transactional region by invoking TSX abort also clears
+ * the BHB. This software sequence is an alternative to clear_bhb_loop,
+ * but it only works on processors that support Intel TSX. The TSX
+ * sequence is effective on all current processors with Intel TSX support
+ * that do not enumerate BHI_NO and should not be needed on parts that do
+ * enumerate BHI_NO. This sequence is effective on all current
+ * processors with Intel TSX support whether or not XBEGIN is configured
+ * to always abort, such as when the IA32_TSX_CTRL (0x122) RTM_DISABLE
+ * control is set.
+ */
+SYM_FUNC_START(clear_bhb_tsx_abort)
+ push %rbp
+ mov %rsp, %rbp
+ xbegin 1f /* fallback target: numeric local label, emits no symbol */
+ xabort $0
+ lfence
+1:
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_tsx_abort)
+EXPORT_SYMBOL_GPL(clear_bhb_tsx_abort)
+STACK_FRAME_NON_STANDARD(clear_bhb_tsx_abort)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index dd4682857c12..c6aa2d758389 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -473,6 +473,8 @@
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
+#define X86_FEATURE_CLEAR_BHB_TSX (21*32 + 6) /* Clear branch history at syscall entry using TSX abort */
+#define X86_FEATURE_CLEAR_BHB_TSX_ON_VMEXIT (21*32 + 7) /* Clear branch history at vmexit using TSX abort */

/*
* BUG word(s)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index ff5f1ecc7d1e..915a767b9053 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -328,11 +328,14 @@

#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
+ ALTERNATIVE_2 "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP, \
+ "call clear_bhb_tsx_abort", X86_FEATURE_CLEAR_BHB_TSX
.endm

.macro CLEAR_BRANCH_HISTORY_VMEXIT
- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT
+ ALTERNATIVE_2 "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT, \
+ "call clear_bhb_tsx_abort", X86_FEATURE_CLEAR_BHB_TSX_ON_VMEXIT
+
.endm
#else
#define CLEAR_BRANCH_HISTORY
@@ -383,6 +386,7 @@ extern void entry_ibpb(void);

#ifdef CONFIG_X86_64
extern void clear_bhb_loop(void);
+extern void clear_bhb_tsx_abort(void);
#endif

extern void (*x86_return_thunk)(void);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 45675da354f3..4837f3968954 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1662,8 +1662,16 @@ static void __init bhi_select_mitigation(void)
return;
}

- /* Mitigate in hardware if supported */
- if (spec_ctrl_bhi_dis())
+ /*
+ * Mitigate in hardware if appropriate.
+ * Note: for vmexit only, do not mitigate in hardware to avoid changing
+ * the value of MSR_IA32_SPEC_CTRL to include SPEC_CTRL_BHI_DIS_S. If a
+ * guest does not also set their own SPEC_CTRL to include this, KVM has
+ * to toggle on every vmexit and vmentry if the host value does not
+ * match the guest value. Instead, depend on software BHB clearing
+ * (TSX abort or clear_bhb_loop) only.
+ */
+ if (bhi_mitigation != BHI_MITIGATION_VMEXIT_ONLY && spec_ctrl_bhi_dis())
return;

if (!IS_ENABLED(CONFIG_X86_64))
@@ -1671,13 +1679,21 @@ static void __init bhi_select_mitigation(void)

if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ if (boot_cpu_has(X86_FEATURE_RTM))
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_TSX_ON_VMEXIT);
+ else
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
return;
}

pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_TSX);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_TSX_ON_VMEXIT);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ }
}

static void __init spectre_v2_select_mitigation(void)
--
2.43.0