Re: [PATCH 0/2] VMSCAPE optimization for BHI variant

From: Pawan Gupta

Date: Mon Sep 29 2025 - 21:22:17 EST


On Mon, Sep 29, 2025 at 07:12:03AM +0200, Jack Wang wrote:
> From: Pawan Gupta <pawan.kumar.gupta@xxxxxxxxxxxxxxx>
>
> Hi Pawan,
>
> Thx for the patches, I tested them on our Intel SierraForest machine with
> fio 4k randread/randwrite from guest, qemu virtio-blk, noticed nice
> performance improvement comparing to the default IBPB before exit to
> userspace mitigation. eg with default IBPB mitigation fio gets 204k IOPS,
> with this new Clear BHB before exit to userspace gets 323k IOPS.

Thanks for sharing the results.

I realized the LFENCE in the clear_bhb_long_loop() is not required. The
ring3 transition after the loop should be serializing anyway. The patch
below gets rid of that LFENCE. It should give some performance boost as well.

--- 8< ---
From: Pawan Gupta <pawan.kumar.gupta@xxxxxxxxxxxxxxx>
Subject: [PATCH] x86/vmscape: Remove LFENCE from BHB clearing long loop

The long loop is used to clear the branch history when switching from a guest
to host userspace. The LFENCE barrier is not required, as the ring transition
itself acts as a barrier.

Move the prologue, LFENCE and epilogue out of __CLEAR_BHB_LOOP macro to
allow skipping the LFENCE in the long loop variant. Rename the long loop
function to clear_bhb_long_loop_no_barrier() to reflect the change.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@xxxxxxxxxxxxxxx>
---
arch/x86/entry/entry_64.S | 32 +++++++++++++++++-----------
arch/x86/include/asm/entry-common.h | 2 +-
arch/x86/include/asm/nospec-branch.h | 4 ++--
3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f5f62af080d8..bb456a3c652e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1525,10 +1525,6 @@ SYM_CODE_END(rewind_stack_and_make_dead)
* Target Selection, rather than taking the slowpath via its_return_thunk.
*/
.macro __CLEAR_BHB_LOOP outer_loop_count:req, inner_loop_count:req
- ANNOTATE_NOENDBR
- push %rbp
- mov %rsp, %rbp
-
movl $\outer_loop_count, %ecx
ANNOTATE_INTRA_FUNCTION_CALL
call 1f
@@ -1560,10 +1556,7 @@ SYM_CODE_END(rewind_stack_and_make_dead)
jnz 1b
.Lret2_\@:
RET
-5: lfence
-
- pop %rbp
- RET
+5:
.endm

/*
@@ -1573,7 +1566,15 @@ SYM_CODE_END(rewind_stack_and_make_dead)
* setting BHI_DIS_S for the guests.
*/
SYM_FUNC_START(clear_bhb_loop)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+
__CLEAR_BHB_LOOP 5, 5
+
+ lfence
+ pop %rbp
+ RET
SYM_FUNC_END(clear_bhb_loop)
EXPORT_SYMBOL_GPL(clear_bhb_loop)
STACK_FRAME_NON_STANDARD(clear_bhb_loop)
@@ -1584,8 +1585,15 @@ STACK_FRAME_NON_STANDARD(clear_bhb_loop)
* protects the kernel, but to mitigate the guest influence on the host
* userspace either IBPB or this sequence should be used. See VMSCAPE bug.
*/
-SYM_FUNC_START(clear_bhb_long_loop)
+SYM_FUNC_START(clear_bhb_long_loop_no_barrier)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+
__CLEAR_BHB_LOOP 12, 7
-SYM_FUNC_END(clear_bhb_long_loop)
-EXPORT_SYMBOL_GPL(clear_bhb_long_loop)
-STACK_FRAME_NON_STANDARD(clear_bhb_long_loop)
+
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_long_loop_no_barrier)
+EXPORT_SYMBOL_GPL(clear_bhb_long_loop_no_barrier)
+STACK_FRAME_NON_STANDARD(clear_bhb_long_loop_no_barrier)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index b7b9af1b6413..c70454bdd0e3 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -98,7 +98,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
indirect_branch_prediction_barrier();
else if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_EXIT_TO_USER))
- clear_bhb_long_loop();
+ clear_bhb_long_loop_no_barrier();

this_cpu_write(x86_pred_flush_pending, false);
}
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 32d52f32a5e7..151f5de1a430 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -388,9 +388,9 @@ extern void write_ibpb(void);

#ifdef CONFIG_X86_64
extern void clear_bhb_loop(void);
-extern void clear_bhb_long_loop(void);
+extern void clear_bhb_long_loop_no_barrier(void);
#else
-static inline void clear_bhb_long_loop(void) {}
+static inline void clear_bhb_long_loop_no_barrier(void) {}
#endif

extern void (*x86_return_thunk)(void);