Re: [PATCH v2 12/12] x86/kvm/emulate: Avoid RET for fastops

From: Peter Zijlstra
Date: Mon Nov 11 2024 - 11:27:58 EST


On Mon, Nov 11, 2024 at 12:59:47PM +0100, Peter Zijlstra wrote:

> +/*
> + * All the FASTOP magic above relies on there being *one* instance of this
> + * so it can JMP back, avoiding RET and its various thunks.
> + */
> +static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
> {
> ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
>
> if (!(ctxt->d & ByteOp))
> fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
>
> - asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
> + asm("push %[flags]; popf \n\t"
> + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
> + ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
> + JMP_NOSPEC
> + "fastop_return: \n\t"
> + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
> + "pushf; pop %[flags]\n"
> : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
> [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
> : "c"(ctxt->src2.val));

So Andrew is telling me the compiler is free to mess this up... Notably:

https://github.com/llvm/llvm-project/issues/92161

In light of that, I wrote the below hack. It makes objtool sad (it
doesn't like STT_FUNC calling STT_NOTYPE), but it should work if we
ever run into the compiler being daft like that (it should fail to
compile because of the duplicate fastop_return label, so it's not a
silent failure).
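
To illustrate (a minimal sketch, not from the tree; demo() is made up):
a global label inside inline asm breaks the moment the compiler emits
the statement more than once, e.g. by inlining it into two callers:

	static inline void demo(void)
	{
		/*
		 * A second expansion of this statement yields:
		 *   Error: symbol `fastop_return' is already defined
		 */
		asm("fastop_return:");
	}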

Wear protective eye gear before continuing...

---
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -429,9 +429,9 @@ static inline void call_depth_return_thu

#ifdef CONFIG_X86_64

-#define __CS_PREFIX \
+#define __CS_PREFIX(reg) \
".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \
- ".ifc %V[thunk_target],\\rs\n" \
+ ".ifc " reg ",\\rs\n" \
".byte 0x2e\n" \
".endif\n" \
".endr\n"
@@ -441,12 +441,12 @@ static inline void call_depth_return_thu
* which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
*/
# define CALL_NOSPEC \
- __CS_PREFIX \
+ __CS_PREFIX("%V[thunk_target]") \
"call __x86_indirect_thunk_%V[thunk_target]\n"

-# define JMP_NOSPEC \
- __CS_PREFIX \
- "jmp __x86_indirect_thunk_%V[thunk_target]\n"
+# define __JMP_NOSPEC(reg) \
+ __CS_PREFIX(reg) \
+ "jmp __x86_indirect_thunk_" reg "\n"

# define THUNK_TARGET(addr) [thunk_target] "r" (addr)

@@ -478,10 +478,10 @@ static inline void call_depth_return_thu
"call *%[thunk_target]\n", \
X86_FEATURE_RETPOLINE_LFENCE)

-# define JMP_NOSPEC \
+# define __JMP_NOSPEC(reg) \
ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
- "jmp *%[thunk_target]\n", \
+ "jmp *%%" reg "\n", \
" jmp 901f;\n" \
" .align 16\n" \
"901: call 903f;\n" \
@@ -490,22 +490,25 @@ static inline void call_depth_return_thu
" jmp 902b;\n" \
" .align 16\n" \
"903: lea 4(%%esp), %%esp;\n" \
- " pushl %[thunk_target];\n" \
+ " pushl %%" reg "\n" \
" ret;\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
- "jmp *%[thunk_target]\n", \
+ "jmp *%%" reg "\n", \
X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
+
#else /* No retpoline for C / inline asm */
# define CALL_NOSPEC "call *%[thunk_target]\n"
-# define JMP_NOSPEC "jmp *%[thunk_target]\n"
+# define __JMP_NOSPEC(reg) "jmp *%%" reg "\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif

+# define JMP_NOSPEC __JMP_NOSPEC("%V[thunk_target]")
+
/* The Spectre V2 mitigation variants */
enum spectre_v2_mitigation {
SPECTRE_V2_NONE,
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5039,23 +5039,45 @@ static void fetch_possible_mmx_operand(s
}

/*
+ * Stub written in asm in order to ensure GCC doesn't duplicate the
+ * fastop_return: label.
+ *
+ * Custom calling convention.
+ *
+ * __fastop:
+ * ax = ctxt->dst.val
+ * dx = ctxt->src.val
+ * cx = ctxt->src2.val
+ * di = flags
+ * si = fop
+ */
+asm (ASM_FUNC_ALIGN
+ "__fastop: \n\t"
+ "push %" _ASM_DI "\n\t"
+ "popf \n\t"
+ UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
+ ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
+ __JMP_NOSPEC(_ASM_SI)
+ "fastop_return: \n\t"
+ UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
+ "pushf \n\t"
+ "pop %" _ASM_DI "\n\t"
+ ASM_RET
+ ".type __fastop, @notype \n\t"
+ ".size __fastop, . - __fastop \n\t");
+
+/*
* All the FASTOP magic above relies on there being *one* instance of this
 * so it can JMP back, avoiding RET and its various thunks.
*/
-static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
{
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;

if (!(ctxt->d & ByteOp))
fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;

- asm("push %[flags]; popf \n\t"
- UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
- ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
- JMP_NOSPEC
- "fastop_return: \n\t"
- UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
- "pushf; pop %[flags]\n"
+ asm("call __fastop"
: "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
[thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
: "c"(ctxt->src2.val));