[PATCH 2/3] x86/retpoline: Simplify vmexit_fill_RSB()

From: David Woodhouse
Date: Sat Jan 27 2018 - 11:24:57 EST


From: Borislav Petkov <bp@xxxxxxxxx>

Simplify it to call an asm-function instead of pasting 41 insn bytes at
every call site. Also, add alignment to the macro as suggested here:

https://support.google.com/faqs/answer/7625886

[dwmw2: Clean up comments, let it clobber %ebx and just tell the compiler]
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
arch/x86/entry/entry_32.S | 3 +-
arch/x86/entry/entry_64.S | 3 +-
arch/x86/include/asm/asm-prototypes.h | 3 ++
arch/x86/include/asm/nospec-branch.h | 70 ++++-------------------------------
arch/x86/lib/Makefile | 1 +
arch/x86/lib/retpoline.S | 56 ++++++++++++++++++++++++++++
6 files changed, 71 insertions(+), 65 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 60c4c34..2a35b1e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -252,7 +252,8 @@ ENTRY(__switch_to_asm)
* exist, overwrite the RSB with entries which capture
* speculative execution to prevent attack.
*/
- FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+ /* Clobbers %ebx */
+ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

/* restore callee-saved registers */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 63f4320..b4f0098 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -495,7 +495,8 @@ ENTRY(__switch_to_asm)
* exist, overwrite the RSB with entries which capture
* speculative execution to prevent attack.
*/
- FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+ /* Clobbers %rbx */
+ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

/* restore callee-saved registers */
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 1908214..4d11161 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -38,4 +38,7 @@ INDIRECT_THUNK(dx)
INDIRECT_THUNK(si)
INDIRECT_THUNK(di)
INDIRECT_THUNK(bp)
+asmlinkage void __fill_rsb(void);
+asmlinkage void __clear_rsb(void);
+
#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 19ecb54..df4ececa 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -7,50 +7,6 @@
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>

-/*
- * Fill the CPU return stack buffer.
- *
- * Each entry in the RSB, if used for a speculative 'ret', contains an
- * infinite 'pause; lfence; jmp' loop to capture speculative execution.
- *
- * This is required in various cases for retpoline and IBRS-based
- * mitigations for the Spectre variant 2 vulnerability. Sometimes to
- * eliminate potentially bogus entries from the RSB, and sometimes
- * purely to ensure that it doesn't get empty, which on some CPUs would
- * allow predictions from other (unwanted!) sources to be used.
- *
- * We define a CPP macro such that it can be used from both .S files and
- * inline assembly. It's possible to do a .macro and then include that
- * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
- */
-
-#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
-#define RSB_FILL_LOOPS 16 /* To avoid underflow */
-
-/*
- * Google experimented with loop-unrolling and this turned out to be
- * the optimal version â two calls, each with their own speculation
- * trap should their return address end up getting used, in a loop.
- */
-#define __FILL_RETURN_BUFFER(reg, nr, sp) \
- mov $(nr/2), reg; \
-771: \
- call 772f; \
-773: /* speculation trap */ \
- pause; \
- lfence; \
- jmp 773b; \
-772: \
- call 774f; \
-775: /* speculation trap */ \
- pause; \
- lfence; \
- jmp 775b; \
-774: \
- dec reg; \
- jnz 771b; \
- add $(BITS_PER_LONG/8) * nr, sp;
-
#ifdef __ASSEMBLY__

/*
@@ -121,17 +77,10 @@
#endif
.endm

- /*
- * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
- * monstrosity above, manually.
- */
-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+/* This clobbers the BX register */
+.macro FILL_RETURN_BUFFER nr:req ftr:req
#ifdef CONFIG_RETPOLINE
- ANNOTATE_NOSPEC_ALTERNATIVE
- ALTERNATIVE "jmp .Lskip_rsb_\@", \
- __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
- \ftr
-.Lskip_rsb_\@:
+ ALTERNATIVE "", "call __clear_rsb", \ftr
#endif
.endm

@@ -206,15 +155,10 @@ extern char __indirect_thunk_end[];
static inline void vmexit_fill_RSB(void)
{
#ifdef CONFIG_RETPOLINE
- unsigned long loops;
-
- asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
- ALTERNATIVE("jmp 910f",
- __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
- X86_FEATURE_RETPOLINE)
- "910:"
- : "=r" (loops), ASM_CALL_CONSTRAINT
- : : "memory" );
+ alternative_input("",
+ "call __fill_rsb",
+ X86_FEATURE_RETPOLINE,
+ ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
#endif
}

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index d435c89..d0a3170 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_RETPOLINE) += retpoline.o
+OBJECT_FILES_NON_STANDARD_retpoline.o :=y

obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o

diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index c909961..3040848 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -7,6 +7,7 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
+#include <asm/bitsperlong.h>

.macro THUNK reg
.section .text.__x86.indirect_thunk
@@ -46,3 +47,58 @@ GENERATE_THUNK(r13)
GENERATE_THUNK(r14)
GENERATE_THUNK(r15)
#endif
+
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; lfence; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version â two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+.macro STUFF_RSB nr:req sp:req
+ mov $(\nr / 2), %_ASM_BX
+ .align 16
+771:
+ call 772f
+773: /* speculation trap */
+ pause
+ lfence
+ jmp 773b
+ .align 16
+772:
+ call 774f
+775: /* speculation trap */
+ pause
+ lfence
+ jmp 775b
+ .align 16
+774:
+ dec %_ASM_BX
+ jnz 771b
+ add $((BITS_PER_LONG/8) * \nr), \sp
+.endm
+
+#define RSB_FILL_LOOPS 16 /* To avoid underflow */
+
+ENTRY(__fill_rsb)
+ STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
+ ret
+END(__fill_rsb)
+EXPORT_SYMBOL_GPL(__fill_rsb)
+
+#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
+
+ENTRY(__clear_rsb)
+ STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
+ ret
+END(__clear_rsb)
+EXPORT_SYMBOL_GPL(__clear_rsb)
--
2.7.4