Re: [PATCH v8 03/12] x86/retpoline: Add initial retpoline support

From: Tom Lendacky
Date: Thu Jan 11 2018 - 18:58:31 EST


On 1/11/2018 3:46 PM, David Woodhouse wrote:
> Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide
> the corresponding thunks. Provide assembler macros for invoking the thunks
> in the same way that GCC does, from native and inline assembler.
>
> This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In
> some circumstances, IBRS microcode features may be used instead, and the
> retpoline can be disabled.
>
> On AMD CPUs if lfence is serialising, the retpoline can be dramatically
> simplified to a simple "lfence; jmp *\reg". A future patch, after it has
> been verified that lfence really is serialising in all circumstances, can
> enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition
> to X86_FEATURE_RETPOLINE.
>
> Do not align the retpoline in the altinstr section, because there is no
> guarantee that it stays aligned when it's copied over the oldinstr during
> alternative patching.
>
> [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks]
> [ tglx: Put actual function CALL/JMP in front of the macros, convert to
> symbolic labels ]
> [ dwmw2: Convert back to numeric labels, merge objtool fixes ]
>
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Acked-by: Arjan van de Ven <arjan@xxxxxxxxxxxxxxx>
> Acked-by: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: gnomes@xxxxxxxxxxxxxxxxxxx
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Cc: Jiri Kosina <jikos@xxxxxxxxxx>
> Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
> Cc: Kees Cook <keescook@xxxxxxxxxx>
> Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxxx>
> Cc: Paul Turner <pjt@xxxxxxxxxx>
> Link: https://lkml.kernel.org/r/1515508997-6154-2-git-send-email-dwmw@xxxxxxxxxxxx
> ---
> arch/x86/Kconfig | 13 ++++
> arch/x86/Makefile | 10 +++
> arch/x86/include/asm/asm-prototypes.h | 25 +++++++
> arch/x86/include/asm/cpufeatures.h | 2 +
> arch/x86/include/asm/nospec-branch.h | 128 ++++++++++++++++++++++++++++++++++
> arch/x86/kernel/cpu/common.c | 4 ++
> arch/x86/lib/Makefile | 1 +
> arch/x86/lib/retpoline.S | 48 +++++++++++++
> 8 files changed, 231 insertions(+)
> create mode 100644 arch/x86/include/asm/nospec-branch.h
> create mode 100644 arch/x86/lib/retpoline.S
>

...

> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> new file mode 100644
> index 0000000..e20e92e
> --- /dev/null
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -0,0 +1,128 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __NOSPEC_BRANCH_H__
> +#define __NOSPEC_BRANCH_H__
> +
> +#include <asm/alternative.h>
> +#include <asm/alternative-asm.h>
> +#include <asm/cpufeatures.h>
> +
> +#ifdef __ASSEMBLY__
> +
> +/*
> + * This should be used immediately before a retpoline alternative. It tells
> + * objtool where the retpolines are so that it can make sense of the control
> + * flow by just reading the original instruction(s) and ignoring the
> + * alternatives.
> + */
> +.macro ANNOTATE_NOSPEC_ALTERNATIVE
> + .Lannotate_\@:
> + .pushsection .discard.nospec
> + .long .Lannotate_\@ - .
> + .popsection
> +.endm
> +
> +/*
> + * These are the bare retpoline primitives for indirect jmp and call.
> + * Do not use these directly; they only exist to make the ALTERNATIVE
> + * invocation below less ugly.
> + */
> +.macro RETPOLINE_JMP reg:req
> + call .Ldo_rop_\@
> +.Lspec_trap_\@:
> + pause

I talked with our engineers some more about using pause vs. lfence. Pause
is not serializing on AMD, so the pause/jmp loop will consume power while
it is being speculatively executed, until the return's misprediction is
resolved and execution is redirected to the correct target. Can this be
changed back to lfence? It looked like a very small difference in
cycles/time.

Thanks,
Tom

> + jmp .Lspec_trap_\@
> +.Ldo_rop_\@:
> + mov \reg, (%_ASM_SP)
> + ret
> +.endm
> +
> +/*
> + * This is a wrapper around RETPOLINE_JMP so the called function in reg
> + * returns to the instruction after the macro.
> + */
> +.macro RETPOLINE_CALL reg:req
> + jmp .Ldo_call_\@
> +.Ldo_retpoline_jmp_\@:
> + RETPOLINE_JMP \reg
> +.Ldo_call_\@:
> + call .Ldo_retpoline_jmp_\@
> +.endm
> +
> +/*
> + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
> + * indirect jmp/call which may be susceptible to the Spectre variant 2
> + * attack.
> + */
> +.macro JMP_NOSPEC reg:req
> +#ifdef CONFIG_RETPOLINE
> + ANNOTATE_NOSPEC_ALTERNATIVE
> + ALTERNATIVE_2 __stringify(jmp *\reg), \
> + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
> + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
> +#else
> + jmp *\reg
> +#endif
> +.endm
> +
> +.macro CALL_NOSPEC reg:req
> +#ifdef CONFIG_RETPOLINE
> + ANNOTATE_NOSPEC_ALTERNATIVE
> + ALTERNATIVE_2 __stringify(call *\reg), \
> + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
> + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
> +#else
> + call *\reg
> +#endif
> +.endm
> +
> +#else /* __ASSEMBLY__ */
> +
> +#define ANNOTATE_NOSPEC_ALTERNATIVE \
> + "999:\n\t" \
> + ".pushsection .discard.nospec\n\t" \
> + ".long 999b - .\n\t" \
> + ".popsection\n\t"
> +
> +#if defined(CONFIG_X86_64) && defined(RETPOLINE)
> +
> +/*
> + * Since the inline asm uses the %V modifier which is only in newer GCC,
> + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
> + */
> +# define CALL_NOSPEC \
> + ANNOTATE_NOSPEC_ALTERNATIVE \
> + ALTERNATIVE( \
> + "call *%[thunk_target]\n", \
> + "call __x86_indirect_thunk_%V[thunk_target]\n", \
> + X86_FEATURE_RETPOLINE)
> +# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
> +
> +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
> +/*
> + * For i386 we use the original ret-equivalent retpoline, because
> + * otherwise we'll run out of registers. We don't care about CET
> + * here, anyway.
> + */
> +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \
> + " jmp 904f;\n" \
> + " .align 16\n" \
> + "901: call 903f;\n" \
> + "902: pause;\n" \
> + " jmp 902b;\n" \
> + " .align 16\n" \
> + "903: addl $4, %%esp;\n" \
> + " pushl %[thunk_target];\n" \
> + " ret;\n" \
> + " .align 16\n" \
> + "904: call 901b;\n", \
> + X86_FEATURE_RETPOLINE)
> +
> +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
> +#else /* No retpoline */
> +# define CALL_NOSPEC "call *%[thunk_target]\n"
> +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
> +#endif
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* __NOSPEC_BRANCH_H__ */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 372ba3f..7a671d1 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
> setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
> setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
>
> +#ifdef CONFIG_RETPOLINE
> + setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
> +#endif
> +
> fpu__init_system(c);
>
> #ifdef CONFIG_X86_32
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index 457f681..d435c89 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
> lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
> lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
> lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> +lib-$(CONFIG_RETPOLINE) += retpoline.o
>
> obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
>
> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
> new file mode 100644
> index 0000000..cb45c6c
> --- /dev/null
> +++ b/arch/x86/lib/retpoline.S
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/stringify.h>
> +#include <linux/linkage.h>
> +#include <asm/dwarf2.h>
> +#include <asm/cpufeatures.h>
> +#include <asm/alternative-asm.h>
> +#include <asm/export.h>
> +#include <asm/nospec-branch.h>
> +
> +.macro THUNK reg
> + .section .text.__x86.indirect_thunk.\reg
> +
> +ENTRY(__x86_indirect_thunk_\reg)
> + CFI_STARTPROC
> + JMP_NOSPEC %\reg
> + CFI_ENDPROC
> +ENDPROC(__x86_indirect_thunk_\reg)
> +.endm
> +
> +/*
> + * Despite being an assembler file we can't just use .irp here
> + * because __KSYM_DEPS__ only uses the C preprocessor and would
> + * only see one instance of "__x86_indirect_thunk_\reg" rather
> + * than one per register with the correct names. So we do it
> + * the simple and nasty way...
> + */
> +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg)
> +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
> +
> +GENERATE_THUNK(_ASM_AX)
> +GENERATE_THUNK(_ASM_BX)
> +GENERATE_THUNK(_ASM_CX)
> +GENERATE_THUNK(_ASM_DX)
> +GENERATE_THUNK(_ASM_SI)
> +GENERATE_THUNK(_ASM_DI)
> +GENERATE_THUNK(_ASM_BP)
> +GENERATE_THUNK(_ASM_SP)
> +#ifdef CONFIG_64BIT
> +GENERATE_THUNK(r8)
> +GENERATE_THUNK(r9)
> +GENERATE_THUNK(r10)
> +GENERATE_THUNK(r11)
> +GENERATE_THUNK(r12)
> +GENERATE_THUNK(r13)
> +GENERATE_THUNK(r14)
> +GENERATE_THUNK(r15)
> +#endif
>