Re: 8aeb879baf12 - significant system call latency regression, bisected
From: H. Peter Anvin
Date: Fri Jun 19 2026 - 06:24:32 EST
On June 19, 2026 1:14:27 AM PDT, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>On Wed, Jun 17, 2026 at 02:37:18PM +0200, Peter Zijlstra wrote:
>> - makes -fno-jump-tables unconditional
>> - removes array_index_nospec() from the syscall dispatch
>
>FWIW, this also allows making all SYSCALLs __noendbr, very much
>including the 'legacy' sys_call_table :-)
>
>Compile tested with IA32_EMULATION=n and reliably yields:
>
>vmlinux.o: warning: objtool: sys_call_table+0x0: data relocation to !ENDBR: __x64_sys_read+0x0
>...
>vmlinux.o: warning: objtool: sys_call_table+0xeb0: data relocation to !ENDBR: __x64_sys_listns+0x0
>
>(which is just one little objtool patch away from being fixed)
>
>and boots fine (in kvm).
>
>---
>diff --git a/arch/x86/Makefile b/arch/x86/Makefile
>index 598f178102ee..b154a2a20eb2 100644
>--- a/arch/x86/Makefile
>+++ b/arch/x86/Makefile
>@@ -90,17 +90,8 @@ CC_FLAGS_FPU += -mhard-float
> endif
>
> ifeq ($(CONFIG_X86_KERNEL_IBT),y)
>-#
>-# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
>-# NOTRACK prefixes. Current generation compilers unconditionally employ NOTRACK
>-# for jump-tables, as such, disable jump-tables for now.
>-#
>-# (jump-tables are implicitly disabled by RETPOLINE)
>-#
>-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
>-#
>-KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
>-KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
>+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch)
>+KBUILD_RUSTFLAGS += -Zcf-protection=branch
> else
> KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
> endif
>@@ -173,6 +164,13 @@ endif
> KBUILD_RUSTFLAGS += -Ccode-model=kernel
>
> percpu_seg := gs
>+
>+ # Due to retpolines and cf-protection=branch's implicit NOTRACK usage
>+ # for jump-tables, blanket disable jump-tables for all x86_64 builds to
>+ # get a consistent behaviour across configurations. This allows
>+ # removing some array_index_nospec() usage.
>+ KBUILD_CFLAGS += -fno-jump-tables
>+ KBUILD_RUSTFLAGS += $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
> endif
>
> ifeq ($(CONFIG_STACKPROTECTOR),y)
>@@ -209,15 +207,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
> ifdef CONFIG_MITIGATION_RETPOLINE
> KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
> KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS)
>- # Additionally, avoid generating expensive indirect jumps which
>- # are subject to retpolines for small number of switch cases.
>- # LLVM turns off jump table generation by default when under
>- # retpoline builds, however, gcc does not for x86. This has
>- # only been fixed starting from gcc stable version 8.4.0 and
>- # onwards, but not for older ones. See gcc bug #86952.
>- ifndef CONFIG_CC_IS_CLANG
>- KBUILD_CFLAGS += -fno-jump-tables
>- endif
> endif
>
> ifdef CONFIG_MITIGATION_SLS
>diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
>index 71f032504e73..7e87947e12be 100644
>--- a/arch/x86/entry/syscall_64.c
>+++ b/arch/x86/entry/syscall_64.c
>@@ -8,9 +8,10 @@
> #include <linux/entry-common.h>
> #include <linux/nospec.h>
> #include <asm/syscall.h>
>+#include <asm/ibt.h>
>
>-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
>-#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
>+#define __SYSCALL(nr, sym) extern __noendbr long __x64_##sym(const struct pt_regs *);
>+#define __SYSCALL_NORETURN(nr, sym) extern __noendbr long __noreturn __x64_##sym(const struct pt_regs *);
> #include <asm/syscalls_64.h>
> #ifdef CONFIG_X86_X32_ABI
> #include <asm/syscalls_x32.h>
>@@ -25,30 +26,47 @@
> * kernel/trace/trace_syscalls.c still wants to know the system
> * call address.
> */
>-#define __SYSCALL(nr, sym) __x64_##sym,
>+#define __SYSCALL(nr, sym) (void *)&__x64_##sym,
> const sys_call_ptr_t sys_call_table[] = {
> #include <asm/syscalls_64.h>
> };
> #undef __SYSCALL
>
> #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
>-long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
>+static noinstr long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
> {
>+ /*
>+ * Because -fno-jump-tables, this compiles into a binary branch tree
>+ * rather than a jump-table. As such @nr is not used as an array
>+ * index. Additionally, this is an out-of-line function on purpose,
>+ * such that all the actual syscall function calls are tail-calls,
>+ * returning to our caller for the common bits.
>+ */
>+ instrumentation_begin();
> switch (nr) {
> #include <asm/syscalls_64.h>
> default: return __x64_sys_ni_syscall(regs);
> }
>+ instrumentation_end();
> }
>
> #ifdef CONFIG_X86_X32_ABI
>-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
>+static noinstr long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> {
>+ instrumentation_begin();
> switch (nr) {
> #include <asm/syscalls_x32.h>
> default: return __x64_sys_ni_syscall(regs);
> }
>+ instrumentation_end();
>+}
>+#else
>+static __always_inline long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
>+{
>+ return __x64_sys_ni_syscall(regs);
> }
> #endif
>+#undef __SYSCALL
>
> static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
> {
>@@ -59,7 +77,6 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
> unsigned int unr = nr;
>
> if (likely(unr < NR_syscalls)) {
>- unr = array_index_nospec(unr, NR_syscalls);
> regs->ax = x64_sys_call(regs, unr);
> return true;
> }
>@@ -76,7 +93,6 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
> unsigned int xnr = nr - __X32_SYSCALL_BIT;
>
> if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
>- xnr = array_index_nospec(xnr, X32_NR_syscalls);
> regs->ax = x32_sys_call(regs, xnr);
> return true;
> }
>@@ -84,7 +100,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
> }
>
> /* Returns true to return using SYSRET, or false to use IRET */
>-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
>+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
> {
> nr = syscall_enter_from_user_mode(regs, nr);
>
>diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
>index 7e88705e907f..1639fbc02680 100644
>--- a/arch/x86/include/asm/syscall_wrapper.h
>+++ b/arch/x86/include/asm/syscall_wrapper.h
>@@ -7,9 +7,10 @@
> #define _ASM_X86_SYSCALL_WRAPPER_H
>
> #include <asm/ptrace.h>
>+#include <asm/ibt.h>
>
>-extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
>-extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
>+extern __noendbr long __x64_sys_ni_syscall(const struct pt_regs *regs);
>+extern __noendbr long __ia32_sys_ni_syscall(const struct pt_regs *regs);
>
> /*
> * Instead of the generic __SYSCALL_DEFINEx() definition, the x86 version takes
>@@ -83,15 +84,15 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
> __MAP(x, __SC_TYPE, __VA_ARGS__)) \
>
> #define __SYS_STUB0(abi, name) \
>- long __##abi##_##name(const struct pt_regs *regs); \
>+ long __noendbr __##abi##_##name(const struct pt_regs *regs); \
> ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
>- long __##abi##_##name(const struct pt_regs *regs) \
>+ long __noendbr __##abi##_##name(const struct pt_regs *regs) \
> __alias(__do_##name);
>
> #define __SYS_STUBx(abi, name, ...) \
>- long __##abi##_##name(const struct pt_regs *regs); \
>+ long __noendbr __##abi##_##name(const struct pt_regs *regs); \
> ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
>- long __##abi##_##name(const struct pt_regs *regs) \
>+ long __noendbr __##abi##_##name(const struct pt_regs *regs) \
> { \
> return __se_##name(__VA_ARGS__); \
> }
>@@ -257,8 +258,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
> * For VSYSCALLS, we need to declare these three syscalls with the new
> * pt_regs-based calling convention for in-kernel use.
> */
>-long __x64_sys_getcpu(const struct pt_regs *regs);
>-long __x64_sys_gettimeofday(const struct pt_regs *regs);
>-long __x64_sys_time(const struct pt_regs *regs);
>+long __noendbr __x64_sys_getcpu(const struct pt_regs *regs);
>+long __noendbr __x64_sys_gettimeofday(const struct pt_regs *regs);
>+long __noendbr __x64_sys_time(const struct pt_regs *regs);
>
> #endif /* _ASM_X86_SYSCALL_WRAPPER_H */
*Very* nice indeed.
I was definitely hoping this would be the next step.