[PATCH v3 4/7] x86: Rewrite switch_to() code

From: Brian Gerst
Date: Sat Aug 13 2016 - 12:40:06 EST


Move the low-level context switch code from complex inline asm to an
out-of-line asm stub. This makes it possible to construct a new stack
frame for the child process so that it flows seamlessly into
ret_from_fork, without an extra test and branch in __switch_to(). It
also improves code generation for __schedule() by using the C calling
convention instead of clobbering all registers.

Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
---
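
(Reviewer note, not part of the commit message: after this patch the
scheduler core sees the context switch as an ordinary C call. A minimal
sketch of the resulting contract, simplified from the hunks below, with
the kernel type stubbed out so the snippet stands alone:)

	struct task_struct;

	/*
	 * prev/next arrive in the usual argument registers (%eax/%edx
	 * under -mregparm=3 on 32-bit, %rdi/%rsi on 64-bit); the return
	 * value is the task we actually switched away from, which
	 * becomes 'last' in the caller.
	 */
	struct task_struct *__switch_to_asm(struct task_struct *prev,
					    struct task_struct *next);

	#define switch_to(prev, next, last) \
		((last) = __switch_to_asm((prev), (next)))
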
 arch/x86/entry/entry_32.S          |  37 ++++++++++
 arch/x86/entry/entry_64.S          |  41 ++++++++++-
 arch/x86/include/asm/processor.h   |   3 -
 arch/x86/include/asm/switch_to.h   | 137 ++++-------------------
 arch/x86/include/asm/thread_info.h |   2 -
 arch/x86/kernel/asm-offsets.c      |   6 ++
 arch/x86/kernel/asm-offsets_32.c   |   5 ++
 arch/x86/kernel/asm-offsets_64.c   |   5 ++
 arch/x86/kernel/process_32.c       |   9 ++-
 arch/x86/kernel/process_64.c       |   9 ++-
 arch/x86/kernel/smpboot.c          |   1 -
 11 files changed, 125 insertions(+), 130 deletions(-)
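
(Also for reviewers: the central trick is that copy_thread_tls() now
fabricates, directly below the child's pt_regs, the exact frame that
__switch_to_asm saves, with ret_addr pointing at ret_from_fork. The
child's first context switch pops the fabricated callee-saved registers,
and the ret at the end of __switch_to then "returns" into ret_from_fork.
A stand-alone C sketch of the 64-bit layout follows; struct pt_regs is
stubbed to keep the snippet compilable, so its size is illustrative
only:)

	#include <assert.h>
	#include <stddef.h>

	/* stand-in for the kernel's real struct pt_regs */
	struct pt_regs { unsigned long r[21]; };

	/*
	 * Must match the push order in __switch_to_asm: bp is pushed
	 * first (highest address), r15 last (lowest address).
	 */
	struct inactive_task_frame {
		unsigned long r15, r14, r13, r12;
		unsigned long bx;
		unsigned long bp;
		unsigned long ret_addr;	/* consumed by __switch_to's ret */
	};

	struct fork_frame {
		struct inactive_task_frame frame; /* thread.sp points here */
		struct pt_regs regs;	/* task_pt_regs() points here */
	};

	int main(void)
	{
		/*
		 * container_of(childregs, struct fork_frame, regs) in
		 * copy_thread_tls() relies on this adjacency:
		 */
		assert(offsetof(struct fork_frame, regs) ==
		       sizeof(struct inactive_task_frame));
		return 0;
	}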

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 0b56666..bf8f221 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -204,6 +204,43 @@
POP_GS_EX
.endm

+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ */
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f6b40e5..c1af8ac 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -368,13 +368,48 @@ END(ptregs_\func)
#include <asm/syscalls_64.h>

/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
* A newly forked process directly context switches into this address.
*
- * rdi: prev task we switched from
+ * rax: prev task we switched from
*/
ENTRY(ret_from_fork)
- LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+ movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */

testb $3, CS(%rsp) /* from kernel_thread? */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 63def95..6fee863 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -389,9 +389,6 @@ struct thread_struct {
unsigned short fsindex;
unsigned short gsindex;
#endif
-#ifdef CONFIG_X86_32
- unsigned long ip;
-#endif
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 02de86e..bf4e2ec 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,135 +2,40 @@
#define _ASM_X86_SWITCH_TO_H

struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+ struct task_struct *next);
+
__visible struct task_struct *__switch_to(struct task_struct *prev,
- struct task_struct *next);
+ struct task_struct *next);
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);

/* data that is pointed to by thread.sp */
struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+#else
+ unsigned long si;
+ unsigned long di;
+#endif
+ unsigned long bx;
unsigned long bp;
+ unsigned long ret_addr;
};

-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movl %P[task_canary](%[next]), %%ebx\n\t" \
- "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam \
- , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
+struct fork_frame {
+ struct inactive_task_frame frame;
+ struct pt_regs regs;
+};

-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
#define switch_to(prev, next, last) \
do { \
- /* \
- * Context-switching clobbers all registers, so we clobber \
- * them explicitly, via unused output variables. \
- * (EAX and EBP is not listed because EBP is saved/restored \
- * explicitly for wchan access and EAX is the return value of \
- * __switch_to()) \
- */ \
- unsigned long ebx, ecx, edx, esi, edi; \
- \
- asm volatile("pushl %%ebp\n\t" /* save EBP */ \
- "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
- "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
- "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
- "pushl %[next_ip]\n\t" /* restore EIP */ \
- __switch_canary \
- "jmp __switch_to\n" /* regparm call */ \
- "1:\t" \
- "popl %%ebp\n\t" /* restore EBP */ \
- \
- /* output parameters */ \
- : [prev_sp] "=m" (prev->thread.sp), \
- [prev_ip] "=m" (prev->thread.ip), \
- "=a" (last), \
- \
- /* clobbered output registers: */ \
- "=b" (ebx), "=c" (ecx), "=d" (edx), \
- "=S" (esi), "=D" (edi) \
- \
- __switch_canary_oparam \
- \
- /* input parameters: */ \
- : [next_sp] "m" (next->thread.sp), \
- [next_ip] "m" (next->thread.ip), \
- \
- /* regparm parameters for __switch_to(): */ \
- [prev] "a" (prev), \
- [next] "d" (next) \
- \
- __switch_canary_iparam \
- \
- : /* reloaded segment registers */ \
- "memory"); \
+ ((last) = __switch_to_asm((prev), (next))); \
} while (0)

-#else /* CONFIG_X86_32 */
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
-
-#define __EXTRA_CLOBBER \
- , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15", "flags"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movq %P[task_canary](%%rsi),%%r8\n\t" \
- "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam \
- , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
- * has no effect.
- */
-#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
- "call __switch_to\n\t" \
- "movq "__percpu_arg([current_task])",%%rsi\n\t" \
- __switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
- "movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
- "jnz ret_from_fork\n\t" \
- RESTORE_CONTEXT \
- : "=a" (last) \
- __switch_canary_oparam \
- : [next] "S" (next), [prev] "D" (prev), \
- [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (current_task) \
- __switch_canary_iparam \
- : "memory", "cc" __EXTRA_CLOBBER)
-
-#endif /* CONFIG_X86_32 */
-
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8b7c8d8..494c4b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,7 +95,6 @@ struct thread_info {
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
-#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
@@ -119,7 +118,6 @@ struct thread_info {
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
-#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2bd5c6f..db3a0af 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -29,6 +29,12 @@

void common(void) {
BLANK();
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ecdc1d2..880aa09 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -57,6 +57,11 @@ void foo(void)
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));

+#ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d875f97..210927e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,11 @@ int main(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();

+#ifdef CONFIG_CC_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d86be29..4bedbc0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -133,17 +133,20 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
struct task_struct *tsk;
int err;

- p->thread.sp = (unsigned long) childregs;
+ frame->bp = 0;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- p->thread.ip = (unsigned long) ret_from_kernel_thread;
+ frame->ret_addr = (unsigned long) ret_from_kernel_thread;
task_user_gs(p) = __KERNEL_STACK_CANARY;
childregs->ds = __USER_DS;
childregs->es = __USER_DS;
@@ -161,7 +164,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (sp)
childregs->sp = sp;

- p->thread.ip = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());

p->thread.io_bitmap_ptr = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 63236d8..827eeed 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
{
int err;
struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
struct task_struct *me = current;

p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
- p->thread.sp = (unsigned long) childregs;
- set_tsk_thread_flag(p, TIF_FORK);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;

savesegment(gs, p->thread.gsindex);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 067de61..a3fefdf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -935,7 +935,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
#else
- clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
}
--
2.5.5