[PATCH v4 3/9] x86/entry: Convert ret_from_fork to C

From: Andy Lutomirski
Date: Wed Mar 17 2021 - 14:13:53 EST


ret_from_fork is written in asm, slightly differently, for x86_32 and
x86_64. Convert it to C.

This is a straight conversion without any particular cleverness. As a
further cleanup, the code that sets up the ret_from_fork argument registers
could be adjusted to put the arguments in the correct registers.

This will cause the ORC unwinder to find pt_regs even for kernel threads on
x86_64. This seems harmless.

The 32-bit comment above the now-deleted schedule_tail_wrapper was
obsolete: the encode_frame_pointer mechanism (see copy_thread()) solves the
same problem more cleanly.

Cc: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
arch/x86/entry/common.c | 23 ++++++++++++++
arch/x86/entry/entry_32.S | 51 +++++---------------------------
arch/x86/entry/entry_64.S | 33 +++++----------------
arch/x86/include/asm/switch_to.h | 2 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/unwind_orc.c | 2 +-
7 files changed, 43 insertions(+), 72 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 95776f16c1cb..ef1c65938a6b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -214,6 +214,29 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}

+void ret_from_fork(struct task_struct *prev,
+ int (*kernel_thread_fn)(void *),
+ void *kernel_thread_arg,
+ struct pt_regs *user_regs);
+
+__visible void noinstr ret_from_fork(struct task_struct *prev,
+ int (*kernel_thread_fn)(void *),
+ void *kernel_thread_arg,
+ struct pt_regs *user_regs)
+{
+ instrumentation_begin();
+
+ schedule_tail(prev);
+
+ if (kernel_thread_fn) {
+ kernel_thread_fn(kernel_thread_arg);
+ user_regs->ax = 0;
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(user_regs);
+}
+
#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..7113d259727f 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -805,26 +805,6 @@ SYM_CODE_START(__switch_to_asm)
SYM_CODE_END(__switch_to_asm)
.popsection

-/*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack. This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-.pushsection .text, "ax"
-SYM_FUNC_START(schedule_tail_wrapper)
- FRAME_BEGIN
-
- pushl %eax
- call schedule_tail
- popl %eax
-
- FRAME_END
- ret
-SYM_FUNC_END(schedule_tail_wrapper)
-.popsection
-
/*
* A newly forked process directly context switches into this address.
*
@@ -833,29 +813,14 @@ SYM_FUNC_END(schedule_tail_wrapper)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
- call schedule_tail_wrapper
-
- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- movl %esp, %eax
- call syscall_exit_to_user_mode
- jmp .Lsyscall_32_done
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_START(asm_ret_from_fork)
+ movl %ebx, %edx
+ movl %edi, %ecx
+ pushl %esp
+ call ret_from_fork
+ addl $4, %esp
+ jmp .Lsyscall_32_done
+SYM_CODE_END(asm_ret_from_fork)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..0f7df8861ac1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -273,35 +273,18 @@ SYM_FUNC_END(__switch_to_asm)
* rax: prev task we switched from
* rbx: kernel thread func (NULL for user thread)
* r12: kernel thread arg
+ * rbp: encoded frame pointer for the fp unwinder
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
- UNWIND_HINT_EMPTY
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
-
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
-
-2:
+SYM_CODE_START(asm_ret_from_fork)
UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
+ movq %rax, %rdi
+ movq %rbx, %rsi
+ movq %r12, %rdx
+ movq %rsp, %rcx
+ call ret_from_fork
jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_EMPTY
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(asm_ret_from_fork)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..fcb9b02a1269 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,7 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);

-asmlinkage void ret_from_fork(void);
+asmlinkage void asm_ret_from_fork(void);

/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f6f16df04cb9..34efbca08738 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -135,7 +135,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) asm_ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4f2f54e1281c..bf8aa15ac652 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(start_thread);
* more flexibility.
*
* The return value (in %ax) will be the "prev" task after
- * the task-switch, and shows up in ret_from_fork in entry.S,
+ * the task-switch, and shows up in asm_ret_from_fork in entry_32.S,
* for example.
*/
__visible __notrace_funcgraph struct task_struct *
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..c6e7235c6d9f 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -659,7 +659,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
state->sp = task->thread.sp + sizeof(*frame);
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
- state->signal = (void *)state->ip == ret_from_fork;
+ state->signal = (void *)state->ip == asm_ret_from_fork;
}

if (get_stack_info((unsigned long *)state->sp, state->task,
--
2.30.2