[tip: x86/urgent] x86: Rewrite ret_from_fork() in C

From: tip-bot2 for Brian Gerst
Date: Mon Jul 10 2023 - 04:15:16 EST


The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 3aec4ecb3d1f313a8ab985df7cab07c4af81f478
Gitweb: https://git.kernel.org/tip/3aec4ecb3d1f313a8ab985df7cab07c4af81f478
Author: Brian Gerst <brgerst@xxxxxxxxx>
AuthorDate: Fri, 23 Jun 2023 18:55:29 -04:00
Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Mon, 10 Jul 2023 09:52:25 +02:00

x86: Rewrite ret_from_fork() in C

When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx>
Reviewed-by: Sami Tolvanen <samitolvanen@xxxxxxxxxx>
Link: https://lkml.kernel.org/r/20230623225529.34590-3-brgerst@xxxxxxxxx
---
arch/x86/entry/entry_32.S | 30 +++++++---------------------
arch/x86/entry/entry_64.S | 33 +++++++------------------------
arch/x86/include/asm/switch_to.h | 4 +++-
arch/x86/kernel/process.c | 22 ++++++++++++++++++++-
4 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e56123f..6e6af42 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,36 +727,22 @@ SYM_CODE_END(__switch_to_asm)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
+
/* return address for the stack unwinder */
pushl $.Lsyscall_32_done

FRAME_BEGIN
- pushl %eax
- call schedule_tail
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
addl $4, %esp
FRAME_END

- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- leal 4(%esp), %eax
- call syscall_exit_to_user_mode
RET
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286..91f6818 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
- UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+ UNWIND_HINT_REGS
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */

- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork

-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305..f42dbf1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);

-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);

/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff9b80a..72015db 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;