[PATCH] x86, asm: Document some of the syscall asm glue

From: Borislav Petkov
Date: Wed Aug 24 2011 - 08:44:36 EST


On Tue, Aug 23, 2011 at 06:33:17PM +0100, Al Viro wrote:
> * asm glue is subtle, evil and doesn't have anywhere near enough
> documentation ;-/

I took the liberty of documenting some of your asm glue analysis in an
attempt to make the code a bit more understandable. How about the
following:

--
From: Borislav Petkov <borislav.petkov@xxxxxxx>
Date: Wed, 24 Aug 2011 14:30:43 +0200
Subject: [PATCH] x86, asm: Document some of the syscall asm glue

Document some of the asm glue around compat SYSCALL32 and do a
whitespace cleanup while at it. See linked thread below for further
reference.

Link: http://lkml.kernel.org/r/20110820011845.GC2203@xxxxxxxxxxxxxxxxxx
Signed-off-by: Borislav Petkov <borislav.petkov@xxxxxxx>
---
arch/x86/ia32/ia32entry.S | 138 ++++++++++++++++++++++++++-----------------
arch/x86/kernel/entry_64.S | 19 +++++-
2 files changed, 98 insertions(+), 59 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..8254432 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -1,16 +1,16 @@
/*
- * Compatibility mode system call entry point for x86-64.
- *
+ * Compatibility mode system call entry point for x86-64.
+ *
* Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */
+ */

#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
-#include <asm/thread_info.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <linux/linkage.h>
@@ -38,11 +38,11 @@
xchg %ecx,%esi
movl %ebx,%edi
movl %edx,%edx /* zero extension */
- .endm
+ .endm

- /* clobbers %eax */
+ /* clobbers %eax */
.macro CLEAR_RREGS offset=0, _r9=rax
- xorl %eax,%eax
+ xorl %eax,%eax
movq %rax,\offset+R11(%rsp)
movq %rax,\offset+R10(%rsp)
movq %\_r9,\offset+R9(%rsp)
@@ -69,7 +69,7 @@
movl \offset+64(%rsp),%edi
movl %eax,%eax /* zero extension */
.endm
-
+
.macro CFI_STARTPROC32 simple
CFI_STARTPROC \simple
CFI_UNDEFINED r8
@@ -106,14 +106,14 @@ ENDPROC(native_irq_enable_sysexit)
* %esi Arg4
* %edi Arg5
* %ebp user stack
- * 0(%ebp) Arg6
- *
+ * 0(%ebp) Arg6
+ *
* Interrupts off.
- *
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
* path below. Set up a complete hardware stack frame to share code
* with the int 0x80 path.
- */
+ */
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target)
* disabled irqs, here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- movl %ebp,%ebp /* zero extension */
+ movl %ebp,%ebp /* zero extension */
pushq_cfi $__USER32_DS
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rbp
@@ -144,12 +144,12 @@ ENTRY(ia32_sysenter_target)
pushq_cfi %rax
cld
SAVE_ARGS 0,1,0
- /* no need to do an access_ok check here because rbp has been
- 32bit zero extended */
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
1: movl (%rbp),%ebp
- .section __ex_table,"a"
- .quad 1b,ia32_badarg
- .previous
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -170,7 +170,7 @@ sysenter_dispatch:
sysexit_from_sys_call:
andl $~TS_COMPAT,TI_status(%r10)
/* clear IF, that popfq doesn't enable interrupts early */
- andl $~0x200,EFLAGS-R11(%rsp)
+ andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_ARGS 0,24,0,0,0,0
@@ -260,20 +260,21 @@ ENDPROC(ia32_sysenter_target)
* Arguments:
* %eax System call number.
* %ebx Arg1
- * %ecx return EIP
+ * %ecx return EIP
* %edx Arg3
* %esi Arg4
* %edi Arg5
- * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
- * %esp user stack
+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched
+ * because it is callee-saved in the 64-bit calling convention]
+ * %esp user stack
* 0(%esp) Arg6
- *
+ *
* Interrupts off.
- *
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
* path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
+ * with the int 0x80 path.
+ */
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -281,34 +282,57 @@ ENTRY(ia32_cstar_target)
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
+
+ /* stash away usermode stack ptr */
movl %esp,%r8d
CFI_REGISTER rsp,r8
movq PER_CPU_VAR(kernel_stack),%rsp
+
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,0,0
- movl %eax,%eax /* zero extension */
+ movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+
+ /* return-RIP is in %ecx when executing SYSCALL */
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+
+ /*
+ * Put Arg2 into %rcx pt_regs slot to match kernel syscall
+ * calling conventions, i.e. what INT80 would expect;
+ * this lies slightly to ptrace
+ */
+ movq %rbp,RCX-ARGOFFSET(%rsp)
movl %ebp,%ecx
movq $__USER32_CS,CS-ARGOFFSET(%rsp)
movq $__USER32_DS,SS-ARGOFFSET(%rsp)
+
+ /* rFLAGS is in %r11 when executing SYSCALL */
movq %r11,EFLAGS-ARGOFFSET(%rsp)
/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- movq %r8,RSP-ARGOFFSET(%rsp)
+
+ /* save usermode stack ptr into pt_regs */
+ movq %r8,RSP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rsp,RSP-ARGOFFSET
- /* no need to do an access_ok check here because r8 has been
- 32bit zero extended */
- /* hardware stack frame is complete now */
+
+ /*
+ * Get Arg6, which is on the usermode stack; no need to do an
+ * access_ok check here because %r8 has been 32bit zero extended.
+ * Hardware stack frame is complete now.
+ */
1: movl (%r8),%r9d
+
+ /*
+ * If accessing the usermode stack faults, fail the syscall with
+ * -EFAULT
+ */
.section __ex_table,"a"
.quad 1b,ia32_badarg
- .previous
+ .previous
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -331,7 +355,7 @@ sysretl_from_sys_call:
RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
- movl EFLAGS-ARGOFFSET(%rsp),%r11d
+ movl EFLAGS-ARGOFFSET(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
xorq %r10,%r10
xorq %r9,%r9
@@ -340,7 +364,7 @@ sysretl_from_sys_call:
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
USERGS_SYSRET32
-
+
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
CFI_RESTORE_STATE
@@ -358,6 +382,8 @@ cstar_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
jz cstar_auditsys
#endif
+
+ /* put Arg6 into %ebp where ptrace expects it */
xchgl %r9d,%ebp
SAVE_REST
CLEAR_RREGS 0, r9
@@ -366,21 +392,23 @@ cstar_tracesys:
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
RESTORE_REST
+
+ /* sync back Arg6, which ptrace may have changed, to where the C code expects it */
xchgl %ebp,%r9d
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
jmp cstar_do_call
END(ia32_cstar_target)
-
+
ia32_badarg:
movq $-EFAULT,%rax
jmp ia32_sysret
CFI_ENDPROC

-/*
- * Emulated IA32 system calls via int 0x80.
+/*
+ * Emulated IA32 system calls via int 0x80.
*
- * Arguments:
+ * Arguments:
* %eax System call number.
* %ebx Arg1
* %ecx Arg2
@@ -390,13 +418,13 @@ ia32_badarg:
* %ebp Arg6 [note: not saved in the stack frame, should not be touched]
*
* Notes:
- * Uses the same stack frame as the x86-64 version.
+ * Uses the same stack frame as the x86-64 version.
* All registers except %eax must be saved (but ptrace may violate that)
* Arguments are zero extended. For system calls that want sign extension and
* take long arguments a wrapper is needed. Most calls can just be called
* directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
+ * Assumes it is only called from user space and entered with interrupts off.
+ */

ENTRY(ia32_syscall)
CFI_STARTPROC32 simple
@@ -433,9 +461,9 @@ ia32_sysret:
movq %rax,RAX-ARGOFFSET(%rsp)
ia32_ret_from_sys_call:
CLEAR_RREGS -ARGOFFSET
- jmp int_ret_from_sys_call
+ jmp int_ret_from_sys_call

-ia32_tracesys:
+ia32_tracesys:
SAVE_REST
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
@@ -457,13 +485,13 @@ quiet_ni_syscall:
movq $-ENOSYS,%rax
ret
CFI_ENDPROC
-
+
.macro PTREGSCALL label, func, arg
.globl \label
\label:
leaq \func(%rip),%rax
leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ia32_ptregs_common
+ jmp ia32_ptregs_common
.endm

CFI_STARTPROC32
@@ -537,7 +565,7 @@ ia32_sys_call_table:
.quad quiet_ni_syscall /* old stty syscall holder */
.quad quiet_ni_syscall /* old gtty syscall holder */
.quad sys_access
- .quad sys_nice
+ .quad sys_nice
.quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
.quad sys_sync
.quad sys32_kill
@@ -616,7 +644,7 @@ ia32_sys_call_table:
.quad stub32_iopl /* 110 */
.quad sys_vhangup
.quad quiet_ni_syscall /* old "idle" system call */
- .quad sys32_vm86_warning /* vm86old */
+ .quad sys32_vm86_warning /* vm86old */
.quad compat_sys_wait4
.quad sys_swapoff /* 115 */
.quad compat_sys_sysinfo
@@ -669,7 +697,7 @@ ia32_sys_call_table:
.quad sys_mremap
.quad sys_setresuid16
.quad sys_getresuid16 /* 165 */
- .quad sys32_vm86_warning /* vm86 */
+ .quad sys32_vm86_warning /* vm86 */
.quad quiet_ni_syscall /* query_module */
.quad sys_poll
.quad compat_sys_nfsservctl
@@ -724,10 +752,10 @@ ia32_sys_call_table:
.quad sys_mincore
.quad sys_madvise
.quad compat_sys_getdents64 /* 220 getdents64 */
- .quad compat_sys_fcntl64
+ .quad compat_sys_fcntl64
.quad quiet_ni_syscall /* tux */
- .quad quiet_ni_syscall /* security */
- .quad sys_gettid
+ .quad quiet_ni_syscall /* security */
+ .quad sys_gettid
.quad sys32_readahead /* 225 */
.quad sys_setxattr
.quad sys_lsetxattr
@@ -742,7 +770,7 @@ ia32_sys_call_table:
.quad sys_lremovexattr
.quad sys_fremovexattr
.quad sys_tkill
- .quad sys_sendfile64
+ .quad sys_sendfile64
.quad compat_sys_futex /* 240 */
.quad compat_sys_sched_setaffinity
.quad compat_sys_sched_getaffinity
@@ -754,7 +782,7 @@ ia32_sys_call_table:
.quad compat_sys_io_submit
.quad sys_io_cancel
.quad sys32_fadvise64 /* 250 */
- .quad quiet_ni_syscall /* free_huge_pages */
+ .quad quiet_ni_syscall /* free_huge_pages */
.quad sys_exit_group
.quad sys32_lookup_dcookie
.quad sys_epoll_create
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6419bb0..9569f11 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -607,10 +607,16 @@ tracesys:
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
+
+ /*
+ * check the Requestor Privilege Level of the CS selector
+ * previously pushed on the stack. If 0, we're returning
+ * to kernel space.
+ */
testl $3,CS-ARGOFFSET(%rsp)
je retint_restore_args
- movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
+ movl $_TIF_ALLWORK_MASK,%edi
GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
@@ -618,11 +624,16 @@ GLOBAL(int_with_check)
andl %edi,%edx
jnz int_careful
andl $~TS_COMPAT,TI_status(%rcx)
+
+ /* no work pending, return to userspace */
jmp retint_swapgs

- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
+ /*
+ * Either reschedule or signal or syscall exit tracking
+ * needed. First do a reschedule test.
+ *
+ * edx: work, edi: workmask
+ */
int_careful:
bt $TIF_NEED_RESCHED,%edx
jnc int_very_careful
--
1.7.4
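
As a purely illustrative aside (not part of the patch): the int 0x80
convention which the ia32_syscall comment block above documents -
%eax = syscall number, %ebx/%ecx/%edx/%esi/%edi/%ebp = Arg1..Arg6 -
is the path a 32-bit task takes when it bypasses the vDSO. A minimal
32-bit userspace sketch, assembled with "as --32" and linked with
"ld -m elf_i386", would look roughly like this on a
CONFIG_IA32_EMULATION kernel (syscall numbers are the i386 ABI ones):

	.text
	.globl _start
_start:
	movl	$4, %eax	# __NR_write (i386 ABI)
	movl	$1, %ebx	# Arg1: fd = stdout
	movl	$msg, %ecx	# Arg2: buffer
	movl	$len, %edx	# Arg3: count
	int	$0x80		# lands in ia32_syscall on a 64-bit kernel

	movl	$1, %eax	# __NR_exit
	xorl	%ebx, %ebx	# Arg1: exit status = 0
	int	$0x80

	.data
msg:	.ascii	"hello via int 0x80\n"
	len = . - msg

The sysenter and SYSCALL32 fast paths above, by contrast, are normally
only reached through the vDSO's __kernel_vsyscall, which is why int 0x80
is the easiest gate to poke at by hand.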


--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/