[PATCH 3/5] x86/entry_64.S: use PUSH insns to build pt_regs on stack

From: Denys Vlasenko
Date: Thu Mar 19 2015 - 13:18:37 EST


With this change, on SYSCALL64 code path we are now populating
pt_regs->cs, pt_regs->ss and pt_regs->rcx unconditionally and
therefore don't need to do that in FIXUP_TOP_OF_STACK.

We lose a number of large insns there:

text data bss dec hex filename
13298 0 0 13298 33f2 entry_64_before.o
12978 0 0 12978 32b2 entry_64.o

What's more important, we convert two "MOVQ $imm,off(%rsp)" to "PUSH $imm"
(the ones which fill pt_regs->cs,ss).

Before this patch, placing them on fast path was slowing it down by two cycles:
this form of MOV is very large, 12 bytes, and this probably reduces decode bandwidth
to one insn per cycle when CPU sees them.

Therefore they were living in FIXUP_TOP_OF_STACK instead (away from fast path).

"PUSH $imm" is a small 2-byte insn. Moving it to fast path does not slow it down
in my measurements.

Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
CC: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
CC: Steven Rostedt <rostedt@xxxxxxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: Borislav Petkov <bp@xxxxxxxxx>
CC: "H. Peter Anvin" <hpa@xxxxxxxxx>
CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
CC: Oleg Nesterov <oleg@xxxxxxxxxx>
CC: Frederic Weisbecker <fweisbec@xxxxxxxxx>
CC: Alexei Starovoitov <ast@xxxxxxxxxxxx>
CC: Will Drewry <wad@xxxxxxxxxxxx>
CC: Kees Cook <keescook@xxxxxxxxxxxx>
CC: x86@xxxxxxxxxx
CC: linux-kernel@xxxxxxxxxxxxxxx
---

Changes since last version: reformulate old comment which was
mostly failing to explain why we don't have TRACE_IRQS_OFF/ONs
around a irq-off section in SYSCALL64 code path.

arch/x86/kernel/entry_64.S | 54 +++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ecb68d8..e9c1882 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -126,11 +126,8 @@ ENDPROC(native_usergs_sysret64)
* manipulation.
*/
.macro FIXUP_TOP_OF_STACK tmp offset=0
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq RIP+\offset(%rsp),\tmp /* get rip */
- movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
- movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */
+ /* copy flags to r11 as sysret would do */
+ movq EFLAGS+\offset(%rsp),\tmp
movq \tmp,R11+\offset(%rsp)
.endm

@@ -214,7 +211,6 @@ ENDPROC(native_usergs_sysret64)
* r9 arg5
* (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
*
- * Interrupts are off on entry.
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
@@ -228,6 +224,12 @@ ENTRY(system_call)
CFI_DEF_CFA rsp,0
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
+
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
SWAPGS_UNSAFE_STACK
/*
* A hypervisor implementation might want to use a label
@@ -236,27 +238,35 @@ ENTRY(system_call)
*/
GLOBAL(system_call_after_swapgs)

- /*
- * We use 'rsp_scratch' as a scratch register, hence this block must execute
- * atomically in the face of possible interrupt-driven task preemption,
- * so we can enable interrupts only after we're done with using rsp_scratch:
- */
movq %rsp,PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack),%rsp
- ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
- movq %rcx,RIP(%rsp)
- movq PER_CPU_VAR(rsp_scratch),%rcx
- movq %r11,EFLAGS(%rsp)
- movq %rcx,RSP(%rsp)
+
+ /* Construct struct pt_regs on stack */
+ pushq_cfi $__USER_DS /* pt_regs->ss */
+ pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
/*
- * No need to follow this irqs off/on section - it's straight
- * and short:
+ * Re-enable interrupts.
+ * We use 'rsp_scratch' as a scratch space, hence irq-off block above
+ * must execute atomically in the face of possible interrupt-driven
+ * task preemption. We must enable interrupts only after we're done
+ * with using rsp_scratch:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- movq_cfi rax,ORIG_RAX
- SAVE_C_REGS_EXCEPT_RAX_RCX_R11
- movq $-ENOSYS,RAX(%rsp)
- CFI_REL_OFFSET rip,RIP
+ pushq_cfi %r11 /* pt_regs->flags */
+ pushq_cfi $__USER_CS /* pt_regs->cs */
+ pushq_cfi %rcx /* pt_regs->ip */
+ CFI_REL_OFFSET rip,0
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rcx /* pt_regs->cx */
+ pushq_cfi $-ENOSYS /* pt_regs->ax */
+ pushq_cfi_reg r8 /* pt_regs->r8 */
+ pushq_cfi_reg r9 /* pt_regs->r9 */
+ pushq_cfi_reg r10 /* pt_regs->r10 */
+ sub $(7*8),%rsp /* pt_regs->r11,bp,bx,r12-15 not saved */
+
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
jnz tracesys
system_call_fastpath:
--
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/