x86-64: Maintain 16-byte stack alignment

From: Herbert Xu
Date: Tue Jan 10 2017 - 09:34:06 EST


I recently applied the patch

https://patchwork.kernel.org/patch/9468391/

and ended up with a boot crash when it tried to run the x86 chacha20
code. It turned out that the patch changed a manually aligned
stack buffer to one that is aligned by gcc. What was happening was
that gcc can stack align to any value on x86-64 except 16. The
reason is that gcc assumes that the stack is always 16-byte aligned,
which is not actually the case in the kernel.

The x86-64 CPU actually tries to keep the stack 16-byte aligned,
e.g., it'll do so when an IRQ comes in. So the reason it doesn't
work in the kernel mostly comes down to the fact that the struct
pt_regs which lives near the top of the stack is 168 bytes which
is not a multiple of 16.

This patch tries to fix this by adding an 8-byte padding at the
top of the call-chain involving pt_regs so that when we call a C
function later we do so with an aligned stack.

The same problem probably exists on i386 too since gcc also assumes
16-byte alignment there. It's harder to fix however as the CPU
doesn't help us in the IRQ case.

Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d3..29d3bcb 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -59,39 +59,42 @@
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
+ *
+ * Note we add 8 extra bytes at the beginning to preserve stack alignment.
*/
-#define R15 0*8
-#define R14 1*8
-#define R13 2*8
-#define R12 3*8
-#define RBP 4*8
-#define RBX 5*8
+#define R15 1*8
+#define R14 2*8
+#define R13 3*8
+#define R12 4*8
+#define RBP 5*8
+#define RBX 6*8
/* These regs are callee-clobbered. Always saved on kernel entry. */
-#define R11 6*8
-#define R10 7*8
-#define R9 8*8
-#define R8 9*8
-#define RAX 10*8
-#define RCX 11*8
-#define RDX 12*8
-#define RSI 13*8
-#define RDI 14*8
+#define R11 7*8
+#define R10 8*8
+#define R9 9*8
+#define R8 10*8
+#define RAX 11*8
+#define RCX 12*8
+#define RDX 13*8
+#define RSI 14*8
+#define RDI 15*8
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
-#define ORIG_RAX 15*8
+#define ORIG_RAX 16*8
/* Return frame for iretq */
-#define RIP 16*8
-#define CS 17*8
-#define EFLAGS 18*8
-#define RSP 19*8
-#define SS 20*8
+#define RIP 17*8
+#define CS 18*8
+#define EFLAGS 19*8
+#define RSP 20*8
+#define SS 21*8

+/* Note that this excludes the 8-byte padding. */
#define SIZEOF_PTREGS 21*8

.macro ALLOC_PT_GPREGS_ON_STACK
- addq $-(15*8), %rsp
+ addq $-(16*8), %rsp
.endm

.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
@@ -114,7 +117,7 @@
movq %rdi, 14*8+\offset(%rsp)
.endm
.macro SAVE_C_REGS offset=0
- SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+ SAVE_C_REGS_HELPER 8+\offset, 1, 1, 1, 1
.endm
.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
@@ -130,43 +133,43 @@
.endm

.macro SAVE_EXTRA_REGS offset=0
- movq %r15, 0*8+\offset(%rsp)
- movq %r14, 1*8+\offset(%rsp)
- movq %r13, 2*8+\offset(%rsp)
- movq %r12, 3*8+\offset(%rsp)
- movq %rbp, 4*8+\offset(%rsp)
- movq %rbx, 5*8+\offset(%rsp)
+ movq %r15, 1*8+\offset(%rsp)
+ movq %r14, 2*8+\offset(%rsp)
+ movq %r13, 3*8+\offset(%rsp)
+ movq %r12, 4*8+\offset(%rsp)
+ movq %rbp, 5*8+\offset(%rsp)
+ movq %rbx, 6*8+\offset(%rsp)
.endm

.macro RESTORE_EXTRA_REGS offset=0
- movq 0*8+\offset(%rsp), %r15
- movq 1*8+\offset(%rsp), %r14
- movq 2*8+\offset(%rsp), %r13
- movq 3*8+\offset(%rsp), %r12
- movq 4*8+\offset(%rsp), %rbp
- movq 5*8+\offset(%rsp), %rbx
+ movq 1*8+\offset(%rsp), %r15
+ movq 2*8+\offset(%rsp), %r14
+ movq 3*8+\offset(%rsp), %r13
+ movq 4*8+\offset(%rsp), %r12
+ movq 5*8+\offset(%rsp), %rbp
+ movq 6*8+\offset(%rsp), %rbx
.endm

.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
- movq 6*8(%rsp), %r11
+ movq 7*8(%rsp), %r11
.endif
.if \rstor_r8910
- movq 7*8(%rsp), %r10
- movq 8*8(%rsp), %r9
- movq 9*8(%rsp), %r8
+ movq 8*8(%rsp), %r10
+ movq 9*8(%rsp), %r9
+ movq 10*8(%rsp), %r8
.endif
.if \rstor_rax
- movq 10*8(%rsp), %rax
+ movq 11*8(%rsp), %rax
.endif
.if \rstor_rcx
- movq 11*8(%rsp), %rcx
+ movq 12*8(%rsp), %rcx
.endif
.if \rstor_rdx
- movq 12*8(%rsp), %rdx
+ movq 13*8(%rsp), %rdx
.endif
- movq 13*8(%rsp), %rsi
- movq 14*8(%rsp), %rdi
+ movq 14*8(%rsp), %rsi
+ movq 15*8(%rsp), %rdi
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
@@ -185,7 +188,7 @@
.endm

.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
- subq $-(15*8+\addskip), %rsp
+ subq $-(16*8+\addskip), %rsp
.endm

.macro icebp
@@ -203,11 +206,7 @@
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
#ifdef CONFIG_FRAME_POINTER
- .if \ptregs_offset
- leaq \ptregs_offset(%rsp), %rbp
- .else
- mov %rsp, %rbp
- .endif
+ leaq 8+\ptregs_offset(%rsp), %rbp
orq $0x1, %rbp
#endif
.endm
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5b21970..880bbb8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -168,7 +168,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+ sub $(7*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */

/*
* If we need to do entry work or if we guess we'll need to do
@@ -234,14 +234,14 @@ entry_SYSCALL_64_fastpath:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
jmp return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
/* IRQs are off. */
SAVE_EXTRA_REGS
- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call do_syscall_64 /* returns with IRQs disabled */

return_from_SYSCALL_64:
@@ -342,9 +342,9 @@ ENTRY(stub_ptregs_64)
* Called from fast path -- disable IRQs again, pop return address
* and jump to slow path
*/
+ popq %rax
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- popq %rax
jmp entry_SYSCALL64_slow_path

1:
@@ -409,13 +409,14 @@ END(__switch_to_asm)
*/
ENTRY(ret_from_fork)
movq %rax, %rdi
+ subq $8, %rsp
call schedule_tail /* rdi: 'prev' task parameter */

testq %rbx, %rbx /* from kernel_thread? */
jnz 1f /* kernel threads are uncommon */

2:
- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
@@ -494,10 +495,12 @@ END(irq_entries_start)
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
- movq %rsp, %rdi
+ movq %rsp, %rax
+ leaq 8(%rsp), %rdi
incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rdi
+ sub $8, %rsp
+ pushq %rax
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF

@@ -527,7 +530,7 @@ ret_from_intr:

/* Interrupt came from user space */
GLOBAL(retint_user)
- mov %rsp,%rdi
+ leaq 8(%rsp), %rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWAPGS
@@ -774,7 +777,7 @@ ENTRY(\sym)
.endif
.endif

- movq %rsp, %rdi /* pt_regs pointer */
+ leaq 8(%rsp), %rdi /* pt_regs pointer */

.if \has_error_code
movq ORIG_RAX(%rsp), %rsi /* get error code */
@@ -810,11 +813,11 @@ ENTRY(\sym)
call error_entry


- movq %rsp, %rdi /* pt_regs pointer */
+ leaq 8(%rsp), %rdi /* pt_regs pointer */
call sync_regs
- movq %rax, %rsp /* switch stack */
+ leaq -8(%rax), %rsp /* switch stack */

- movq %rsp, %rdi /* pt_regs pointer */
+ movq %rax, %rdi /* pt_regs pointer */

.if \has_error_code
movq ORIG_RAX(%rsp), %rsi /* get error code */
@@ -895,6 +898,7 @@ ENTRY(do_softirq_own_stack)
mov %rsp, %rbp
incl PER_CPU_VAR(irq_count)
cmove PER_CPU_VAR(irq_stack_ptr), %rsp
+ sub $8, %rsp
push %rbp /* frame pointer backlink */
call __do_softirq
leaveq
@@ -924,10 +928,11 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
* see the correct pointer to the pt_regs
*/
- movq %rdi, %rsp /* we don't return, adjust the stack frame */
+ leaq -8(%rdi), %rsp /* we don't return, adjust the stack frame */
11: incl PER_CPU_VAR(irq_count)
movq %rsp, %rbp
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ subq $8, %rsp
pushq %rbp /* frame pointer backlink */
call xen_evtchn_do_upcall
popq %rsp
@@ -1264,6 +1269,7 @@ ENTRY(nmi)
*/

movq %rsp, %rdi
+ subq $8, %rsp
movq $-1, %rsi
call do_nmi

@@ -1475,7 +1481,7 @@ end_repeat_nmi:
call paranoid_entry

/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
movq $-1, %rsi
call do_nmi

@@ -1519,7 +1525,7 @@ ENTRY(rewind_stack_do_exit)
xorl %ebp, %ebp

movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+ leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE-8(%rax), %rsp

call do_exit
1: jmp 1b
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721da..7d3f1e3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -89,6 +89,8 @@ ENTRY(entry_SYSENTER_compat)
pushq $0 /* pt_regs->r13 = 0 */
pushq $0 /* pt_regs->r14 = 0 */
pushq $0 /* pt_regs->r15 = 0 */
+
+ subq $8, %rsp
cld

/*
@@ -120,7 +122,7 @@ ENTRY(entry_SYSENTER_compat)
*/
TRACE_IRQS_OFF

- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
@@ -215,13 +217,15 @@ ENTRY(entry_SYSCALL_compat)
pushq $0 /* pt_regs->r14 = 0 */
pushq $0 /* pt_regs->r15 = 0 */

+ subq $8, %rsp
+
/*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
TRACE_IRQS_OFF

- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
@@ -324,6 +328,8 @@ ENTRY(entry_INT80_compat)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+
+ subq $8, %rsp
cld

/*
@@ -332,7 +338,7 @@ ENTRY(entry_INT80_compat)
*/
TRACE_IRQS_OFF

- movq %rsp, %rdi
+ leaq 8(%rsp), %rdi
call do_int80_syscall_32
.Lsyscall_32_done:

diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index be36bf4..3c80aac 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -33,6 +33,7 @@
movq 8(%rbp), %rdi
.endif

+ sub $8, %rsp
call \func
jmp .L_restore
_ASM_NOKPROBE(\name)
@@ -58,6 +59,7 @@
|| defined(CONFIG_DEBUG_LOCK_ALLOC) \
|| defined(CONFIG_PREEMPT)
.L_restore:
+ add $8, %rsp
popq %r11
popq %r10
popq %r9
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b467b14..d03ab72 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -384,6 +384,8 @@ early_idt_handler_common:
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */

+ sub $8, %rsp
+
cmpq $14,%rsi /* Page fault? */
jnz 10f
GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */
@@ -392,7 +394,7 @@ early_idt_handler_common:
jz 20f /* All good */

10:
- movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
+ leaq 8(%rsp), %rdi /* RDI = pt_regs; RSI is already trapnr */
call early_fixup_exception

20:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf0c6d0..2af9f81 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -590,6 +590,7 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)

struct bad_iret_stack {
void *error_entry_ret;
+ void *padding;
struct pt_regs regs;
};

--
Email: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt