[RFC 22/30] x86/entry/compat: Implement opportunistic SYSRETL for compat syscalls

From: Andy Lutomirski
Date: Tue Sep 01 2015 - 18:46:40 EST


If CS, SS and IP are as expected and FLAGS is compatible with SYSRETL,
then return from fast compat syscalls (both SYSCALL and SYSENTER) using
SYSRETL.

Unlike native 64-bit opportunistic SYSRET, this is not invisible to
user code: RCX and R8-R15 end up in a different state than the state
saved in pt_regs. To compensate, we only do this when returning to
the vDSO fast syscall return path. This won't interfere with
syscall restart, as we won't use SYSRETL when returning to the INT80
restart instruction.

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
arch/x86/entry/common.c | 23 +++++++++++++++++++---
arch/x86/entry/entry_64_compat.S | 42 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 5725cdcec4de..9182c69f860b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -363,7 +363,8 @@ __visible void do_int80_syscall_32(struct pt_regs *regs)
syscall_return_slowpath(regs);
}

-__visible void do_fast_syscall_32(struct pt_regs *regs)
+/* Returns 0 to return using IRET or 1 to return using SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER calling convention.
@@ -391,12 +392,28 @@ __visible void do_fast_syscall_32(struct pt_regs *regs)
enter_from_user_mode();
#endif
prepare_exit_to_usermode(regs);
- return;
+ return 0; /* Keep it simple: use IRET. */
}
local_irq_disable();

/* Now this is just like a normal syscall. */
do_int80_syscall_32(regs);
- return;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+ * SYSRETL is available on all 64-bit CPUs, so we don't need to
+ * bother with SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ */
+ return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+ return 0;
+#endif
}
#endif
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 1c8ac2e64a1e..db0babfec0d9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -115,7 +115,9 @@ sysenter_flags_fixed:

movq %rsp, %rdi
call do_fast_syscall_32
- jmp .Lsyscall_32_done
+ testl %eax, %eax
+ jz .Lsyscall_32_done
+ jmp sysret32_from_system_call

sysenter_fix_flags:
pushq $X86_EFLAGS_FIXED
@@ -192,7 +194,43 @@ ENTRY(entry_SYSCALL_compat)

movq %rsp, %rdi
call do_fast_syscall_32
- jmp .Lsyscall_32_done
+ testl %eax, %eax
+ jz .Lsyscall_32_done
+
+ /* Opportunistic SYSRET */
+sysret32_from_system_call:
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
+ *
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+ movq RSP-ORIG_RAX(%rsp), %rsp
+ USERGS_SYSRET32
END(entry_SYSCALL_compat)

/*
--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/