Re: [PATCH urgent v2] x86, asm: Disable opportunistic SYSRET if regs->flags has TF set

From: Denys Vlasenko
Date: Thu Apr 02 2015 - 11:49:17 EST


On 04/02/2015 02:59 PM, Denys Vlasenko wrote:
> On 04/02/2015 02:31 PM, Ingo Molnar wrote:
>> - we can optimize in a more directed fashion - like here
>>
>> ... while the downsides are:
>>
>> - more code
>> - a (small) chance of a fix going to one path while not the other.
>>
>> How much extra code would it be?
>
> A screenful or two.

I took a stab at it:

text data bss dec hex filename
12530 0 0 12530 30f2 entry_64.o2
12562 0 0 12562 3112 entry_64.o

The patch does two steps:

(1) copy-pastes "retint_swapgs:" code into syscall handling code,
the copy is under "syscall_return:" label.

(2) remove "opportunistic sysret" code from "retint_swapgs" code block,
since now it won't be reached by syscall return. This in fact removes
most of the code in question.

Lightly run-tested so far.

Ingo, do you want this in a proper patch form?
--
vda


diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7bc097a..5ea2dd1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return

/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
+
+syscall_return:
+ /* The iretq could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- sysret checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ CFI_REMEMBER_STATE
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_args
CFI_ENDPROC
END(system_call)

+
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
@@ -672,74 +749,6 @@ retint_swapgs: /* return to user-space */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ

- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq RCX(%rsp),%rcx
- cmpq %rcx,RIP(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- sysret checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq R11(%rsp),%r11
- cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- /*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
- *
- * movq $stuck_here,%rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-irq_return_via_sysret:
- CFI_REMEMBER_STATE
- /* r11 is already restored (see code above) */
- RESTORE_C_REGS_EXCEPT_R11
- movq RSP(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
SWAPGS
jmp restore_args

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/