Re: [RFC][PATCH 1/2] x86: Allow breakpoints to emulate call functions

From: Peter Zijlstra
Date: Mon May 06 2019 - 04:21:13 EST


On Fri, May 03, 2019 at 11:57:22AM -0700, Linus Torvalds wrote:
> On Fri, May 3, 2019 at 9:21 AM Andy Lutomirski <luto@xxxxxxxxxxxxxx> wrote:
> >
> > So hereâs a somewhat nutty suggestion: how about we tweak the 32-bit
> > entry code to emulate the sane 64-bit frame, not just for int3 but
> > always?
>
> What would the code actually end up looking like? I don't necessarily
> object, since that kernel_stack_pointer() thing certainly looks
> horrible, but honestly, my suggestion to just pass in the 'struct
> pt_regs' and let the call emulation fix it up would have also worked,
> and avoided that bug (and who knows what else might be hiding).
>
> I really think that you're now hitting all the special case magic
> low-level crap that I wanted to avoid.

This did actually boot on first try; so there must be something horribly
wrong...

Now, I know you like that other approach; but I figured I should at
least show you what this one looks like. Maybe I've been staring at
entry_32.S too much, but I really don't dislike this.

---
arch/x86/entry/entry_32.S | 150 +++++++++++++++++++++++++++++------
arch/x86/entry/entry_64.S | 14 +++-
arch/x86/include/asm/ptrace.h | 4 -
arch/x86/include/asm/text-patching.h | 20 +++++
arch/x86/kernel/alternative.c | 81 ++++++++++++++++++-
arch/x86/kernel/ptrace.c | 29 -------
6 files changed, 235 insertions(+), 63 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7b23431be5cb..d29cf03219c5 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -67,9 +67,20 @@
# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
# define preempt_stop(clobbers)
-# define resume_kernel restore_all_kernel
#endif

+.macro RETINT_PREEMPT
+#ifdef CONFIG_PREEMPT
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz .Lend_\@
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz .Lend_\@
+ call preempt_schedule_irq
+.Lend_\@:
+#endif
+.endm
+
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
@@ -203,8 +214,119 @@
.Lend_\@:
.endm

+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+#define CS_FROM_KERNEL (1 << 29)
+
+.macro FIXUP_FRAME
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 3*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $SEGMENT_RPL_MASK, 2*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+
+ orl $CS_FROM_KERNEL, 2*4(%esp)
+
+ /*
+ * When we're here from kernel mode; the (exception) stack looks like:
+ *
+ * 4*4(%esp) - <previous context>
+ * 3*4(%esp) - flags
+ * 2*4(%esp) - cs
+ * 1*4(%esp) - ip
+ * 0*4(%esp) - orig_eax
+ *
+ * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 4 enties as gap:
+ *
+ * 10*4(%esp) - <previous context>
+ * 9*4(%esp) - gap / flags
+ * 8*4(%esp) - gap / cs
+ * 7*4(%esp) - gap / ip
+ * 6*4(%esp) - gap / orig_eax
+ * 5*4(%esp) - ss
+ * 4*4(%esp) - sp
+ * 3*4(%esp) - flags
+ * 2*4(%esp) - cs
+ * 1*4(%esp) - ip
+ * 0*4(%esp) - orig_eax
+ */
+
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ add $5*4, (%esp) # point sp back at the previous context
+ pushl 5*4(%esp) # flags
+ pushl 5*4(%esp) # cs
+ pushl 5*4(%esp) # ip
+ pushl 5*4(%esp) # orig_eax
+
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+
+ /* orig_eax is already POP'ed when we're here */
+
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ pushl %eax
+
+ lea 10*4(%esp), %eax # address of <previous context>
+ cmpl %eax, 4*4(%esp) # if ->sp is unmodified
+ jnz .Lmodified_sp_do_fixup_\@
+
+ /*
+ * Fast path; regs->sp wasn't modified, reuse the original IRET frame.
+ */
+ pop %eax
+ add $6*4, %esp
+ jmp .Lfinished_frame_\@;
+
+.Lmodified_sp_do_fixup_\@:
+
+ /*
+ * Reconstruct the 3 entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack.
+ */
+ pushl %ecx
+ movl 5*4(%esp), %eax # (modified) regs->sp
+
+ movl 4*4(%esp), %ecx # flags
+ movl %ecx, -4(%eax)
+
+ movl 3*4(%esp), %ecx # cs
+ andl $0x0000ffff, %ecx
+ movl %ecx, -8(%eax)
+
+ movl 2*4(%esp), %ecx # ip
+ movl %ecx, -12(%eax)
+
+ movl 1*4(%esp), %ecx # eax
+ movl %ecx, -16(%eax)
+
+ popl %ecx
+ lea -16(%eax), %esp
+ popl %eax
+
+.Lfinished_frame_\@:
+.endm
+
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+
cld
+ /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $(0x0000ffff), 2*4(%esp)
+
+ FIXUP_FRAME
+
PUSH_GS
pushl %fs
pushl %es
@@ -375,9 +497,6 @@
* switch to it before we do any copying.
*/

-#define CS_FROM_ENTRY_STACK (1 << 31)
-#define CS_FROM_USER_CR3 (1 << 30)
-
.macro SWITCH_TO_KERNEL_STACK

ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -391,13 +510,6 @@
* that register for the time this macro runs
*/

- /*
- * The high bits of the CS dword (__csh) are used for
- * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
- * hardware didn't do this for us.
- */
- andl $(0x0000ffff), PT_CS(%esp)
-
/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -755,7 +867,7 @@ END(ret_from_fork)
andl $SEGMENT_RPL_MASK, %eax
#endif
cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
+ jb restore_all_kernel # not returning to v8086 or userspace

ENTRY(resume_userspace)
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -765,18 +877,6 @@ ENTRY(resume_userspace)
jmp restore_all
END(ret_from_exception)

-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz restore_all_kernel
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all_kernel
- call preempt_schedule_irq
- jmp restore_all_kernel
-END(resume_kernel)
-#endif
-
GLOBAL(__begin_SYSENTER_singlestep_region)
/*
* All code from here through __end_SYSENTER_singlestep_region is subject
@@ -1019,6 +1119,7 @@ ENTRY(entry_INT80_32)
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
.Lirq_return:
+ IRET_FRAME
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
* when returning from IPI handler and when returning from
@@ -1027,6 +1128,7 @@ ENTRY(entry_INT80_32)
INTERRUPT_RETURN

restore_all_kernel:
+ RETINT_PREEMPT
TRACE_IRQS_IRET
PARANOID_EXIT_TO_KERNEL_MODE
BUG_IF_WRONG_CR3
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 20e45d9b4e15..268cd9affe04 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
* @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8

@@ -898,6 +898,16 @@ ENTRY(\sym)
jnz .Lfrom_usermode_switch_stack_\@
.endif

+ .if \create_gap == 1
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
.if \paranoid
call paranoid_entry
.else
@@ -1129,7 +1139,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
#endif /* CONFIG_HYPERV */

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3 do_int3 has_error_code=0
+idtentry int3 do_int3 has_error_code=0 create_gap=1
idtentry stack_segment do_stack_segment has_error_code=1

#ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 8a7fc0cca2d1..5ff42dc8b396 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -166,14 +166,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif

-#ifdef CONFIG_X86_32
-extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
-#else
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
-#endif

#define GET_IP(regs) ((regs)->ip)
#define GET_FP(regs) ((regs)->bp)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index c90678fd391a..6aac6abf931e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -42,4 +42,24 @@ extern int after_bootmem;
extern __ro_after_init struct mm_struct *poking_mm;
extern __ro_after_init unsigned long poking_addr;

+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+ regs->sp -= sizeof(unsigned long);
+ *(unsigned long *)regs->sp = val;
+}
+
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+ int3_emulate_jmp(regs, func);
+}
+
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4db9c0d29bc1..9519bc553d6d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,83 @@ extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */

+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+static void __init int3_magic(unsigned int *ptr)
+{
+ *ptr = 1;
+}
+
+extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
+
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
+
+ if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
+ return NOTIFY_DONE;
+
+ int3_emulate_call(regs, (unsigned long)&int3_magic);
+ return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_magic(&val); but really complicated :-)
+ *
+ * Stick the address of the INT3 instruction into int3_selftest_ip,
+ * then trigger the INT3, padded with NOPs to match a CALL instruction
+ * length.
+ */
+ asm volatile ("1: int3; nop; nop; nop; nop\n\t"
+ ".pushsection .init.data,\"aw\"\n\t"
+ ".align " __ASM_SEL(4, 8) "\n\t"
+ ".type int3_selftest_ip, @object\n\t"
+ ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
+ "int3_selftest_ip:\n\t"
+ __ASM_SEL(.long, .quad) " 1b\n\t"
+ ".popsection\n\t"
+ : : __ASM_SEL_RAW(a, D) (&val) : "memory");
+
+ BUG_ON(val != 1);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
+
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();

/*
@@ -642,10 +714,11 @@ void __init alternative_instructions(void)
_text, _etext);
}

- if (!uniproc_patched || num_possible_cpus() == 1)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
#endif

apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4b8ee05dd6ad..d13f892d2c47 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -153,35 +153,6 @@ static inline bool invalid_selector(u16 value)

#define FLAG_MASK FLAG_MASK_32

-/*
- * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
- * when it traps. The previous stack will be directly underneath the saved
- * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
- *
- * Now, if the stack is empty, '&regs->sp' is out of range. In this
- * case we try to take the previous stack. To always return a non-null
- * stack pointer we fall back to regs as stack if no previous stack
- * exists.
- *
- * This is valid only for kernel mode traps.
- */
-unsigned long kernel_stack_pointer(struct pt_regs *regs)
-{
- unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
- unsigned long sp = (unsigned long)&regs->sp;
- u32 *prev_esp;
-
- if (context == (sp & ~(THREAD_SIZE - 1)))
- return sp;
-
- prev_esp = (u32 *)(context);
- if (*prev_esp)
- return (unsigned long)*prev_esp;
-
- return (unsigned long)regs;
-}
-EXPORT_SYMBOL_GPL(kernel_stack_pointer);
-
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);