Re: [RFC][PATCH 1/2] x86: Allow breakpoints to emulate call functions

From: Linus Torvalds
Date: Sat May 04 2019 - 15:05:10 EST


On Fri, May 3, 2019 at 10:08 PM Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> I'll look at it tomorrow, but I think this actually makes unnecessary changes.
>
> In particular, I think we could keep the existing entry code almost unchanged with this whole approach.

So here's what I *think* should work. Note that I also removed your
test-case code, because it really didn't have a chance in hell of
working. Doing that

int3_emulate_call(regs, (unsigned long)&int3_magic);

inside of int3_exception_notify() could not possibly be valid, since
int3_emulate_call() returns the new pt_regs that need to be used, and
throwing it away is clearly wrong.

So you can't use a register_die_notifier() to try to intercept the
'int3' error and then do it manually, it needs to be done by the
ftrace_int3_handler() code that actually returns the new regs, and
where do_kernel_int3() will then return it to the low-level handler.

End result: I haven't actually tested this code, but I've looked
through the patch something like ten times without finding any new
errors.

I've also tried *very* hard to make the patch minimal, with the
exception of the comments, which I tried to make extensive for any of
the subtle cases.

But without testing, it's probably still buggy.

I have to say, I finally like the end result here. Maybe it's because
I got to make my mark and pee in the snow, but I will say that

(a) the actual entry code modifications really are minimal now

(b) the instruction emulation really is very simple and straightforward

(c) yes, we play some stack tricks (and yes, we play them differently
on x86-64 and x86-32), but the tricks are again at least
straightforward, and we never really change the layout of any stack.

So on the whole, I think this is about as good as it gets. Did I get
all the details actually right, and it _works_? I guess we'll see.

Linus
arch/x86/entry/entry_32.S | 7 +++-
arch/x86/entry/entry_64.S | 14 ++++++--
arch/x86/include/asm/ftrace.h | 2 +-
arch/x86/include/asm/text-patching.h | 62 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/ftrace.c | 29 ++++++++++++++---
arch/x86/kernel/traps.c | 15 ++++++---
6 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d309f30cf7af..138ac432221b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1483,7 +1483,12 @@ ENTRY(int3)
TRACE_IRQS_OFF
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
- call do_int3
+
+ # make room on kernel stack for push emulation
+ # do_kernel_int3 returns possibly updated pt_regs
+ subl $8, %esp
+ call do_kernel_int3
+ movl %eax, %esp
jmp ret_from_exception
END(int3)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..834ec1397dab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
* @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8

@@ -899,6 +899,16 @@ ENTRY(\sym)
jnz .Lfrom_usermode_switch_stack_\@
.endif

+ .if \create_gap == 1
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
.if \paranoid
call paranoid_entry
.else
@@ -1130,7 +1140,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
#endif /* CONFIG_HYPERV */

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0
+idtentry int3 do_int3 has_error_code=0 create_gap=1
idtentry stack_segment do_stack_segment has_error_code=1

#ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index cf350639e76d..4b335ac5afcc 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -37,7 +37,7 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};

-int ftrace_int3_handler(struct pt_regs *regs);
+int ftrace_int3_handler(struct pt_regs **regs);

#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..87fad9f5c31b 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -39,4 +39,66 @@ extern int poke_int3_handler(struct pt_regs *regs);
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
extern int after_bootmem;

+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+/*
+ * The 32-bit kernel entry exception stack does not contain the whole
+ * 'struct pt_regs', because the hardware will not push sp/ss when the
+ * protection level doesn't change.
+ *
+ * So pushing a value onto the stack requires us to make room for it by
+ * moving the whole truncated 'pt_regs' down by four bytes, and the
+ * stack we return to will be directly after it. The lowlevel x86-32
+ * entry code will have made room on the stack for this, so we're not
+ * overwriting anything else there.
+ *
+ * On x86-64, the exception stack is much simpler, and always contains
+ * sp/ss, so we can just update the values directly. Again, the entry
+ * code has made sure there is a gap (now above pt_regs!) that this is ok.
+ */
+static inline struct pt_regs *int3_emulate_push(struct pt_regs *regs, unsigned long value)
+{
+#ifdef CONFIG_X86_32
+#define SAVED_KERNEL_REGS_SIZE (offsetof(struct pt_regs, sp))
+ struct pt_regs *new = (void *)regs - sizeof(long);
+ memmove(new, regs, SAVED_KERNEL_REGS_SIZE);
+
+ /*
+ * NOTE! '&new->sp' is actually the top of stack that that 'int3'
+ * will return to! So despite what this looks like, this doesn't
+ * update the stack pointer, but writing the value to 'new->sp'
+ * is actually writing the value *to* the top of the stack in
+ * the returning context!
+ *
+ * The thing that updates the stack pointer in the context we
+ * return to is the fact that we moved 'struct pt_regs' itself,
+ * and will be returning using that moved stack frame.
+ */
+ new->sp = value;
+ return new;
+#else
+ unsigned long *sp;
+
+ regs->sp -= sizeof(long);
+ sp = (unsigned long *)regs->sp;
+ *sp = value;
+ return regs;
+#endif
+}
+
+static inline struct pt_regs *int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+ unsigned long next_ip = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE;
+
+ regs = int3_emulate_push(regs, next_ip);
+ regs->ip = func;
+ return regs;
+}
+
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..6c239fca99d0 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
+#include <asm/text-patching.h>

#ifdef CONFIG_DYNAMIC_FTRACE

@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
}

static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;

static int update_ftrace_func(unsigned long ip, void *new)
{
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;

+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);

@@ -287,20 +291,32 @@ static nokprobe_inline int is_ftrace_caller(unsigned long ip)
* call to a nop. While the change is taking place, we treat
* it just like it was a nop.
*/
-int ftrace_int3_handler(struct pt_regs *regs)
+int ftrace_int3_handler(struct pt_regs **pregs)
{
+ struct pt_regs *regs = *pregs;
unsigned long ip;

if (WARN_ON_ONCE(!regs))
return 0;

- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
+ if (user_mode(regs))
return 0;

- regs->ip += MCOUNT_INSN_SIZE - 1;
+ ip = regs->ip - INT3_INSN_SIZE;

- return 1;
+ if (ftrace_location(ip)) {
+ *pregs = int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ if (!ftrace_update_func_call) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+ *pregs = int3_emulate_call(regs, ftrace_update_func_call);
+ return 1;
+ }
+
+ return 0;
}
NOKPROBE_SYMBOL(ftrace_int3_handler);

@@ -859,6 +875,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)

func = ftrace_ops_get_func(ops);

+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +978,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;

+ ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);

return update_ftrace_func(ip, new);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d26f9e9c3d83..e811e55ab030 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,7 +570,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
NOKPROBE_SYMBOL(do_general_protection);

-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage struct pt_regs * notrace do_kernel_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
/*
@@ -578,11 +578,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
* See note by declaration of modifying_ftrace_code in ftrace.c
*/
if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
+ ftrace_int3_handler(&regs))
+ return regs;
#endif
if (poke_int3_handler(regs))
- return;
+ return regs;

/*
* Use ist_enter despite the fact that we don't use an IST stack.
@@ -614,6 +614,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)

exit:
ist_exit(regs);
+ return regs;
+}
+NOKPROBE_SYMBOL(do_kernel_int3);
+
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+ WARN_ON_ONCE(do_kernel_int3(regs, error_code) != regs);
}
NOKPROBE_SYMBOL(do_int3);