Re: [RFC][PATCH 1/2] x86: Allow breakpoints to emulate call functions

From: Linus Torvalds
Date: Tue May 07 2019 - 11:01:16 EST


Duh.

I woke up this morning, realizing what was wrong with my patch.



On Mon, May 6, 2019 at 8:28 PM Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Yes. But I was looking at the ftrace parts because I didn't see the
> bug in the low-level x86 side, so...

There was nothing wrong in the *low-level* parts. There was one thing
wrong with the "in3_emulate_push()" code:

memmove(new, regs, SAVED_KERNEL_REGS_SIZE);

which ends up calling an out-of-line function. One that is traced. One
that will recursively result in 'int3'. Which will fill up the stack
until you get a triple fault and reboot.

Stupid stupid.
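
For illustration only, the same failure mode is easy to reproduce in user
space (this is an analogy, not the kernel code): a SIGTRAP handler that
itself executes an 'int3' just keeps re-entering itself until the stack
is exhausted.

    #include <signal.h>

    static void trap_handler(int sig)
    {
            (void)sig;
            /*
             * Anything executed here that traps again re-enters this
             * handler. In the kernel case that "anything" was the
             * out-of-line, traced memmove(), whose own mcount/fentry
             * site was an int3 at that moment.
             */
            __asm__ volatile("int3");
    }

    int main(void)
    {
            struct sigaction sa = { 0 };

            sa.sa_handler = trap_handler;
            sa.sa_flags = SA_NODEFER;       /* nothing blocks the re-entry */
            sigemptyset(&sa.sa_mask);
            sigaction(SIGTRAP, &sa, NULL);

            __asm__ volatile("int3");       /* the first trap starts the recursion */
            return 0;                       /* never reached: user space just dies,
                                               the kernel triple-faults instead */
    }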

Anyway, changing that to just copying things one word at a time makes
everything work. The attached patch boots with the whole ftrace test
thing. The only change is literally changing that memmove() into

/* Inlined "memmove()" of the pt_regs */
unsigned long *p = (unsigned long *) new;
int i = SAVED_KERNEL_REGS_SIZE / sizeof(unsigned long);
do { *p = p[1]; p++; } while (--i);

which I'm not entirely proud of, but it sure is still simple.
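
Just as a sanity check on that loop, here it is as a stand-alone
user-space sketch (NWORDS is only a stand-in for the patch's
SAVED_KERNEL_REGS_SIZE / sizeof(unsigned long)); copying low-to-high is
what makes the overlapping move safe when the destination sits one word
below the source:

    #include <stdio.h>

    #define NWORDS 8        /* stand-in for SAVED_KERNEL_REGS_SIZE / sizeof(long) */

    int main(void)
    {
            unsigned long buf[NWORDS + 1];
            unsigned long *new, *p;
            int i;

            for (i = 0; i <= NWORDS; i++)
                    buf[i] = 100 + i;       /* buf[1..NWORDS] plays the old "regs" */

            new = buf;                      /* one word below "regs", as in the patch */
            p = new;
            i = NWORDS;
            do { *p = p[1]; p++; } while (--i);

            for (i = 0; i < NWORDS; i++)    /* prints 101..108: the old words, each
                                               moved down by one slot */
                    printf("%lu ", new[i]);
            printf("\n");
            return 0;
    }

The loop itself makes no function call, which is the whole point; the
copy direction is the only detail you have to get right.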

And honestly, I absolutely despise PeterZ's patch. The notion that we
should suddenly say that "oh, the i386 kernel stack is odd" after 28
years of having that standard i386 stack is just crazy. And this:

arch/x86/entry/entry_32.S | 136 ++++++++++++++++++++++++++++-------
...
12 files changed, 323 insertions(+), 140 deletions(-)


vs this:

arch/x86/entry/entry_32.S | 7 +++-
...
6 files changed, 120 insertions(+), 13 deletions(-)

is still pretty damn conclusive. Not to mention that the simple
approach had a truly mindbogglingly simple solution with no actual
subtle changes anywhere else.

So I still claim that we should do my patch. Because it is SIMPLE.
It's straightforward, and I can explain every single line in it. Even
if I spent *way* too long before I realized that the "trivial"
memmove() wasn't so trivial.

Linus
arch/x86/entry/entry_32.S | 7 +++-
arch/x86/entry/entry_64.S | 14 ++++++--
arch/x86/include/asm/ftrace.h | 2 +-
arch/x86/include/asm/text-patching.h | 66 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/ftrace.c | 29 +++++++++++++---
arch/x86/kernel/traps.c | 15 +++++---
6 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d309f30cf7af..138ac432221b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1483,7 +1483,12 @@ ENTRY(int3)
TRACE_IRQS_OFF
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
- call do_int3
+
+ # make room on kernel stack for push emulation
+ # do_kernel_int3 returns possibly updated pt_regs
+ subl $8, %esp
+ call do_kernel_int3
+ movl %eax, %esp
jmp ret_from_exception
END(int3)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..834ec1397dab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
* @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8

@@ -899,6 +899,16 @@ ENTRY(\sym)
jnz .Lfrom_usermode_switch_stack_\@
.endif

+ .if \create_gap == 1
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
.if \paranoid
call paranoid_entry
.else
@@ -1130,7 +1140,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
#endif /* CONFIG_HYPERV */

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0
+idtentry int3 do_int3 has_error_code=0 create_gap=1
idtentry stack_segment do_stack_segment has_error_code=1

#ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index cf350639e76d..4b335ac5afcc 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -37,7 +37,7 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};

-int ftrace_int3_handler(struct pt_regs *regs);
+int ftrace_int3_handler(struct pt_regs **regs);

#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..c91fdecab9c2 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -39,4 +39,70 @@ extern int poke_int3_handler(struct pt_regs *regs);
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
extern int after_bootmem;

+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+/*
+ * The 32-bit kernel entry exception stack does not contain the whole
+ * 'struct pt_regs', because the hardware will not push sp/ss when the
+ * protection level doesn't change.
+ *
+ * So pushing a value onto the stack requires us to make room for it by
+ * moving the whole truncated 'pt_regs' down by four bytes, and the
+ * stack we return to will be directly after it. The lowlevel x86-32
+ * entry code will have made room on the stack for this, so we're not
+ * overwriting anything else there.
+ *
+ * On x86-64, the exception stack is much simpler, and always contains
+ * sp/ss, so we can just update the values directly. Again, the entry
+ * code has made sure there is a gap (now above pt_regs!) so this is ok.
+ */
+static inline struct pt_regs *int3_emulate_push(struct pt_regs *regs, unsigned long value)
+{
+#ifdef CONFIG_X86_32
+#define SAVED_KERNEL_REGS_SIZE (offsetof(struct pt_regs, sp))
+ struct pt_regs *new = (void *)regs - sizeof(long);
+
+ /* Inlined "memmove()" of the pt_regs */
+ unsigned long *p = (unsigned long *) new;
+ int i = SAVED_KERNEL_REGS_SIZE / sizeof(unsigned long);
+ do { *p = p[1]; p++; } while (--i);
+
+ /*
+ * NOTE! '&new->sp' is actually the top of the stack that 'int3'
+ * will return to! So despite what this looks like, this doesn't
+ * update the stack pointer, but writing the value to 'new->sp'
+ * is actually writing the value *to* the top of the stack in
+ * the returning context!
+ *
+ * The thing that updates the stack pointer in the context we
+ * return to is the fact that we moved 'struct pt_regs' itself,
+ * and will be returning using that moved stack frame.
+ */
+ new->sp = value;
+ return new;
+#else
+ unsigned long *sp;
+
+ regs->sp -= sizeof(long);
+ sp = (unsigned long *)regs->sp;
+ *sp = value;
+ return regs;
+#endif
+}
+
+static inline struct pt_regs *int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+ unsigned long next_ip = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE;
+
+ regs = int3_emulate_push(regs, next_ip);
+ regs->ip = func;
+ return regs;
+}
+
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..6c239fca99d0 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
+#include <asm/text-patching.h>

#ifdef CONFIG_DYNAMIC_FTRACE

@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
}

static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;

static int update_ftrace_func(unsigned long ip, void *new)
{
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;

+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);

@@ -287,20 +291,32 @@ static nokprobe_inline int is_ftrace_caller(unsigned long ip)
* call to a nop. While the change is taking place, we treat
* it just like it was a nop.
*/
-int ftrace_int3_handler(struct pt_regs *regs)
+int ftrace_int3_handler(struct pt_regs **pregs)
{
+ struct pt_regs *regs = *pregs;
unsigned long ip;

if (WARN_ON_ONCE(!regs))
return 0;

- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
+ if (user_mode(regs))
return 0;

- regs->ip += MCOUNT_INSN_SIZE - 1;
+ ip = regs->ip - INT3_INSN_SIZE;

- return 1;
+ if (ftrace_location(ip)) {
+ *pregs = int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ if (!ftrace_update_func_call) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+ *pregs = int3_emulate_call(regs, ftrace_update_func_call);
+ return 1;
+ }
+
+ return 0;
}
NOKPROBE_SYMBOL(ftrace_int3_handler);

@@ -859,6 +875,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)

func = ftrace_ops_get_func(ops);

+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +978,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;

+ ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);

return update_ftrace_func(ip, new);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d26f9e9c3d83..e811e55ab030 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,7 +570,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
NOKPROBE_SYMBOL(do_general_protection);

-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage struct pt_regs * notrace do_kernel_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
/*
@@ -578,11 +578,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
* See note by declaration of modifying_ftrace_code in ftrace.c
*/
if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
+ ftrace_int3_handler(&regs))
+ return regs;
#endif
if (poke_int3_handler(regs))
- return;
+ return regs;

/*
* Use ist_enter despite the fact that we don't use an IST stack.
@@ -614,6 +614,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)

exit:
ist_exit(regs);
+ return regs;
+}
+NOKPROBE_SYMBOL(do_kernel_int3);
+
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+ WARN_ON_ONCE(do_kernel_int3(regs, error_code) != regs);
}
NOKPROBE_SYMBOL(do_int3);