[PATCH v4 12/16] x86/kprobes: Fix ordering

From: Peter Zijlstra
Date: Fri Oct 18 2019 - 03:51:57 EST


Kprobes does something like:

register:
arch_arm_kprobe()
text_poke(INT3)
/* guarantees nothing, INT3 will become visible at some point, maybe */

kprobe_optimizer()
/* guarantees the bytes after INT3 are unused */
syncrhonize_rcu_tasks();
text_poke_bp(JMP32);
/* implies IPI-sync, kprobe really is enabled */


unregister:
__disarm_kprobe()
unoptimize_kprobe()
text_poke_bp(INT3 + tail);
/* implies IPI-sync, so tail is guaranteed visible */
arch_disarm_kprobe()
text_poke(old);
/* guarantees nothing, old will maybe become visible */

synchronize_rcu()

free-stuff

Now the problem is that on register, the synchronize_rcu_tasks() does
not imply sufficient to guarantee all CPUs have already observed INT3
(although in practise this is exceedingly unlikely not to have
happened) (similar to how MEMBARRIER_CMD_PRIVATE_EXPEDITED does not
imply MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE).

Worse, even if it did, we'd have to do 2 synchronize calls to provide
the guarantee we're looking for, the first to ensure INT3 is visible,
the second to guarantee nobody is then still using the instruction
bytes after INT3.

Similar on unregister; the synchronize_rcu() between
__unregister_kprobe_top() and __unregister_kprobe_bottom() does not
guarantee all CPUs are free of the INT3 (and observe the old text).

Therefore, sprinkle some IPI-sync love around. This guarantees that
all CPUs agree on the text and RCU once again provides the required
guaranteed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: hpa@xxxxxxxxx
Cc: paulmck@xxxxxxxxxx
Cc: mathieu.desnoyers@xxxxxxxxxxxx
---
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/kernel/alternative.c | 11 ++++++++---
arch/x86/kernel/kprobes/core.c | 2 ++
arch/x86/kernel/kprobes/opt.c | 12 ++++--------
4 files changed, 15 insertions(+), 11 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -42,6 +42,7 @@ extern void text_poke_early(void *addr,
* an inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -936,6 +936,11 @@ static void do_sync_core(void *info)
sync_core();
}

+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
struct text_poke_loc {
s32 rel_addr; /* addr := _stext + rel_addr */
s32 rel32;
@@ -1085,7 +1090,7 @@ static void text_poke_bp_batch(struct te
for (i = 0; i < nr_entries; i++)
text_poke(text_poke_addr(&tp[i]), &int3, sizeof(int3));

- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();

/*
* Second step: update all but the first byte of the patched range.
@@ -1107,7 +1112,7 @@ static void text_poke_bp_batch(struct te
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
}

/*
@@ -1123,7 +1128,7 @@ static void text_poke_bp_batch(struct te
}

if (do_sync)
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();

/*
* sync_core() implies an smp_mb() and orders this store against
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -502,11 +502,13 @@ int arch_prepare_kprobe(struct kprobe *p
void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ text_poke_sync();
}

void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
+ text_poke_sync();
}

void arch_remove_kprobe(struct kprobe *p)
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -444,14 +444,10 @@ void arch_optimize_kprobes(struct list_h
/* Replace a relative jump with a breakpoint (int3). */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- u8 insn_buff[JMP32_INSN_SIZE];
-
- /* Set int3 to first byte for kprobes */
- insn_buff[0] = INT3_INSN_OPCODE;
- memcpy(insn_buff + 1, op->optinsn.copied_insn, DISP32_SIZE);
-
- text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE,
- text_gen_insn(JMP32_INSN_OPCODE, op->kp.addr, op->optinsn.insn));
+ arch_arm_kprobe(&op->kp);
+ text_poke(op->kp.addr + INT3_INSN_SIZE,
+ op->optinsn.copied_insn, DISP32_SIZE);
+ text_poke_sync();
}

/*