[PATCH v4] kretprobe: percpu support

From: Luigi Rizzo
Date: Fri Feb 21 2020 - 16:17:15 EST


kretprobe uses a list protected by a single lock to allocate a
kretprobe_instance in pre_handler_kretprobe(). This works poorly with
concurrent calls.

This patch offers a simplified fix: the percpu_instance flag indicates
that kretprobe_instance's are per CPU, which makes allocation lock free.

As part of this patch, we factor out the code to allocate instances in
get_pcpu_rp_instance() and get_rp_instance().

At the moment we only allow one pending kretprobe per CPU. This can be
extended to a small constant number of entries, but finding a free entry
would either bring back the lock, or require scanning an array, which can
be expensive (especially if callers block and migrate).

Signed-off-by: Luigi Rizzo <lrizzo@xxxxxxxxxx>
---
include/linux/kprobes.h | 8 +++-
kernel/kprobes.c | 102 +++++++++++++++++++++++++++++++---------
kernel/test_kprobes.c | 20 ++++++--
3 files changed, 101 insertions(+), 29 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 04bdaf01112cb..c5894f4c5c26c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -142,6 +142,8 @@ static inline int kprobe_ftrace(struct kprobe *p)
* can be active concurrently.
* nmissed - tracks the number of times the probed function's return was
* ignored, due to maxactive being too low.
+ * percpu_instance - if set, uses one instance per cpu instead of allocating
+ * from the list protected by lock.
*
*/
struct kretprobe {
@@ -151,8 +153,12 @@ struct kretprobe {
int maxactive;
int nmissed;
size_t data_size;
- struct hlist_head free_instances;
+ union {
+ struct kretprobe_instance __percpu *pcpu;
+ struct hlist_head free_instances;
+ };
raw_spinlock_t lock;
+ bool percpu_instance;
};

struct kretprobe_instance {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00f..46018a4a76178 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri,
hlist_del(&ri->hlist);
INIT_HLIST_NODE(&ri->hlist);
if (likely(rp)) {
+ if (rp->percpu_instance) {
+ ri->rp = NULL;
+ return;
+ }
raw_spin_lock(&rp->lock);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock(&rp->lock);
@@ -1274,6 +1278,11 @@ static inline void free_rp_inst(struct kretprobe *rp)
struct kretprobe_instance *ri;
struct hlist_node *next;

+ if (rp->percpu_instance) {
+ free_percpu(rp->pcpu);
+ return;
+ }
+
hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -1851,6 +1860,46 @@ unsigned long __weak arch_deref_entry_point(void *entry)
}

#ifdef CONFIG_KRETPROBES
+struct kretprobe_instance *get_pcpu_rp_instance(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ri = this_cpu_ptr(rp->pcpu);
+ if (!ri || ri->rp) { /* already in use */
+ local_irq_restore(flags);
+ rp->nmissed++;
+ return NULL;
+ }
+ INIT_HLIST_NODE(&ri->hlist);
+ ri->rp = rp;
+ ri->task = current;
+ local_irq_restore(flags);
+ return ri;
+}
+
+struct kretprobe_instance *get_rp_instance(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rp->lock, flags);
+ if (hlist_empty(&rp->free_instances)) {
+ rp->nmissed++;
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+ return NULL;
+ }
+ ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance,
+ hlist);
+ hlist_del(&ri->hlist);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+
+ ri->rp = rp;
+ ri->task = current;
+ return ri;
+}
+
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -1873,35 +1922,32 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}

/* TODO: consider to only swap the RA after the last pre_handler fired */
- hash = hash_ptr(current, KPROBE_HASH_BITS);
- raw_spin_lock_irqsave(&rp->lock, flags);
- if (!hlist_empty(&rp->free_instances)) {
- ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, hlist);
- hlist_del(&ri->hlist);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
-
- ri->rp = rp;
- ri->task = current;
-
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ if (rp->percpu_instance) {
+ ri = get_pcpu_rp_instance(rp);
+ } else {
+ ri = get_rp_instance(rp);
+ }
+ if (!ri)
+ return 0;
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ if (rp->percpu_instance) {
+ ri->rp = NULL;
+ } else {
raw_spin_lock_irqsave(&rp->lock, flags);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock_irqrestore(&rp->lock, flags);
- return 0;
}
+ return 0;
+ }
+ arch_prepare_kretprobe(ri, regs);

- arch_prepare_kretprobe(ri, regs);
+ /* XXX(hch): why is there no hlist_move_head? */
+ INIT_HLIST_NODE(&ri->hlist);
+ hash = hash_ptr(current, KPROBE_HASH_BITS);
+ kretprobe_table_lock(hash, &flags);
+ hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+ kretprobe_table_unlock(hash, &flags);

- /* XXX(hch): why is there no hlist_move_head? */
- INIT_HLIST_NODE(&ri->hlist);
- kretprobe_table_lock(hash, &flags);
- hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
- kretprobe_table_unlock(hash, &flags);
- } else {
- rp->nmissed++;
- raw_spin_unlock_irqrestore(&rp->lock, flags);
- }
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
@@ -1950,6 +1996,15 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;

+ if (rp->percpu_instance) {
+ rp->pcpu = __alloc_percpu(sizeof(*rp->pcpu) + rp->data_size,
+ __alignof__(*rp->pcpu));
+ if (rp->pcpu)
+ goto finalize;
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
@@ -1971,6 +2026,7 @@ int register_kretprobe(struct kretprobe *rp)
hlist_add_head(&inst->hlist, &rp->free_instances);
}

+finalize:
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 76c997fdbc9da..4ebe48301d743 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -236,31 +236,36 @@ static struct kretprobe rp2 = {
.kp.symbol_name = "kprobe_target2"
};

-static int test_kretprobes(void)
+static int test_kretprobes(bool percpu_instance)
{
int ret;
struct kretprobe *rps[2] = {&rp, &rp2};
+ const char *mode = percpu_instance ? "percpu " : "normal";

/* addr and flags should be cleard for reusing kprobe. */
rp.kp.addr = NULL;
rp.kp.flags = 0;
+ rp.percpu_instance = percpu_instance;
+ rp2.kp.addr = NULL;
+ rp2.kp.flags = 0;
+ rp2.percpu_instance = percpu_instance;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe mode %s returned %d\n", mode, ret);
return ret;
}

krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
+ pr_err("kretprobe handler mode %s not called\n", mode);
handler_errors++;
}

krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 mode %s not called\n", mode);
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -297,7 +302,12 @@ int init_test_probes(void)
errors++;

num_tests++;
- ret = test_kretprobes();
+ ret = test_kretprobes(false);
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_kretprobes(true);
if (ret < 0)
errors++;
#endif /* CONFIG_KRETPROBES */
--
2.25.0.265.gbab2e86ba0-goog