[PATCH 3/6] perf: implement kprobe support to PERF_TYPE_PROBE

From: Song Liu
Date: Wed Nov 15 2017 - 12:25:44 EST


A new pmu, perf_probe, is created for PERF_TYPE_PROBE. Based on
input from perf_event_open(), perf_probe creates a kprobe (or
kretprobe) for the perf_event. This kprobe is private to this
perf_event, and thus not added to global lists, and not
available in tracefs.

Two functions, create_local_trace_kprobe() and
destroy_local_trace_kprobe(), are added to create and destroy these
local trace_kprobes.

Signed-off-by: Song Liu <songliubraving@xxxxxx>
Reviewed-by: Yonghong Song <yhs@xxxxxx>
Reviewed-by: Josef Bacik <jbacik@xxxxxx>
---
include/linux/trace_events.h | 2 +
kernel/events/core.c | 41 +++++++++++++++++--
kernel/trace/trace_event_perf.c | 81 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace_kprobe.c | 91 +++++++++++++++++++++++++++++++++++++----
kernel/trace/trace_probe.h | 7 ++++
5 files changed, 211 insertions(+), 11 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bcb4dc..743e68d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -494,6 +494,8 @@ extern int perf_trace_init(struct perf_event *event);
extern void perf_trace_destroy(struct perf_event *event);
extern int perf_trace_add(struct perf_event *event, int flags);
extern void perf_trace_del(struct perf_event *event, int flags);
+extern int perf_probe_init(struct perf_event *event);
+extern void perf_probe_destroy(struct perf_event *event);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81dd57b..95c6610 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7966,6 +7966,28 @@ static int perf_tp_event_init(struct perf_event *event)
return 0;
}

+/*
+ * event_init callback of the perf_probe pmu.
+ *
+ * Returns -ENOENT if @event is not a PERF_TYPE_PROBE event, -EOPNOTSUPP if
+ * branch stack sampling was requested, or the error reported by
+ * perf_probe_init(). On success, perf_probe_destroy() is installed as the
+ * event's destroy callback and 0 is returned.
+ */
+static int perf_probe_event_init(struct perf_event *event)
+{
+	int ret;
+
+	if (event->attr.type != PERF_TYPE_PROBE)
+		return -ENOENT;
+
+	/* probe events cannot provide branch samples */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	ret = perf_probe_init(event);
+	if (!ret)
+		event->destroy = perf_probe_destroy;
+
+	return ret;
+}
+
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,

@@ -7977,9 +7999,20 @@ static struct pmu perf_tracepoint = {
.read = perf_swevent_read,
};

+/*
+ * pmu for PERF_TYPE_PROBE events; perf_tp_register() registers it under
+ * the name "probe". Event setup goes through perf_probe_event_init().
+ */
+static struct pmu perf_probe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_probe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static inline void perf_tp_register(void)
{
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+ perf_pmu_register(&perf_probe, "probe", PERF_TYPE_PROBE);
}

static void perf_event_free_filter(struct perf_event *event)
@@ -8061,7 +8094,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;

- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT &&
+ event->attr.type != PERF_TYPE_PROBE)
return perf_event_set_bpf_handler(event, prog_fd);

if (event->tp_event->prog)
@@ -8533,8 +8567,9 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
char *filter_str;
int ret = -EINVAL;

- if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
- !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+ if (((event->attr.type != PERF_TYPE_TRACEPOINT &&
+ event->attr.type != PERF_TYPE_PROBE) ||
+ !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
!has_addr_filter(event))
return -EINVAL;

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3..bf9b99b 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
+#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

@@ -229,6 +230,74 @@ int perf_trace_init(struct perf_event *p_event)
return ret;
}

+#ifdef CONFIG_KPROBE_EVENTS
+/*
+ * Create a local kprobe (or kretprobe when attr.is_return is set) from the
+ * probe description @pd and the optional function name @name, and attach it
+ * to @p_event.
+ *
+ * Returns 0 on success, or the negative errno reported by
+ * create_local_trace_kprobe() or perf_trace_event_init(). On failure the
+ * local kprobe is destroyed again.
+ */
+static int perf_probe_create_kprobe(struct perf_event *p_event,
+				    struct probe_desc *pd, char *name)
+{
+	struct trace_event_call *tp_event;
+	int ret;
+
+	tp_event = create_local_trace_kprobe(
+		name, (void *)(unsigned long)(pd->addr), pd->offset,
+		p_event->attr.is_return);
+	if (IS_ERR(tp_event))
+		return PTR_ERR(tp_event);
+
+	/*
+	 * Hold event_mutex across perf_trace_event_init(), as
+	 * perf_trace_destroy() does for teardown. The init path is expected
+	 * to touch state shared between events (e.g. perf_trace_buf), so it
+	 * must not race with other init/destroy calls.
+	 */
+	mutex_lock(&event_mutex);
+	ret = perf_trace_event_init(tp_event, p_event);
+	mutex_unlock(&event_mutex);
+
+	if (ret)
+		destroy_local_trace_kprobe(tp_event);
+
+	return ret;
+}
+#else
+/* Without CONFIG_KPROBE_EVENTS there is nothing to create. */
+static int perf_probe_create_kprobe(struct perf_event *p_event,
+				    struct probe_desc *pd, char *name)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
+/*
+ * perf_probe_init - set up the probe behind a PERF_TYPE_PROBE event
+ * @p_event: event being initialised; attr.probe_desc holds a user pointer
+ *           to a struct probe_desc
+ *
+ * Copies the probe description and the optional function name from user
+ * space, then creates the probe. Only kprobes are handled here; events
+ * with attr.is_uprobe set fail with -EOPNOTSUPP.
+ *
+ * Returns 0 on success, -EFAULT if the description cannot be read,
+ * -ENOMEM if the name buffer cannot be allocated, -E2BIG if the function
+ * name (including its terminating NUL) does not fit in
+ * MAX_PROBE_FUNC_NAME_LEN bytes, or the error reported while copying the
+ * name or creating the probe.
+ */
+int perf_probe_init(struct perf_event *p_event)
+{
+	struct probe_desc pd;
+	int ret;
+	char *name = NULL;
+	__aligned_u64 aligned_probe_desc;
+
+	/*
+	 * attr.probe_desc may not be 64-bit aligned on 32-bit systems.
+	 * Make an aligned copy of it before calling u64_to_user_ptr().
+	 */
+	memcpy(&aligned_probe_desc, &p_event->attr.probe_desc,
+	       sizeof(aligned_probe_desc));
+
+	if (copy_from_user(&pd, u64_to_user_ptr(aligned_probe_desc),
+			   sizeof(pd)))
+		return -EFAULT;
+
+	if (pd.func) {
+		name = kzalloc(MAX_PROBE_FUNC_NAME_LEN, GFP_KERNEL);
+		if (!name)
+			return -ENOMEM;
+
+		ret = strncpy_from_user(name, u64_to_user_ptr(pd.func),
+					MAX_PROBE_FUNC_NAME_LEN);
+		/*
+		 * A return value equal to the limit means the buffer was
+		 * filled without a terminating NUL. Reject the name rather
+		 * than hand an unterminated string to the kprobe code.
+		 */
+		if (ret == MAX_PROBE_FUNC_NAME_LEN)
+			ret = -E2BIG;
+		if (ret < 0)
+			goto out;
+
+		/* treat an empty string the same as no function name */
+		if (name[0] == '\0') {
+			kfree(name);
+			name = NULL;
+		}
+	}
+
+	if (!p_event->attr.is_uprobe)
+		ret = perf_probe_create_kprobe(p_event, &pd, name);
+	else
+		ret = -EOPNOTSUPP;
+out:
+	kfree(name);
+	return ret;
+}
+
void perf_trace_destroy(struct perf_event *p_event)
{
mutex_lock(&event_mutex);
@@ -237,6 +306,18 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_unlock(&event_mutex);
}

+/*
+ * Destroy callback for PERF_TYPE_PROBE events: detach @p_event from its
+ * trace event and, for kprobes, destroy the local trace_kprobe behind it.
+ */
+void perf_probe_destroy(struct perf_event *p_event)
+{
+	/*
+	 * Take event_mutex around close/unreg, matching the locking used by
+	 * perf_trace_destroy(), so this cannot race with another event's
+	 * init or destroy.
+	 */
+	mutex_lock(&event_mutex);
+	perf_trace_event_close(p_event);
+	perf_trace_event_unreg(p_event);
+	mutex_unlock(&event_mutex);
+
+	if (!p_event->attr.is_uprobe) {
+#ifdef CONFIG_KPROBE_EVENTS
+		destroy_local_trace_kprobe(p_event->tp_event);
+#endif
+	}
+}
+
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e1..16b334a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -438,6 +438,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
disable_kprobe(&tk->rp.kp);
wait = 1;
}
+
+ /*
+ * if tk is not added to any list, it must be a local trace_kprobe
+ * created with perf_event_open. We don't need to wait for these
+ * trace_kprobes
+ */
+ if (list_empty(&tk->list))
+ wait = 0;
out:
if (wait) {
/*
@@ -1315,12 +1323,9 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};

-static int register_kprobe_event(struct trace_kprobe *tk)
+static inline void init_trace_event_call(struct trace_kprobe *tk,
+ struct trace_event_call *call)
{
- struct trace_event_call *call = &tk->tp.call;
- int ret;
-
- /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
@@ -1329,6 +1334,19 @@ static int register_kprobe_event(struct trace_kprobe *tk)
call->event.funcs = &kprobe_funcs;
call->class->define_fields = kprobe_event_define_fields;
}
+
+ call->flags = TRACE_EVENT_FL_KPROBE;
+ call->class->reg = kprobe_register;
+ call->data = tk;
+}
+
+static int register_kprobe_event(struct trace_kprobe *tk)
+{
+ struct trace_event_call *call = &tk->tp.call;
+ int ret = 0;
+
+ init_trace_event_call(tk, call);
+
if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
ret = register_trace_event(&call->event);
@@ -1336,9 +1354,6 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = TRACE_EVENT_FL_KPROBE;
- call->class->reg = kprobe_register;
- call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n",
@@ -1360,6 +1375,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}

+#ifdef CONFIG_PERF_EVENTS
+/*
+ * create_local_trace_kprobe - create a trace_kprobe without adding it to
+ * the global lists
+ * @func: symbol name handed to alloc_trace_kprobe(); may be NULL, in which
+ *        case the event is named "DUMMY_EVENT"
+ * @addr: probe address handed to alloc_trace_kprobe()
+ * @offs: offset handed to alloc_trace_kprobe()
+ * @is_return: true to create a kretprobe instead of a kprobe
+ *
+ * Returns the trace_event_call embedded in the new trace_kprobe, or an
+ * ERR_PTR() on failure. Release it with destroy_local_trace_kprobe().
+ */
+struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+			  bool is_return)
+{
+	struct trace_kprobe *tk;
+	int ret;
+	char *event;
+
+	/*
+	 * local trace_kprobes are not added to probe_list, so they are never
+	 * searched in find_trace_kprobe(). Therefore, there is no concern of
+	 * duplicated name here.
+	 */
+	event = func ? func : "DUMMY_EVENT";
+
+	tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, addr, func,
+				offs, 0 /* maxactive */, 0 /* nargs */,
+				is_return);
+
+	if (IS_ERR(tk)) {
+		pr_info("Failed to allocate trace_probe.(%d)\n",
+			(int)PTR_ERR(tk));
+		return ERR_CAST(tk);
+	}
+
+	init_trace_event_call(tk, &tk->tp.call);
+
+	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = __register_trace_kprobe(tk);
+	if (ret < 0)
+		goto error_free_fmt;
+
+	return &tk->tp.call;
+
+error_free_fmt:
+	/*
+	 * set_print_fmt() allocated tp.call.print_fmt; release it explicitly,
+	 * as register_kprobe_event() does on its own error path.
+	 */
+	kfree(tk->tp.call.print_fmt);
+error:
+	free_trace_kprobe(tk);
+	return ERR_PTR(ret);
+}
+
+/*
+ * destroy_local_trace_kprobe - tear down a trace_kprobe created by
+ * create_local_trace_kprobe()
+ * @event_call: the trace_event_call returned by create_local_trace_kprobe()
+ *
+ * The probe must already be disabled. If it is still enabled, a warning is
+ * emitted and the probe is left registered and unfreed.
+ */
+void destroy_local_trace_kprobe(struct trace_event_call *event_call)
+{
+	struct trace_kprobe *tk;
+
+	tk = container_of(event_call, struct trace_kprobe, tp.call);
+
+	if (trace_probe_is_enabled(&tk->tp)) {
+		WARN_ON(1);
+		return;
+	}
+
+	__unregister_trace_kprobe(tk);
+
+	/*
+	 * print_fmt was allocated by set_print_fmt() when the probe was
+	 * created; free it here, as register_kprobe_event() does on its
+	 * error path, so it is not leaked.
+	 */
+	kfree(tk->tp.call.print_fmt);
+	free_trace_kprobe(tk);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c..910ae1b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -411,3 +411,10 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
}

extern int set_print_fmt(struct trace_probe *tp, bool is_return);
+
+#ifdef CONFIG_PERF_EVENTS
+extern struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+ bool is_return);
+extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
+#endif
--
2.9.5