[RFC PATCH v3 2/2] bpf: Introduce function for outputting data to perf event

From: He Kuang
Date: Tue Jul 07 2015 - 07:44:13 EST


There are scenarios where we need an eBPF program to record not only
kprobe point args, but also the PMU counters, time latencies or cache
miss numbers between two probe points and other information we can
get when the probe point is entered.

This helper function gives eBPF programs the ability to output data as a
perf sample event. The function works like kprobe_perf_func(): it packs the
data from BPF stack space into a sample record and submits it to the
ring buffer of the perf_events which are bound to the BPF ftrace
entry. Userspace perf tools can record the BPF ftrace event to collect
those records.

Signed-off-by: He Kuang <hekuang@xxxxxxxxxx>
---
include/uapi/linux/bpf.h | 3 +++
kernel/trace/bpf_trace.c | 43 +++++++++++++++++++++++++++++++++++++++++++
kernel/trace/trace.h | 5 +++++
samples/bpf/bpf_helpers.h | 2 ++
4 files changed, 53 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9ebdf5..f44b0aa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -210,6 +210,9 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_l4_csum_replace,
+
+ /* int bpf_output_data(void *src, int size, void *regs) */
+ BPF_FUNC_output_data,
__BPF_FUNC_MAX_ID,
};

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce5..45dbeab 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -79,6 +79,47 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};

+static u64 bpf_output_data(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{ /* Emit dsize bytes at src as one TRACE_BPF perf record; r4 and r5 are unused. */
+ void *src = (void *) (long) r1;
+ int dsize = (int) r2, __size, size;
+ void *regs = (void *) (long) r3;
+ struct bpf_trace_entry_head *entry;
+ struct hlist_head *head;
+ int rctx;
+ /* Only an upper bound is checked. NOTE(review): presumably ARG_CONST_STACK_SIZE keeps dsize positive - confirm. */
+ if (dsize > TRACE_BPF_MAX_SIZE)
+ return -ENOMEM;
+ /* Fail with -ENOENT when this CPU's event_bpf.perf_events list is empty. */
+ head = this_cpu_ptr(event_bpf.perf_events);
+ if (hlist_empty(head))
+ return -ENOENT;
+ /* Header + payload, padded so size + sizeof(u32) is u64-aligned (NOTE(review): presumably for a u32 perf prepends - confirm). */
+ __size = sizeof(*entry) + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ /* Report -ENOMEM if perf_trace_buf_prepare() yields no buffer. */
+ entry = perf_trace_buf_prepare(size, TRACE_BPF, NULL, &rctx);
+ if (!entry)
+ return -ENOMEM;
+ /* The header records the payload length; the payload is copied directly after the header. */
+ entry->size = dsize;
+ memcpy(&entry[1], src, dsize);
+ /* Submit the record through perf_tp_event() together with the caller-supplied regs. */
+ perf_tp_event(0, 1, entry, size, regs, head, rctx, NULL);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_output_data_proto = { /* Argument and return types of bpf_output_data(). */
+ .func = bpf_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK, /* src: start of the data to output */
+ .arg2_type = ARG_CONST_STACK_SIZE, /* size: number of bytes copied from src */
+ .arg3_type = ARG_PTR_TO_CTX, /* regs: passed on to perf_tp_event() */
+};
+
static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
/* NMI safe access to clock monotonic */
@@ -170,6 +211,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_map_delete_elem_proto;
case BPF_FUNC_probe_read:
return &bpf_probe_read_proto;
+ case BPF_FUNC_output_data:
+ return &bpf_output_data_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d135f55..8d9100d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -113,6 +113,11 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};

+struct bpf_trace_entry_head { /* Header of a record written by bpf_output_data(). */
+ struct trace_entry ent;
+ unsigned long size; /* byte length of the payload stored right after this header */
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f960b5f..bc7f13c 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -49,5 +49,7 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag
(void *) BPF_FUNC_l3_csum_replace;
static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) =
(void *) BPF_FUNC_l4_csum_replace;
+static int (*bpf_output_data)(void *src, int size, void *regs) =
+ (void *) BPF_FUNC_output_data; /* bound by helper id, like the other stubs in this header */

#endif
--
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/