[PATCH v3 perf, bpf-next 1/4] perf, bpf: Introduce PERF_RECORD_BPF_EVENT
From: Song Liu
Date: Tue Dec 11 2018 - 18:34:20 EST
For better performance analysis of BPF programs, this patch introduces
PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
load/unload information to user space.
Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
The following example shows kernel symbols for a BPF program with 7
sub programs:
ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi
Note that these sub programs are not allocated in contiguous memory
ranges. Instead, each of them occupies separate page(s). The starting
addresses of these sub programs are randomized within their page(s)
for security reasons.
The following data structure is used for PERF_RECORD_BPF_EVENT; one
such record is generated for each _sub program_:
/*
* Record different types of bpf events:
* enum perf_bpf_event_type {
* PERF_BPF_EVENT_UNKNOWN = 0,
* PERF_BPF_EVENT_PROG_LOAD = 1,
* PERF_BPF_EVENT_PROG_UNLOAD = 2,
* };
*
* struct {
* struct perf_event_header header;
* u32 type;
* u32 flags;
* u32 id; // prog_id or other id
* u32 sub_id; // subprog id
*
* // for bpf_prog types, bpf prog or subprog
* u8 tag[BPF_TAG_SIZE];
* u64 addr;
* u64 len;
* char name[];
* struct sample_id sample_id;
* };
*/
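From user space, the fixed-size part of this record can be mapped with
a struct along the lines of the following sketch (names here are
illustrative; BPF_TAG_SIZE is 8, from <linux/bpf.h>; name[] follows,
NUL padded to a u64 boundary, and the sample_id fields selected by
attr.sample_type come after the name):

struct perf_record_bpf_event {
        struct perf_event_header header;
        __u32 type;     /* enum perf_bpf_event_type */
        __u32 flags;
        __u32 id;       /* prog_id or other id */
        __u32 sub_id;   /* subprog index */
        __u8  tag[BPF_TAG_SIZE];
        __u64 addr;     /* start of the jited image */
        __u64 len;      /* length of the jited image */
        /* char name[], then struct sample_id */
};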
This data is designed for different use cases:
1. For simple perf profiling, addr, len, and name[] work similarly to
PERF_RECORD_MMAP. These raw records are stored in the perf.data file.
2. For perf annotation and other cases that need more details of the
BPF program, id and sub_id are used to extract detailed information
about the prog through sys_bpf(BPF_OBJ_GET_INFO_BY_FD); see the
sketch after this list. User space tools are responsible for saving
this detailed information properly, as it will no longer be available
after the bpf program is unloaded.
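For case 2, a tool could resolve the prog behind a record's id along
the lines of this sketch (it relies on the libbpf helpers
bpf_prog_get_fd_by_id() and bpf_obj_get_info_by_fd(); header paths and
error handling are illustrative, and the lookup must happen before the
prog is unloaded):

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <stdio.h>
#include <unistd.h>

static void save_prog_details(__u32 id)
{
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
        int fd;

        fd = bpf_prog_get_fd_by_id(id);
        if (fd < 0)
                return;                 /* prog already gone */

        if (!bpf_obj_get_info_by_fd(fd, &info, &info_len))
                printf("prog %u: name %s jited_len %u\n",
                       id, info.name, info.jited_prog_len);
        close(fd);
}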
This follows the existing perf model of keeping ordered records with
enough information for profiling, while keeping keys for reliably
finding extra, more voluminous information for further analysis, like
raw jitted binaries augmented with line numbers that can be used for
disassembly, annotation, etc.
Currently, PERF_RECORD_BPF_EVENT only supports two event types:
PERF_BPF_EVENT_PROG_LOAD and PERF_BPF_EVENT_PROG_UNLOAD, but it can
easily be extended to support more.
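To receive these records, a tool sets the new bpf_event bit in
perf_event_attr. A minimal sketch (one dummy software event per CPU,
sideband only; privileges, ring buffer mmap and error handling
omitted):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_bpf_event_fd(int cpu)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_SOFTWARE,
                .size           = sizeof(attr),
                .config         = PERF_COUNT_SW_DUMMY,
                .bpf_event      = 1,
                .sample_id_all  = 1,
        };

        /* pid -1: all tasks on this cpu; needs sufficient privileges */
        return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}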
Signed-off-by: Song Liu <songliubraving@xxxxxx>
Reviewed-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
---
include/linux/filter.h | 7 ++
include/linux/perf_event.h | 10 +++
include/uapi/linux/perf_event.h | 35 ++++++++-
kernel/bpf/core.c | 2 +-
kernel/bpf/syscall.c | 3 +
kernel/events/core.c | 123 +++++++++++++++++++++++++++++++-
6 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 537e9e7c6e6f..45d23560f90b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -955,6 +955,7 @@ bpf_address_lookup(unsigned long addr, unsigned long *size,
void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);
#else /* CONFIG_BPF_JIT */
@@ -1010,6 +1011,12 @@ static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}
+
+static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+ sym[0] = '\0';
+}
+
#endif /* CONFIG_BPF_JIT */
void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 53c500f0ca79..02217bab64d0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,12 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
}
extern void perf_event_mmap(struct vm_area_struct *vma);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+ struct bpf_prog *prog);
+#endif
+
extern struct perf_guest_info_callbacks *perf_guest_cbs;
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1333,6 +1339,10 @@ static inline int perf_unregister_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks) { return 0; }
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
+#ifdef CONFIG_BPF_SYSCALL
+static inline void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+ struct bpf_prog *prog) { }
+#endif
static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
static inline void perf_event_namespaces(struct task_struct *tsk) { }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..0ae3dae55fa8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@ struct perf_event_attr {
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
namespaces : 1, /* include namespaces data */
- __reserved_1 : 35;
+ bpf_event : 1, /* include bpf events */
+ __reserved_1 : 34;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -965,9 +966,41 @@ enum perf_event_type {
*/
PERF_RECORD_NAMESPACES = 16,
+ /*
+ * Record different types of bpf events:
+ * enum perf_bpf_event_type {
+ * PERF_BPF_EVENT_UNKNOWN = 0,
+ * PERF_BPF_EVENT_PROG_LOAD = 1,
+ * PERF_BPF_EVENT_PROG_UNLOAD = 2,
+ * };
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u32 type;
+ * u32 flags;
+ * u32 id; // prog_id or other id
+ * u32 sub_id; // subprog id
+ *
+ * // for bpf_prog types, bpf prog or subprog
+ * u8 tag[BPF_TAG_SIZE];
+ * u64 addr;
+ * u64 len;
+ * char name[];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_BPF_EVENT = 17,
+
PERF_RECORD_MAX, /* non-ABI */
};
+enum perf_bpf_event_type {
+ PERF_BPF_EVENT_UNKNOWN = 0,
+ PERF_BPF_EVENT_PROG_LOAD = 1,
+ PERF_BPF_EVENT_PROG_UNLOAD = 2,
+ PERF_BPF_EVENT_MAX, /* non-ABI */
+};
+
#define PERF_MAX_STACK_DEPTH 127
#define PERF_MAX_CONTEXTS_PER_STACK 8
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5cdd8da0e7f2..2a8364294f11 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -496,7 +496,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end = addr + hdr->pages * PAGE_SIZE;
}
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
const struct btf_type *type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 496c2f695f2d..a97a3698b7a4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1210,6 +1210,8 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ perf_event_bpf_event_prog(PERF_BPF_EVENT_PROG_UNLOAD, prog);
+
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
@@ -1558,6 +1560,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
}
bpf_prog_kallsyms_add(prog);
+ perf_event_bpf_event_prog(PERF_BPF_EVENT_PROG_LOAD, prog);
return err;
free_used_maps:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84530ab358c3..4b2d1a58d3fe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
- attr->task ||
+ attr->task || attr->bpf_event ||
attr->context_switch)
return true;
return false;
@@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event)
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7650,6 +7653,122 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+/*
+ * bpf load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+
+ struct {
+ struct perf_event_header header;
+ u32 type;
+ u32 flags;
+ u32 id;
+ u32 sub_id;
+ u8 tag[BPF_TAG_SIZE];
+ u64 addr;
+ u64 len;
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ char name[KSYM_NAME_LEN];
+ int name_len;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ /* get prog name and round up to 64 bit aligned */
+ bpf_get_prog_name(bpf_event->prog, name);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ bpf_event->event_id.header.size += name_len;
+
+ perf_event_header__init_id(&bpf_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+
+ __output_copy(&handle, name, name_len);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf(struct perf_bpf_event *bpf_event)
+{
+ perf_iterate_sb(perf_event_bpf_output,
+ bpf_event,
+ NULL);
+}
+
+static void perf_event_bpf_event_subprog(
+ enum perf_bpf_event_type type,
+ struct bpf_prog *prog, u32 id, u32 sub_id)
+{
+ struct perf_bpf_event bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ /* .flags = 0 */
+ .id = id,
+ .sub_id = sub_id,
+ .addr = (u64)(unsigned long)prog->bpf_func,
+ .len = prog->jited_len,
+ },
+ };
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_event_bpf(&bpf_event);
+}
+
+/*
+ * This is call per bpf_prog. In case of multiple sub programs,
+ * this function calls perf_event_bpf_event_subprog() multiple times
+ */
+void perf_event_bpf_event_prog(enum perf_bpf_event_type type,
+ struct bpf_prog *prog)
+{
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ if (type != PERF_BPF_EVENT_PROG_LOAD &&
+ type != PERF_BPF_EVENT_PROG_UNLOAD)
+ return;
+
+ if (prog->aux->func_cnt == 0) {
+ perf_event_bpf_event_subprog(type, prog,
+ prog->aux->id, 0);
+ } else {
+ int i;
+
+ for (i = 0; i < prog->aux->func_cnt; i++)
+ perf_event_bpf_event_subprog(type, prog->aux->func[i],
+ prog->aux->id, i);
+ }
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9900,6 +10019,8 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
if (inc) {
/*
--
2.17.1