[PATCH v3 1/5] perf record off-cpu: Add direct off-cpu event

From: Howard Chu
Date: Fri Jul 26 2024 - 06:28:59 EST


Add direct off-cpu event called "offcpu-time-direct". Add a threshold to
dump direct off-cpu samples, "--off-cpu-thresh". Default value of
--off-cpu-thresh is UULONG_MAX(no direct off-cpu samples), and
--off-cpu-thresh's unit is milliseconds.

Bind fds and sample_id in off_cpu_start()

Note that we add "offcpu-time-direct" event using parse_event(), because we
need to make it no-inherit, otherwise perf_event_open() will fail.

Introduce sample_type_embed, indicating the sample_type of a sample
embedded in BPF output. More discussions in later patches.

Signed-off-by: Howard Chu <howardchu95@xxxxxxxxx>
Suggested-by: Ian Rogers <irogers@xxxxxxxxxx>
---
tools/perf/builtin-record.c | 2 ++
tools/perf/util/bpf_off_cpu.c | 53 ++++++++++++++++++++++++++++++++++-
tools/perf/util/off_cpu.h | 1 +
tools/perf/util/record.h | 1 +
4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a94516e8c522..708d48d309d6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -3325,6 +3325,7 @@ static struct record record = {
.ctl_fd = -1,
.ctl_fd_ack = -1,
.synth = PERF_SYNTH_ALL,
+ .off_cpu_thresh = ULLONG_MAX,
},
.tool = {
.sample = process_sample_event,
@@ -3557,6 +3558,7 @@ static struct option __record_options[] = {
"write collected trace data into several data files using parallel threads",
record__parse_threads),
OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
+ OPT_U64(0, "off-cpu-thresh", &record.opts.off_cpu_thresh, "time threshold(in ms) for dumping off-cpu events"),
OPT_END()
};

diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index 6af36142dc5a..905a11c96c5b 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -13,6 +13,7 @@
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
+#include <internal/xyarray.h>

#include "bpf_skel/off_cpu.skel.h"

@@ -45,10 +46,12 @@ static int off_cpu_config(struct evlist *evlist)
.size = sizeof(attr), /* to capture ABI version */
};
char *evname = strdup(OFFCPU_EVENT);
+ char off_cpu_direct_event[64];

if (evname == NULL)
return -ENOMEM;

+ /* off-cpu event in the end */
evsel = evsel__new(&attr);
if (!evsel) {
free(evname);
@@ -65,12 +68,22 @@ static int off_cpu_config(struct evlist *evlist)
free(evsel->name);
evsel->name = evname;

+ /* direct off-cpu event */
+ snprintf(off_cpu_direct_event, sizeof(off_cpu_direct_event), "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT_DIRECT);
+ if (parse_event(evlist, off_cpu_direct_event)) {
+ pr_err("Failed to open off-cpu event\n");
+ return -1;
+ }
+
return 0;
}

static void off_cpu_start(void *arg)
{
struct evlist *evlist = arg;
+ struct evsel *evsel;
+ struct perf_cpu pcpu;
+ int i, err;

/* update task filter for the given workload */
if (!skel->bss->has_cpu && !skel->bss->has_task &&
@@ -86,6 +99,27 @@ static void off_cpu_start(void *arg)
bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
}

+ /* sample id and fds in BPF's perf_event_array can only be set after record__open() */
+ evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT_DIRECT);
+ if (evsel == NULL) {
+ pr_err("%s evsel not found\n", OFFCPU_EVENT_DIRECT);
+ return;
+ }
+
+ if (evsel->core.id)
+ skel->bss->sample_id = evsel->core.id[0];
+
+ perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
+ err = bpf_map__update_elem(skel->maps.offcpu_output,
+ &pcpu.cpu, sizeof(__u32),
+ xyarray__entry(evsel->core.fd, pcpu.cpu, 0),
+ sizeof(__u32), BPF_ANY);
+ if (err) {
+ pr_err("Failed to update perf event map for direct off-cpu dumping\n");
+ return;
+ }
+ }
+
skel->bss->enabled = 1;
}

@@ -130,14 +164,24 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
{
int err, fd, i;
int ncpus = 1, ntasks = 1, ncgrps = 1;
+ __u64 offcpu_thresh;
struct strlist *pid_slist = NULL;
struct str_node *pos;
+ struct evsel *evsel;

if (off_cpu_config(evlist) < 0) {
pr_err("Failed to config off-cpu BPF event\n");
return -1;
}

+ evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT_DIRECT);
+ if (evsel == NULL) {
+ pr_err("%s evsel not found\n", OFFCPU_EVENT_DIRECT);
+ return -1 ;
+ }
+
+ evsel->sample_type_embed = OFFCPU_SAMPLE_TYPES;
+
skel = off_cpu_bpf__open();
if (!skel) {
pr_err("Failed to open off-cpu BPF skeleton\n");
@@ -250,7 +294,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
}

if (evlist__first(evlist)->cgrp) {
- struct evsel *evsel;
u8 val = 1;

skel->bss->has_cgroup = 1;
@@ -272,6 +315,14 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
}
}

+ offcpu_thresh = opts->off_cpu_thresh;
+
+ if (opts->off_cpu_thresh != ULLONG_MAX)
+ offcpu_thresh = opts->off_cpu_thresh * 1000000; /* off-cpu-thresh is in ms */
+
+ skel->bss->offcpu_thresh = offcpu_thresh;
+ skel->bss->sample_type = OFFCPU_SAMPLE_TYPES;
+
err = off_cpu_bpf__attach(skel);
if (err) {
pr_err("Failed to attach off-cpu BPF skeleton\n");
diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
index 2dd67c60f211..a349f8e300e0 100644
--- a/tools/perf/util/off_cpu.h
+++ b/tools/perf/util/off_cpu.h
@@ -9,6 +9,7 @@ struct perf_session;
struct record_opts;

#define OFFCPU_EVENT "offcpu-time"
+#define OFFCPU_EVENT_DIRECT "offcpu-time-direct"

#define OFFCPU_SAMPLE_TYPES (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP | \
PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index a6566134e09e..3c11416e6627 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -79,6 +79,7 @@ struct record_opts {
int synth;
int threads_spec;
const char *threads_user_spec;
+ u64 off_cpu_thresh;
};

extern const char * const *record_usage;
--
2.45.2