Re: [PATCH v3 1/5] perf record off-cpu: Add direct off-cpu event

From: Ian Rogers
Date: Fri Jul 26 2024 - 19:49:18 EST


On Fri, Jul 26, 2024 at 3:28 AM Howard Chu <howardchu95@xxxxxxxxx> wrote:
>
> Add direct off-cpu event called "offcpu-time-direct". Add a threshold to
> dump direct off-cpu samples, "--off-cpu-thresh". Default value of
> --off-cpu-thresh is UULONG_MAX(no direct off-cpu samples), and
> --off-cpu-thresh's unit is milliseconds.
>
> Bind fds and sample_id in off_cpu_start()
>
> Note that we add "offcpu-time-direct" event using parse_event(), because we
> need to make it no-inherit, otherwise perf_event_open() will fail.
>
> Introduce sample_type_embed, indicating the sample_type of a sample
> embedded in BPF output. More discussions in later patches.
>
> Signed-off-by: Howard Chu <howardchu95@xxxxxxxxx>
> Suggested-by: Ian Rogers <irogers@xxxxxxxxxx>
> ---
> tools/perf/builtin-record.c | 2 ++
> tools/perf/util/bpf_off_cpu.c | 53 ++++++++++++++++++++++++++++++++++-
> tools/perf/util/off_cpu.h | 1 +
> tools/perf/util/record.h | 1 +
> 4 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index a94516e8c522..708d48d309d6 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -3325,6 +3325,7 @@ static struct record record = {
> .ctl_fd = -1,
> .ctl_fd_ack = -1,
> .synth = PERF_SYNTH_ALL,
> + .off_cpu_thresh = ULLONG_MAX,
> },
> .tool = {
> .sample = process_sample_event,
> @@ -3557,6 +3558,7 @@ static struct option __record_options[] = {
> "write collected trace data into several data files using parallel threads",
> record__parse_threads),
> OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
> + OPT_U64(0, "off-cpu-thresh", &record.opts.off_cpu_thresh, "time threshold(in ms) for dumping off-cpu events"),
> OPT_END()
> };
>
> diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
> index 6af36142dc5a..905a11c96c5b 100644
> --- a/tools/perf/util/bpf_off_cpu.c
> +++ b/tools/perf/util/bpf_off_cpu.c
> @@ -13,6 +13,7 @@
> #include "util/cgroup.h"
> #include "util/strlist.h"
> #include <bpf/bpf.h>
> +#include <internal/xyarray.h>
>
> #include "bpf_skel/off_cpu.skel.h"
>
> @@ -45,10 +46,12 @@ static int off_cpu_config(struct evlist *evlist)
> .size = sizeof(attr), /* to capture ABI version */
> };
> char *evname = strdup(OFFCPU_EVENT);
> + char off_cpu_direct_event[64];
>
> if (evname == NULL)
> return -ENOMEM;
>
> + /* off-cpu event in the end */
> evsel = evsel__new(&attr);
> if (!evsel) {
> free(evname);
> @@ -65,12 +68,22 @@ static int off_cpu_config(struct evlist *evlist)
> free(evsel->name);
> evsel->name = evname;
>
> + /* direct off-cpu event */
> + snprintf(off_cpu_direct_event, sizeof(off_cpu_direct_event), "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT_DIRECT);
> + if (parse_event(evlist, off_cpu_direct_event)) {
> + pr_err("Failed to open off-cpu event\n");
> + return -1;
> + }
> +
> return 0;
> }
>
> static void off_cpu_start(void *arg)
> {
> struct evlist *evlist = arg;
> + struct evsel *evsel;
> + struct perf_cpu pcpu;
> + int i, err;
>
> /* update task filter for the given workload */
> if (!skel->bss->has_cpu && !skel->bss->has_task &&
> @@ -86,6 +99,27 @@ static void off_cpu_start(void *arg)
> bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
> }
>
> + /* sample id and fds in BPF's perf_event_array can only be set after record__open() */
> + evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT_DIRECT);
> + if (evsel == NULL) {
> + pr_err("%s evsel not found\n", OFFCPU_EVENT_DIRECT);
> + return;
> + }
> +
> + if (evsel->core.id)
> + skel->bss->sample_id = evsel->core.id[0];
> +
> + perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
> + err = bpf_map__update_elem(skel->maps.offcpu_output,
> + &pcpu.cpu, sizeof(__u32),
> + xyarray__entry(evsel->core.fd, pcpu.cpu, 0),
> + sizeof(__u32), BPF_ANY);
> + if (err) {
> + pr_err("Failed to update perf event map for direct off-cpu dumping\n");
> + return;
> + }
> + }
> +
> skel->bss->enabled = 1;
> }
>
> @@ -130,14 +164,24 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
> {
> int err, fd, i;
> int ncpus = 1, ntasks = 1, ncgrps = 1;
> + __u64 offcpu_thresh;
> struct strlist *pid_slist = NULL;
> struct str_node *pos;
> + struct evsel *evsel;
>
> if (off_cpu_config(evlist) < 0) {
> pr_err("Failed to config off-cpu BPF event\n");
> return -1;
> }
>
> + evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT_DIRECT);
> + if (evsel == NULL) {
> + pr_err("%s evsel not found\n", OFFCPU_EVENT_DIRECT);
> + return -1 ;
> + }
> +
> + evsel->sample_type_embed = OFFCPU_SAMPLE_TYPES;
> +
> skel = off_cpu_bpf__open();
> if (!skel) {
> pr_err("Failed to open off-cpu BPF skeleton\n");
> @@ -250,7 +294,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
> }
>
> if (evlist__first(evlist)->cgrp) {
> - struct evsel *evsel;
> u8 val = 1;
>
> skel->bss->has_cgroup = 1;
> @@ -272,6 +315,14 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target,
> }
> }
>
> + offcpu_thresh = opts->off_cpu_thresh;
> +
> + if (opts->off_cpu_thresh != ULLONG_MAX)
> + offcpu_thresh = opts->off_cpu_thresh * 1000000; /* off-cpu-thresh is in ms */

nit: In this comment, it's not clear if you are referring to the
option or the variable. In modern languages it is usual to have some
kind of "duration" type. As we're using u64s I'd be tempted to add a
"_ms" suffix, just to make clear what the units for the time are. I
think that'd make this:
offcpu_thresh_ms = opts->off_cpu_thresh_ns * 1000000

Thanks,
Ian

> +
> + skel->bss->offcpu_thresh = offcpu_thresh;
> + skel->bss->sample_type = OFFCPU_SAMPLE_TYPES;
> +
> err = off_cpu_bpf__attach(skel);
> if (err) {
> pr_err("Failed to attach off-cpu BPF skeleton\n");
> diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
> index 2dd67c60f211..a349f8e300e0 100644
> --- a/tools/perf/util/off_cpu.h
> +++ b/tools/perf/util/off_cpu.h
> @@ -9,6 +9,7 @@ struct perf_session;
> struct record_opts;
>
> #define OFFCPU_EVENT "offcpu-time"
> +#define OFFCPU_EVENT_DIRECT "offcpu-time-direct"
>
> #define OFFCPU_SAMPLE_TYPES (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP | \
> PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
> diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
> index a6566134e09e..3c11416e6627 100644
> --- a/tools/perf/util/record.h
> +++ b/tools/perf/util/record.h
> @@ -79,6 +79,7 @@ struct record_opts {
> int synth;
> int threads_spec;
> const char *threads_user_spec;
> + u64 off_cpu_thresh;
> };
>
> extern const char * const *record_usage;
> --
> 2.45.2
>