Re: [RFC PATCH v2 3/3] perf evsel: Add retirement latency event support

From: Namhyung Kim
Date: Mon Apr 29 2024 - 17:31:41 EST


On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
>
> When a retirement latency event is processed it sets a flag on the
> evsel. This change makes it so that when the flag is set evsel
> opening, reading and exiting report values from child perf record and
> perf report processes.
>
> Something similar was suggested by Namhyung Kim in:
> https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69+rD06RAnu9-EQ@xxxxxxxxxxxxxx/
>
> This is trying to add support for retirement latency directly in
> events rather than through metric changes, as suggested by Weilin Wang in:
> https://lore.kernel.org/lkml/20240402214436.1409476-1-weilin.wang@xxxxxxxxx/

This seems to create perf record + report pair for each R event
while Weilin's patch handled multiple events at once.

>
> Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
> ---
> tools/perf/util/evsel.c | 186 +++++++++++++++++++++++++++++++++++++++-
> tools/perf/util/evsel.h | 3 +
> 2 files changed, 188 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index a0a8aee7d6b9..17c123cddde3 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -22,6 +22,7 @@
> #include <sys/resource.h>
> #include <sys/types.h>
> #include <dirent.h>
> +#include <signal.h>
> #include <stdlib.h>
> #include <perf/evsel.h>
> #include "asm/bug.h"
> @@ -58,6 +59,7 @@
> #include <internal/xyarray.h>
> #include <internal/lib.h>
> #include <internal/threadmap.h>
> +#include <subcmd/run-command.h>
>
> #include <linux/ctype.h>
>
> @@ -493,6 +495,162 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
> }
> #endif
>
> +static int evsel__start_retire_latency_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> + int cpu_map_idx)
> +{
> + char buf[16];
> + int pipefd[2];
> + int err, i;
> + struct perf_cpu cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
> + struct child_process *child_record =
> + xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report =
> + xyarray__entry(evsel->children, cpu_map_idx, 1);
> + char *event = strdup(evsel__name(evsel));
> + /* TODO: the dummy event also won't be used, but there's no option to disable. */
> + const char *record_argv[15] = {
> + [0] = "perf",
> + [1] = "record",
> + [2] = "--synth=no",
> + [3] = "--no-bpf-event",
> + [4] = "-W",
> + [5] = "-o",
> + [6] = "-",
> + [7] = "-e",
> + };
> + const char *report_argv[] = {
> + [0] = "perf",
> + [1] = "report",
> + [2] = "-i",
> + [3] = "-",
> + [4] = "-q",
> + [5] = "-F",
> + [6] = "retire_lat",
> + NULL,
> + };
> +
> + if (evsel->core.attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + if (!event)
> + return -ENOMEM;
> +
> + /* Change the R modifier to the maximum precision one. */
> + for (i = strlen(event) - 1; i > 0; i--) {
> + if (event[i] == 'R')
> + break;
> + if (event[i] == ':' || event[i] == '/')
> + i = 0;
> + }
> + if (i <= 0) {
> + pr_err("Expected retired latency 'R'\n");
> + return -EINVAL;
> + }
> + event[i] = 'P';
> +
> + i = 8;
> + record_argv[i++] = event;
> + if (verbose) {
> + record_argv[i++] = verbose > 1 ? "-vv" : "-v";
> + }
> + if (cpu.cpu >= 0) {
> + record_argv[i++] = "-C";
> + snprintf(buf, sizeof(buf), "%d", cpu.cpu);
> + record_argv[i++] = buf;
> + } else {
> + record_argv[i++] = "-a";
> + }
> + /* TODO: use something like the control fds to control perf record behavior. */
> + record_argv[i++] = "sleep";
> + record_argv[i++] = "0.1";

This might be too short and the record process can exit
before perf report (but I guess it's fine when using a pipe).
Also I'm not sure if it's ok to get the retire latency of 100 ms
regardless of the execution of the given workload.

> +
> + if (pipe(pipefd) < 0) {
> + free(event);
> + return -errno;
> + }
> +
> + child_record->argv = record_argv;
> + child_record->pid = -1;
> + child_record->no_stdin = 1;
> + if (verbose)
> + child_record->err = fileno(stderr);
> + else
> + child_record->no_stderr = 1;
> + child_record->out = pipefd[1];
> + err = start_command(child_record);
> + free(event);
> + if (err)
> + return err;
> +
> + child_report->argv = report_argv;
> + child_report->pid = -1;
> + if (verbose)
> + child_report->err = fileno(stderr);
> + else
> + child_report->no_stderr = 1;
> + child_report->in = pipefd[0];
> + child_report->out = -1;
> + return start_command(child_report);
> +}
> +
> +static int evsel__finish_retire_latency_cpu(struct evsel *evsel, int cpu_map_idx)
> +{
> + struct child_process *child_record =
> + xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report =
> + xyarray__entry(evsel->children, cpu_map_idx, 1);
> +
> + if (child_record->pid > 0)
> + finish_command(child_record);
> + if (child_report->pid > 0)
> + finish_command(child_report);
> + return 0;
> +}
> +
> +static int evsel__read_retire_latency(struct evsel *evsel, int cpu_map_idx, int thread)
> +{
> + struct child_process *child_record = xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report = xyarray__entry(evsel->children, cpu_map_idx, 1);
> + struct perf_counts_values *count = perf_counts(evsel->counts, cpu_map_idx, thread);
> + char buf[256];
> + int err;
> +
> + kill(child_record->pid, SIGTERM);
> + err = read(child_report->out, buf, sizeof(buf));
> + if (err < 0 || strlen(buf) == 0)
> + return -1;
> +
> + count->val = atoll(buf);
> + count->ena = 1;
> + count->run = 1;
> + count->id = 0;
> + count->lost = 0;
> +
> + /*
> + * TODO: The SIGCHLD from the children exiting will cause interval mode
> + * to stop, use the control fd to avoid this.
> + */
> + err = evsel__finish_retire_latency_cpu(evsel, cpu_map_idx);
> + if (err)
> + return err;
> +
> + /* Restart the counter. */
> + return evsel__start_retire_latency_cpu(evsel, evsel->core.cpus, cpu_map_idx);

Is this for the interval mode? Then I think it's better to move the
logic there and let this code just do the 'read'.


> +}
> +
> +static int evsel__finish_retire_latency(struct evsel *evsel)
> +{
> + int idx;
> +
> + perf_cpu_map__for_each_idx(idx, evsel->core.cpus) {
> + int err = evsel__finish_retire_latency_cpu(evsel, idx);
> +
> + if (err)
> + return err;
> + }
> + return 0;
> +}
> +
> const char *const evsel__hw_names[PERF_COUNT_HW_MAX] = {
> "cycles",
> "instructions",
> @@ -1465,6 +1623,10 @@ static void evsel__free_config_terms(struct evsel *evsel)
>
> void evsel__exit(struct evsel *evsel)
> {
> + if (evsel->children) {
> + evsel__finish_retire_latency(evsel);
> + zfree(&evsel->children);

You'd better call xyarray__delete() and set it to NULL.

Thanks,
Namhyung

> + }
> assert(list_empty(&evsel->core.node));
> assert(evsel->evlist == NULL);
> bpf_counter__destroy(evsel);
> @@ -1778,6 +1940,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> if (evsel__is_tool(evsel))
> return evsel__read_tool(evsel, cpu_map_idx, thread);
>
> + if (evsel->retire_lat)
> + return evsel__read_retire_latency(evsel, cpu_map_idx, thread);
> +
> if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> return evsel__read_group(evsel, cpu_map_idx, thread);
>
> @@ -1993,10 +2158,22 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
> threads = empty_thread_map;
> }
>
> - if (evsel->core.fd == NULL &&
> + if (!evsel->retire_lat && evsel->core.fd == NULL &&
> perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
> return -ENOMEM;
>
> + if (evsel->retire_lat && evsel->children == NULL) {
> + /*
> + * Use ylen of 2, [0] is the record and [1] is the report
> + * command. Currently retirement latency doesn't support
> + * per-thread mode.
> + */
> + evsel->children = xyarray__new(perf_cpu_map__nr(cpus), /*ylen=*/2,
> + sizeof(struct child_process));
> + if (!evsel->children)
> + return -ENOMEM;
> + }
> +
> evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
> if (evsel->cgrp)
> evsel->open_flags |= PERF_FLAG_PID_CGROUP;
> @@ -2209,6 +2386,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
>
> for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
>
> + if (evsel->retire_lat) {
> + err = evsel__start_retire_latency_cpu(evsel, cpus, idx);
> + if (err)
> + goto out_close;
> + continue;
> + }
> +
> for (thread = 0; thread < nthreads; thread++) {
> int fd, group_fd;
> retry_open:
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index bd8e84954e34..3c0806436e64 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -177,6 +177,9 @@ struct evsel {
> __u64 start_time;
> /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> bool pid_stat;
> +
> + /* Used for retire_lat child process. */
> + struct xyarray *children;
> };
>
> struct perf_missing_features {
> --
> 2.44.0.769.g3c40516874-goog
>