Re: [RFC PATCH v2 1/3] perf evsel: Refactor tool events

From: Namhyung Kim
Date: Tue Apr 30 2024 - 15:01:05 EST


On Mon, Apr 29, 2024 at 7:47 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
>
> On Mon, Apr 29, 2024 at 2:00 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
> >
> > Hi Ian,
> >
> > On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
> > >
> > > Tool events unnecessarily open a dummy perf event which is useless
> > > even with `perf record` which will still open a dummy event. Change
> > > the behavior of tool events so:
> >
> > Right, I was surprised that we need a dummy event for that.
> >
> > >
> > > - duration_time - call `rdclock` on open and then report the count as
> > > a delta since the start in evsel__read_counter. This moves code out
> > > of builtin-stat making it more general purpose.
> > >
> > > - user_time/system_time - open the fd as either `/proc/pid/stat` or
> > > `/proc/stat` for cases like system wide. evsel__read_counter will
> > > read the appropriate field out of the procfs file. These values
> > > were previously supplied by wait4, if the procfs read fails then
> > > the wait4 values are used, assuming the process/thread terminated.
> > > By reading user_time and system_time this way, interval mode can be
> > > supported.
> >
> > Good improvement!
> >
> > >
> > > Opening any of the tool events for `perf record` returns invalid.
> >
> > Ok, I think those tool events are for `perf stat` only.
> >
> > But I feel like this change is a separate optimization and can be
> > independently submitted (from the retire-latency or tPEBS change).
>
> You're right, but as both pieces of code are changing the
> evsel__read_counter implementation I'd kept them together.

Ok.

>
> > >
> > > Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
> > > ---
> > > tools/perf/builtin-stat.c | 74 ++++++-------
> > > tools/perf/util/evsel.c | 223 +++++++++++++++++++++++++++++++++++++-
> > > tools/perf/util/evsel.h | 6 +
> > > 3 files changed, 259 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > index 65a3dd7ffac3..428e9721b908 100644
> > > --- a/tools/perf/builtin-stat.c
> > > +++ b/tools/perf/builtin-stat.c
> > > @@ -255,45 +255,37 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
> > > process_synthesized_event, NULL);
> > > }
> > >
> > > -static int read_single_counter(struct evsel *counter, int cpu_map_idx,
> > > - int thread, struct timespec *rs)
> > > -{
> > > - switch(counter->tool_event) {
> > > - case PERF_TOOL_DURATION_TIME: {
> > > - u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
> > > - struct perf_counts_values *count =
> > > - perf_counts(counter->counts, cpu_map_idx, thread);
> > > - count->ena = count->run = val;
> > > - count->val = val;
> > > - return 0;
> > > - }
> > > - case PERF_TOOL_USER_TIME:
> > > - case PERF_TOOL_SYSTEM_TIME: {
> > > - u64 val;
> > > - struct perf_counts_values *count =
> > > - perf_counts(counter->counts, cpu_map_idx, thread);
> > > - if (counter->tool_event == PERF_TOOL_USER_TIME)
> > > - val = ru_stats.ru_utime_usec_stat.mean;
> > > - else
> > > - val = ru_stats.ru_stime_usec_stat.mean;
> > > - count->ena = count->run = val;
> > > - count->val = val;
> > > - return 0;
> > > - }
> > > - default:
> > > - case PERF_TOOL_NONE:
> > > - return evsel__read_counter(counter, cpu_map_idx, thread);
> > > - case PERF_TOOL_MAX:
> > > - /* This should never be reached */
> > > - return 0;
> > > +static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
> > > +{
> > > + int err = evsel__read_counter(counter, cpu_map_idx, thread);
> > > +
> > > + /*
> > > + * Reading user and system time will fail when the process
> > > + * terminates. Use the wait4 values in that case.
> > > + */
> > > + if (err &&
> > > + (counter->tool_event == PERF_TOOL_USER_TIME ||
> > > + counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
> >
> > Off-topic. I feel like we should have a (fake) pmu for the tool events
> > rather than the evsel->tool_event enum.
>
> Sounds good to me. The refactoring in this (unmerged) test change:
> https://lore.kernel.org/lkml/20240429200225.1271876-4-irogers@xxxxxxxxxx/
> makes it so we can load a PMU from something like a temporary
> directory. It would be nice to get rid of the fake_pmu special case in
> parse-events and similarly the tool event special case. A problem is
> that libperf lacks knowledge of fake_pmus and tool events in the
> evlist and evsel code. libperf can try to apply operations to these
> events even though the only knowledge of them is in the perf tool.

Right, libperf should be aware of the tool pmu then.

>
> >
> > > + u64 val;
> > > + struct perf_counts_values *count =
> > > + perf_counts(counter->counts, cpu_map_idx, thread);
> > > +
> > > + if (counter->tool_event == PERF_TOOL_USER_TIME)
> > > + val = ru_stats.ru_utime_usec_stat.mean;
> > > + else
> > > + val = ru_stats.ru_stime_usec_stat.mean;
> > > + count->ena = count->run = val;
> > > + count->val = val;
> > > + return 0;
> > > }
> > > + return err;
> > > }
> > >
> > > /*
> > > * Read out the results of a single counter:
> > > * do not aggregate counts across CPUs in system-wide mode
> > > */
> > > -static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
> > > +static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
> > > {
> > > int nthreads = perf_thread_map__nr(evsel_list->core.threads);
> > > int thread;
> > > @@ -311,7 +303,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > > * (via evsel__read_counter()) and sets their count->loaded.
> > > */
> > > if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
> > > - read_single_counter(counter, cpu_map_idx, thread, rs)) {
> > > + read_single_counter(counter, cpu_map_idx, thread)) {
> > > counter->counts->scaled = -1;
> > > perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
> > > perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
> > > @@ -340,7 +332,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > > return 0;
> > > }
> > >
> > > -static int read_affinity_counters(struct timespec *rs)
> > > +static int read_affinity_counters(void)
> > > {
> > > struct evlist_cpu_iterator evlist_cpu_itr;
> > > struct affinity saved_affinity, *affinity;
> > > @@ -361,10 +353,8 @@ static int read_affinity_counters(struct timespec *rs)
> > > if (evsel__is_bpf(counter))
> > > continue;
> > >
> > > - if (!counter->err) {
> > > - counter->err = read_counter_cpu(counter, rs,
> > > - evlist_cpu_itr.cpu_map_idx);
> > > - }
> > > + if (!counter->err)
> > > + counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
> > > }
> > > if (affinity)
> > > affinity__cleanup(&saved_affinity);
> > > @@ -388,11 +378,11 @@ static int read_bpf_map_counters(void)
> > > return 0;
> > > }
> > >
> > > -static int read_counters(struct timespec *rs)
> > > +static int read_counters(void)
> > > {
> > > if (!stat_config.stop_read_counter) {
> > > if (read_bpf_map_counters() ||
> > > - read_affinity_counters(rs))
> > > + read_affinity_counters())
> > > return -1;
> > > }
> > > return 0;
> > > @@ -423,7 +413,7 @@ static void process_interval(void)
> > >
> > > evlist__reset_aggr_stats(evsel_list);
> > >
> > > - if (read_counters(&rs) == 0)
> > > + if (read_counters() == 0)
> > > process_counters();
> > >
> > > if (STAT_RECORD) {
> > > @@ -911,7 +901,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > > * avoid arbitrary skew, we must read all counters before closing any
> > > * group leaders.
> > > */
> > > - if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
> > > + if (read_counters() == 0)
> > > process_counters();
> > >
> > > /*
> >
> > I think this part can be a separate commit. You can implement the
> > tool event handling before actually using it. :)
>
> It's possible but it opens a 3rd world where tool code is implemented
> both in evsel and also in builtin-stat. The builtin-stat code uses
> wait4 and the evsel code is using file descriptors. If tests fail
> should the 3rd world be fixed? The intent here is that we're deleting
> all the builtin-stat evsel tool code, a consequence of that is
> deleting unused function arguments. As the changes are purely deleting
> I'd prefer not to have a combined world to temporarily maintain.

I guess the tool events are only used by perf stat, is there any
tests that use the tool events?

Anyway it's not a big deal. I just wanted to keep each change
small. I think it's ok to add the code for tool event to evsel as
long as it's not used. But I don't insist on it strongly.

>
> >
> > > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > > index 3536404e9447..a0a8aee7d6b9 100644
> > > --- a/tools/perf/util/evsel.c
> > > +++ b/tools/perf/util/evsel.c
> > > @@ -10,6 +10,7 @@
> > > #include <errno.h>
> > > #include <inttypes.h>
> > > #include <linux/bitops.h>
> > > +#include <api/io.h>
> > > #include <api/fs/fs.h>
> > > #include <api/fs/tracing_path.h>
> > > #include <linux/hw_breakpoint.h>
> > > @@ -30,6 +31,7 @@
> > > #include "counts.h"
> > > #include "event.h"
> > > #include "evsel.h"
> > > +#include "time-utils.h"
> > > #include "util/env.h"
> > > #include "util/evsel_config.h"
> > > #include "util/evsel_fprintf.h"
> > > @@ -1600,11 +1602,183 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
> > > return evsel__process_group_data(leader, cpu_map_idx, thread, data);
> > > }
> > >
> > > +static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
> > > +{
> > > + char buf[256];
> > > + struct io io;
> > > + int c, i;
> > > +
> > > + io__init(&io, fd, buf, sizeof(buf));
> > > +
> > > + /* Skip lines to relevant CPU. */
> > > + for (i = -1; i < cpu.cpu; i++) {
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != '\n');
> > > + }
> > > + /* Skip to "cpu". */
> > > + c = io__get_char(&io);
> > > + if (c != 'c')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c != 'p')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c != 'u')
> > > + return -EINVAL;
> > > + /* Skip N of cpuN. */
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != ' ');
> > > +
> > > + i = 1;
> > > + while (true) {
> > > + c = io__get_dec(&io, val);
> > > + if (c != ' ')
> > > + break;
> > > + if (field == i)
> > > + return 0;
> > > + i++;
> > > + }
> > > + return -EINVAL;
> > > +}
> >
> > Looks somewhat verbose, can we have something like
> > io__get_str_field() or io__get_num_filed()?
>
> I think macro magic would probably be the best way to cut down on
> verbosity, I'm not sure its a big deal given the majority of the field
> handling happens in the while loops. I'd like to skip this for now.

It's not a big deal. But I just thought this can be generic and
live in the tools/lib. :)

>
> >
> > > +
> > > +static int read_pid_stat_field(int fd, int field, __u64 *val)
> > > +{
> > > + char buf[256];
> > > + struct io io;
> > > + int c, i;
> > > +
> > > + io__init(&io, fd, buf, sizeof(buf));
> > > + c = io__get_dec(&io, val);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + if (field == 1)
> > > + return 0;
> > > +
> > > + /* Skip comm. */
> > > + c = io__get_char(&io);
> > > + if (c != '(')
> > > + return -EINVAL;
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != ')');

Hmm.. but the possible space in the comm would make it
complicated. :(


> > > + if (field == 2)
> > > + return -EINVAL;
> > > +
> > > + /* Skip state */
> > > + c = io__get_char(&io);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + if (field == 3)
> > > + return -EINVAL;
> > > +
> > > + /* Loop over numeric fields*/
> > > + c = io__get_char(&io);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > +
> > > + i = 4;
> > > + while (true) {
> > > + c = io__get_dec(&io, val);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + if (c == -2) {
> > > + /* Assume a -ve was read */
> > > + c = io__get_dec(&io, val);
> > > + *val *= -1;
> > > + }
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + if (field == i)
> > > + return 0;
> > > + i++;
> > > + }
> > > + return -EINVAL;
> > > +}
> > > +
> > > +static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
> > > +{
> > > + __u64 cur_time, delta_start;
> > > + int fd, err = 0;
> > > + struct perf_counts_values *count;
> > > + bool adjust = false;
> > > +
> > > + count = perf_counts(evsel->counts, cpu_map_idx, thread);
> > > +
> > > + switch (evsel->tool_event) {
> > > + case PERF_TOOL_DURATION_TIME:
> > > + /*
> > > + * Pretend duration_time is only on the first CPU and thread, or
> > > + * else aggregation will scale duration_time by the number of
> > > + * CPUs/threads.
> > > + */
> >
> > We could set evsel->pmu->cpus to 0, if there's a tool pmu.
>
> For the CPU it already is:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n314
> There isn't an equivalent for the thread. I was tempted to add an
> assert that cpu_map_idx == 0, but it seemed more forgiving to treat it
> how threads are handled - I'm always concerned about breaking existing
> code given how often I'm successful at doing it :-) . We could also
> try to divide the delta across all CPUs/threads, but that seemed
> weird.

Yeah, I think it's best if we keep a single event (cpu = -1, thread = 0 ?)
Then it needs to set the cpu/thread map regardless of user options.

>
> >
> > > + if (cpu_map_idx == 0 && thread == 0)
> > > + cur_time = rdclock();
> > > + else
> > > + cur_time = evsel->start_time;
> > > + break;
> > > + case PERF_TOOL_USER_TIME:
> > > + case PERF_TOOL_SYSTEM_TIME: {
> > > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > > +
> > > + fd = FD(evsel, cpu_map_idx, thread);
> > > + lseek(fd, SEEK_SET, 0);
> > > + if (evsel->pid_stat) {
> > > + /* The event exists solely on 1 CPU. */
> >
> > Probably the same.
>
> Similar to above. In system wide we can give per-CPU user and system
> time breakdowns, whereas the /prod/pid/stat version doesn't support
> that. The code is trying to make the most out of the information /proc
> will give it. Thinking about it, we shouldn't be pinning the user and
> system time to CPU0 in that case, so add_event_tool:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n302
> should pass "0" in the duration_time case and NULL for the others.
> This should allow say "perf stat -A -I 1000 -e user_time,system_time"
> to give a per second per CPU user and system time value. This reminds
> me that it'd be nice to do similar for hwmon values, another fake PMU
> case.

That makes sense. Actually I'm thinking if we can make the
cpu-clock and task-clock events accept the k and u modifiers.
But it'd require programming the timers on every kernel/user
context switch and everyone would hate it. ;-)

Thanks,
Namhyung

> >
> >
> > > + if (cpu_map_idx == 0)
> > > + err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
> > > + else
> > > + cur_time = 0;
> > > + } else {
> > > + /* The event is for all threads. */
> > > + if (thread == 0) {
> > > + struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
> > > + cpu_map_idx);
> > > +
> > > + err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
> > > + } else {
> > > + cur_time = 0;
> > > + }
> > > + }
> > > + adjust = true;
> > > + break;
> > > + }
> > > + case PERF_TOOL_NONE:
> > > + case PERF_TOOL_MAX:
> > > + default:
> > > + err = -EINVAL;
> > > + }
> > > + if (err)
> > > + return err;
> > > +
> > > + delta_start = cur_time - evsel->start_time;
> > > + if (adjust) {
> > > + __u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
> > > +
> > > + delta_start *= 1000000000 / ticks_per_sec;
> > > + }
> > > + count->val = delta_start;
> > > + count->ena = count->run = delta_start;
> > > + count->lost = 0;
> > > + return 0;
> > > +}
> > > +
> > > int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> > > {
> > > - u64 read_format = evsel->core.attr.read_format;
> > > + if (evsel__is_tool(evsel))
> > > + return evsel__read_tool(evsel, cpu_map_idx, thread);
> > >
> > > - if (read_format & PERF_FORMAT_GROUP)
> > > + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> > > return evsel__read_group(evsel, cpu_map_idx, thread);
> > >
> > > return evsel__read_one(evsel, cpu_map_idx, thread);
> > > @@ -2005,6 +2179,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > int pid = -1, err, old_errno;
> > > enum rlimit_action set_rlimit = NO_CHANGE;
> > >
> > > + if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
> > > + if (evsel->core.attr.sample_period) /* no sampling */
> > > + return -EINVAL;
> > > + evsel->start_time = rdclock();
> > > + return 0;
> > > + }
> > > +
> > > err = __evsel__prepare_open(evsel, cpus, threads);
> > > if (err)
> > > return err;
> > > @@ -2037,6 +2218,44 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > if (!evsel->cgrp && !evsel->core.system_wide)
> > > pid = perf_thread_map__pid(threads, thread);
> > >
> > > + if (evsel->tool_event == PERF_TOOL_USER_TIME ||
> > > + evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
> > > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > > +
> > > + if (evsel->core.attr.sample_period) {
> > > + /* no sampling */
> > > + err = -EINVAL;
> > > + goto out_close;
> > > + }
> > > + if (pid > -1) {
> > > + char buf[64];
> > > +
> > > + snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
> > > + fd = open(buf, O_RDONLY);
> > > + evsel->pid_stat = true;
> > > + } else {
> > > + fd = open("/proc/stat", O_RDONLY);
> > > + }
> > > + FD(evsel, idx, thread) = fd;
> > > + if (fd < 0) {
> > > + err = -errno;
> > > + goto out_close;
> > > + }
> > > + if (pid > -1) {
> > > + err = read_pid_stat_field(fd, system ? 15 : 14,
> > > + &evsel->start_time);
> > > + } else {
> > > + struct perf_cpu cpu;
> > > +
> > > + cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
> > > + err = read_stat_field(fd, cpu, system ? 3 : 1,
> > > + &evsel->start_time);
> > > + }
> > > + if (err)
> > > + goto out_close;
> > > + continue;
> > > + }
> > > +
> > > group_fd = get_group_fd(evsel, idx, thread);
> > >
> > > if (group_fd == -2) {
> > > diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> > > index 517cff431de2..43f6fd1dcb4d 100644
> > > --- a/tools/perf/util/evsel.h
> > > +++ b/tools/perf/util/evsel.h
> > > @@ -170,6 +170,12 @@ struct evsel {
> > >
> > > /* for missing_features */
> > > struct perf_pmu *pmu;
> > > +
> > > + /* For tool events */
> > > + /* Beginning time subtracted when the counter is read. */
> > > + __u64 start_time;
> > > + /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> > > + bool pid_stat;
> > > };
> > >
> > > struct perf_missing_features {
> > > --
> > > 2.44.0.769.g3c40516874-goog
> > >