Re: [RFC] perf: Add "-f" and "-F" flags to watch a "/sys" style file

From: Arnaldo Carvalho de Melo
Date: Mon May 15 2017 - 20:46:26 EST


Em Mon, May 15, 2017 at 12:27:03PM -0700, Luck, Tony escreveu:
> From: Tony Luck <tony.luck@xxxxxxxxx>
>
> Thomas Gleixner is encouraging us to extend the /sys/fs/resctrl file system
> to include monitoring data (LLC occupancy, memory bandwidth) from the
> (weird) counters that come as part of "Resource Director Technology".
> See Intel Software Developer Manual volume 3, section 17.17.1.
>
> Our current plan will provide readout files for each counter type in
> each of the existing "control" directories, also in the new "monitoring"
> directories that we plan to add.
>
> With this change, people are asking how this will be integrated with
> "perf" ... this patch represents a concept for how we might do this.
>
> Basically we teach perf how to read /sys style files (single number)
> that are dynamically updated. I just hacked in two new arguments
> that allow the user to name a file to be read and included in the
> output along with any normal events that they requested with "-e".
>
> There are two options because some files report a "snapshot" value
> that should be reported as-is, while others report an ever-increasing
> value, where the user most likely cares how much it changed from one
> timepoint to the next.
>
> cache occupancy fits the first category, while memory bandwidth (which
> actually reports total bytes since boot) fits the second.

I haven't been following the resctrl fs discussion closely enough to
understand why those values couldn't be read via
sys_perf_event_open(), so I can't comment on that, but the implementation
in the tools/ classes looks nice, i.e. -e will get perf_evsel instances
that interface with the kernel via sys_perf_event_open(), -f/-F will get
perf_evsel instances that read values via the resctrl file system, clean.

- Arnaldo

P.S. I may be slow to reply because I'm on vacation till the end of this
month.

> This may be useful in other contexts. There are many /sys files
> on my desktop that might also be interesting to monitor in this way.
> E.g.
> /sys/devices/pci0000:00/0000:00:1c.4/0000:03:00.0/net/enp3s0/statistics/tx_packets
> /sys/devices/pci0000:00/0000:00:1c.4/0000:03:00.0/net/enp3s0/statistics/tx_bytes
> /sys/fs/ext4/sda2/delayed_allocation_blocks
> /sys/fs/ext4/sda2/session_write_kbytes
> /sys/fs/ext4/sda2/lifetime_write_kbytes
> /sys/fs/ext4/sdb/delayed_allocation_blocks
> /sys/fs/ext4/sdb/session_write_kbytes
> /sys/fs/ext4/sdb/lifetime_write_kbytes
> /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp2_input
> /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp5_input
> /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input
> /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp4_input
>
> Thoughts? [Both on the concept, and on my hacky implementation]
>
> Cc: Vikas Shivappa <vikas.shivappa@xxxxxxxxx>
> Cc: Stephane Eranian <eranian@xxxxxxxxxx>
> Cc: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
> Cc: linux-kernel@xxxxxxxxxxxxxxx
>
> ---
> tools/perf/builtin-c2c.c | 2 ++
> tools/perf/builtin-mem.c | 2 ++
> tools/perf/builtin-record.c | 4 ++++
> tools/perf/builtin-stat.c | 4 ++++
> tools/perf/builtin-top.c | 4 ++++
> tools/perf/builtin-trace.c | 4 ++++
> tools/perf/util/evsel.c | 34 ++++++++++++++++++++++++++++------
> tools/perf/util/evsel.h | 1 +
> tools/perf/util/parse-events.c | 17 +++++++++++++++++
> tools/perf/util/parse-events.h | 1 +
> 10 files changed, 67 insertions(+), 6 deletions(-)
>
> diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
> index e33b4acece90..e0a0a668fa5b 100644
> --- a/tools/perf/builtin-c2c.c
> +++ b/tools/perf/builtin-c2c.c
> @@ -2691,6 +2691,8 @@ static int perf_c2c__record(int argc, const char **argv)
> OPT_CALLBACK('e', "event", &event_set, "event",
> "event selector. Use 'perf mem record -e list' to list available events",
> parse_record_events),
> + OPT_CALLBACK('f', "file", &event_set, "file", "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &event_set, "file", "delta file selector", parse_files_option),
> OPT_BOOLEAN('u', "all-user", &all_user, "collect only user level data"),
> OPT_BOOLEAN('k', "all-kernel", &all_kernel, "collect only kernel level data"),
> OPT_UINTEGER('l', "ldlat", &perf_mem_events__loads_ldlat, "setup mem-loads latency"),
> diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
> index e001c0290793..86a332dd30cc 100644
> --- a/tools/perf/builtin-mem.c
> +++ b/tools/perf/builtin-mem.c
> @@ -72,6 +72,8 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
> OPT_CALLBACK('e', "event", &mem, "event",
> "event selector. use 'perf mem record -e list' to list available events",
> parse_record_events),
> + OPT_CALLBACK('f', "file", &mem, "file", "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &mem, "file", "delta file selector", parse_files_option),
> OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"),
> OPT_INCR('v', "verbose", &verbose,
> "be more verbose (show counter open errors, etc)"),
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index ee7d0a82ccd0..024539e29437 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -1551,6 +1551,10 @@ static struct option __record_options[] = {
> OPT_CALLBACK('e', "event", &record.evlist, "event",
> "event selector. use 'perf list' to list available events",
> parse_events_option),
> + OPT_CALLBACK('f', "file", &record.evlist, "file",
> + "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &record.evlist, "file",
> + "delta file selector", parse_files_option),
> OPT_CALLBACK(0, "filter", &record.evlist, "filter",
> "event filter", parse_filter),
> OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index a935b5023732..8bd10f7e0026 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -1722,6 +1722,10 @@ static const struct option stat_options[] = {
> OPT_CALLBACK('e', "event", &evsel_list, "event",
> "event selector. use 'perf list' to list available events",
> parse_events_option),
> + OPT_CALLBACK('f', "file", &evsel_list, "file",
> + "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &evsel_list, "file",
> + "delta file selector", parse_files_option),
> OPT_CALLBACK(0, "filter", &evsel_list, "filter",
> "event filter", parse_filter),
> OPT_BOOLEAN('i', "no-inherit", &no_inherit,
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 7ab42b8311a1..beafd78217df 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -1104,6 +1104,10 @@ int cmd_top(int argc, const char **argv)
> OPT_CALLBACK('e', "event", &top.evlist, "event",
> "event selector. use 'perf list' to list available events",
> parse_events_option),
> + OPT_CALLBACK('f', "file", &top.evlist, "file",
> + "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &top.evlist, "file",
> + "delta file selector", parse_files_option),
> OPT_U64('c', "count", &opts->user_interval, "event period to sample"),
> OPT_STRING('p', "pid", &target->pid, "pid",
> "profile events on existing process id"),
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index eaa66fb57347..9116d5d46056 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -2857,6 +2857,10 @@ int cmd_trace(int argc, const char **argv)
> OPT_CALLBACK('e', "event", &trace, "event",
> "event/syscall selector. use 'perf list' to list available events",
> trace__parse_events_option),
> + OPT_CALLBACK('f', "file", &evsel_list, "file",
> + "file selector", parse_files_option),
> + OPT_CALLBACK('F', "file", &evsel_list, "file",
> + "delta file selector", parse_files_option),
> OPT_BOOLEAN(0, "comm", &trace.show_comm,
> "show the thread COMM next to its id"),
> OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 0e879097adfb..381eb0c085b6 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1236,6 +1236,21 @@ void perf_counts_values__scale(struct perf_counts_values *count,
> *pscaled = scaled;
> }
>
> +static int read_sys_file(int fd, struct perf_counts_values *count)
> +{
> + char buf[100];
> + int n;
> + static u64 fake;
> +
> + n = pread(fd, buf, sizeof buf, 0);
> + if (n >= 0) {
> + count->val = n ? strtol(buf, NULL, 0) : 0;
> + count->ena = count->run = ++fake;
> + return 0;
> + } else
> + return -errno;
> +}
> +
> int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
> struct perf_counts_values *count)
> {
> @@ -1244,6 +1259,8 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
> if (FD(evsel, cpu, thread) < 0)
> return -EINVAL;
>
> + if (evsel->sysfile)
> + return read_sys_file(FD(evsel, cpu, thread), count);
> if (readn(FD(evsel, cpu, thread), count, sizeof(*count)) <= 0)
> return -errno;
>
> @@ -1539,18 +1556,23 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
> for (cpu = 0; cpu < cpus->nr; cpu++) {
>
> for (thread = 0; thread < nthreads; thread++) {
> - int fd, group_fd;
> + int fd;
>
> if (!evsel->cgrp && !evsel->system_wide)
> pid = thread_map__pid(threads, thread);
>
> - group_fd = get_group_fd(evsel, cpu, thread);
> + if (evsel->sysfile) {
> + fd = open(evsel->name, O_RDONLY, 0);
> + } else {
> + int group_fd;
> retry_open:
> - pr_debug2("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx",
> - pid, cpus->map[cpu], group_fd, flags);
> + group_fd = get_group_fd(evsel, cpu, thread);
> + pr_debug2("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx",
> + pid, cpus->map[cpu], group_fd, flags);
>
> - fd = sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu],
> - group_fd, flags);
> + fd = sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu],
> + group_fd, flags);
> + }
>
> FD(evsel, cpu, thread) = fd;
>
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index d101695c482c..ede30e111947 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -121,6 +121,7 @@ struct perf_evsel {
> bool per_pkg;
> bool precise_max;
> bool ignore_missing_thread;
> + bool sysfile;
> /* parse modifier helper */
> int exclude_GH;
> int nr_members;
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 01e779b91c8e..0d725eda476d 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -1793,6 +1793,23 @@ int parse_events_option(const struct option *opt, const char *str,
> return ret;
> }
>
> +int parse_files_option(const struct option *opt, const char *str,
> + int unset __maybe_unused)
> +{
> + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
> + struct perf_evsel *evsel = calloc(1, sizeof (*evsel));
> +
> + evsel->name = strdup(str);
> + evsel->unit = "";
> + evsel->sysfile = true;
> + evsel->snapshot = (opt->short_name == 'f');
> + evsel->scale = 1.0;
> + INIT_LIST_HEAD(&evsel->config_terms);
> + evsel->bpf_fd = -1;
> + perf_evlist__add(evlist, evsel);
> + return 0;
> +}
> +
> static int
> foreach_evsel_in_last_glob(struct perf_evlist *evlist,
> int (*func)(struct perf_evsel *evsel,
> diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> index a235f4d6d5e5..7e14a4c66fad 100644
> --- a/tools/perf/util/parse-events.h
> +++ b/tools/perf/util/parse-events.h
> @@ -30,6 +30,7 @@ bool have_tracepoints(struct list_head *evlist);
> const char *event_type(int type);
>
> int parse_events_option(const struct option *opt, const char *str, int unset);
> +int parse_files_option(const struct option *opt, const char *str, int unset);
> int parse_events(struct perf_evlist *evlist, const char *str,
> struct parse_events_error *error);
> int parse_events_terms(struct list_head *terms, const char *str);
> --
> 2.11.0