Re: [PATCH v2] perf stat: Show percore counts in per CPU output

From: Jiri Olsa
Date: Tue Feb 11 2020 - 18:30:12 EST


On Tue, Feb 11, 2020 at 10:34:34AM +0800, Jin Yao wrote:
> We have supported the event modifier "percore" which sums up the
> event counts for all hardware threads in a core and show the counts
> per core.
>
> For example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
>
> Performance counter stats for 'system wide':
>
> S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
> S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
> S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
> S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
>
> This patch provides a new option "--percore-show-thread". It is
> used with event modifier "percore" together to sum up the event counts
> for all hardware threads in a core but show the counts per hardware
> thread.
>
> This is essentially a replacement for the any bit (which is gone in
> Icelake). Per core counts are useful for some formulas, e.g. CoreIPC.
> The original percore version was inconvenient to post process. This
> variant matches the output of the any bit.
>
> With this patch, for example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
>
> Performance counter stats for 'system wide':
>
> CPU0 2,453,061 cpu/event=cpu-cycles,percore/
> CPU1 1,823,921 cpu/event=cpu-cycles,percore/
> CPU2 1,383,166 cpu/event=cpu-cycles,percore/
> CPU3 1,102,652 cpu/event=cpu-cycles,percore/
> CPU4 2,453,061 cpu/event=cpu-cycles,percore/
> CPU5 1,823,921 cpu/event=cpu-cycles,percore/
> CPU6 1,383,166 cpu/event=cpu-cycles,percore/
> CPU7 1,102,652 cpu/event=cpu-cycles,percore/
>
> We can see counts are duplicated in CPU pairs
> (CPU0/CPU4, CPU1/CPU5, CPU2/CPU6, CPU3/CPU7).
>
> v2:
> ---
> Add the explanation in change log. This is essentially a replacement
> for the any bit. No code change.

-I output is still wrong:

$ sudo ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
# time CPU counts unit events
1.000251427 1.000251427 CPU0 43,474,700 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU1 66,495,270 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU2 42,342,367 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU3 43,247,607 cpu/event=cpu-cycles,percore/

plus some comments below,
jirka


SNIP

> @@ -628,7 +628,7 @@ static void aggr_cb(struct perf_stat_config *config,
> static void print_counter_aggrdata(struct perf_stat_config *config,
> struct evsel *counter, int s,
> char *prefix, bool metric_only,
> - bool *first)
> + bool *first, int cpu)
> {
> struct aggr_data ad;
> FILE *output = config->output;
> @@ -654,8 +654,15 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
> fprintf(output, "%s", prefix);
>
> uval = val * counter->scale;
> - printout(config, id, nr, counter, uval, prefix,
> - run, ena, 1.0, &rt_stat);
> +
> + if (cpu == -1) {
> + printout(config, id, nr, counter, uval, prefix,
> + run, ena, 1.0, &rt_stat);
> + } else {
> + printout(config, cpu, nr, counter, uval, prefix,
> + run, ena, 1.0, &rt_stat);
> + }

this would be shorter instad of above:

printout(config, cpu != -1 ?: id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat);

> +
> if (!metric_only)
> fputc('\n', output);
> }
> @@ -687,7 +694,7 @@ static void print_aggr(struct perf_stat_config *config,
> evlist__for_each_entry(evlist, counter) {
> print_counter_aggrdata(config, counter, s,
> prefix, metric_only,
> - &first);
> + &first, -1);
> }
> if (metric_only)
> fputc('\n', output);
> @@ -1163,13 +1170,38 @@ static void print_percore(struct perf_stat_config *config,
>
> print_counter_aggrdata(config, counter, s,
> prefix, metric_only,
> - &first);
> + &first, -1);
> }
>
> if (metric_only)
> fputc('\n', output);
> }
>
> +static void print_percore_thread(struct perf_stat_config *config,
> + struct evsel *counter, char *prefix)
> +{
> + int cpu, s, s2, id;
> + bool first = true;
> + FILE *output = config->output;
> +

missing check for
if (!(config->aggr_map || config->aggr_get_id))


> + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
> + s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);

should you use real cpu valu in here instead of an index? like value of:

perf_cpu_map__cpu(,evsel__cpus(counter), cpu)

instead of 'cpu' above

> +
> + for (s = 0; s < config->aggr_map->nr; s++) {
> + id = config->aggr_map->map[s];
> + if (s2 == id)
> + break;
> + }
> +
> + if (prefix)
> + fprintf(output, "%s", prefix);
> +
> + print_counter_aggrdata(config, counter, s,
> + prefix, false,
> + &first, cpu);
> + }
> +}
> +
> void
> perf_evlist__print_counters(struct evlist *evlist,
> struct perf_stat_config *config,
> @@ -1222,9 +1254,16 @@ perf_evlist__print_counters(struct evlist *evlist,
> print_no_aggr_metric(config, evlist, prefix);
> else {
> evlist__for_each_entry(evlist, counter) {
> - if (counter->percore)
> - print_percore(config, counter, prefix);
> - else
> + if (counter->percore) {
> + if (config->percore_show_thread) {
> + print_percore_thread(config,
> + counter,
> + prefix);
> + } else {
> + print_percore(config, counter,
> + prefix);

please keep the print_percore call in here and check/call
for the percore_show_thread in it

> + }
> + } else
> print_counter(config, counter, prefix);
> }
> }
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index fb990efa54a8..b4fdfaa7f2c0 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -109,6 +109,7 @@ struct perf_stat_config {
> bool walltime_run_table;
> bool all_kernel;
> bool all_user;
> + bool percore_show_thread;
> FILE *output;
> unsigned int interval;
> unsigned int timeout;
> --
> 2.17.1
>