[PATCH 23/24] perf stat: Introduce --per-thread option

From: Arnaldo Carvalho de Melo
Date: Fri Jun 26 2015 - 11:49:13 EST


From: Jiri Olsa <jolsa@xxxxxxxxxx>

Currently all the -p option PID arguments tasks values get aggregated
and printed as single values.

Adding --per-tasks option to print values per task.

$ perf stat -e cycles,instructions --per-thread -p 30190,30242
^C
Performance counter stats for process id '30190,30242':

cat-30190 0 cycles
yes-30242 3,842,525,421 cycles
cat-30190 0 instructions
yes-30242 10,370,817,010 instructions

1.143155657 seconds time elapsed

Also works under interval mode:

$ perf stat -e cycles,instructions --per-thread -p 30190,30242 -I 1000
# time comm-pid counts unit events
1.000073435 cat-30190 89,058 cycles
1.000073435 yes-30242 3,360,786,902 cycles (100.00%)
1.000073435 cat-30190 14,066 instructions
1.000073435 yes-30242 9,069,937,462 instructions
2.000204830 cat-30190 0 cycles
2.000204830 yes-30242 3,351,667,626 cycles
2.000204830 cat-30190 0 instructions
2.000204830 yes-30242 9,045,796,885 instructions
^C 2.771286639 cat-30190 0 cycles
2.771286639 yes-30242 2,593,884,166 cycles
2.771286639 cat-30190 0 instructions
2.771286639 yes-30242 7,001,171,191 instructions

It works only with -t and -p options, otherwise following error is
printed:

$ perf stat -e cycles --per-thread -I 1000 ls
The --per-thread option is only available when monitoring via -p -t options.
-p, --pid <pid> stat events on existing process id
-t, --tid <tid> stat events on existing thread id

Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
Tested-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Cc: David Ahern <dsahern@xxxxxxxxx>
Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Link: http://lkml.kernel.org/r/1435310967-14570-23-git-send-email-jolsa@xxxxxxxxxx
Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
---
tools/perf/Documentation/perf-stat.txt | 4 ++
tools/perf/builtin-stat.c | 76 +++++++++++++++++++++++++++++++++-
tools/perf/util/stat.h | 1 +
3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 04e150d83e7d..47469abdcc1c 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -144,6 +144,10 @@ is a useful mode to detect imbalance between physical cores. To enable this mod
use --per-core in addition to -a. (system-wide). The output includes the
core number and the number of online logical processors on that physical processor.

+--per-thread::
+Aggregate counts per monitored threads, when monitoring threads (-t option)
+or processes (-p option).
+
-D msecs::
--delay msecs::
After starting the program, wait msecs before measuring. This is useful to
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 56dc8881cb05..37e301a32f43 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -231,6 +231,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread,
count = &zero;

switch (aggr_mode) {
+ case AGGR_THREAD:
case AGGR_CORE:
case AGGR_SOCKET:
case AGGR_NONE:
@@ -602,6 +603,14 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
csv_output ? 0 : -4,
perf_evsel__cpus(evsel)->map[id], csv_sep);
break;
+ case AGGR_THREAD:
+ fprintf(output, "%*s-%*d%s",
+ csv_output ? 0 : 16,
+ thread_map__comm(evsel->threads, id),
+ csv_output ? 0 : -8,
+ thread_map__pid(evsel->threads, id),
+ csv_sep);
+ break;
case AGGR_GLOBAL:
default:
break;
@@ -750,6 +759,40 @@ static void print_aggr(char *prefix)
}
}

+static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
+{
+ int nthreads = thread_map__nr(counter->threads);
+ int ncpus = cpu_map__nr(counter->cpus);
+ int cpu, thread;
+ double uval;
+
+ for (thread = 0; thread < nthreads; thread++) {
+ u64 ena = 0, run = 0, val = 0;
+
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ val += perf_counts(counter->counts, cpu, thread)->val;
+ ena += perf_counts(counter->counts, cpu, thread)->ena;
+ run += perf_counts(counter->counts, cpu, thread)->run;
+ }
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ uval = val * counter->scale;
+
+ if (nsec_counter(counter))
+ nsec_printout(thread, 0, counter, uval);
+ else
+ abs_printout(thread, 0, counter, uval);
+
+ if (!csv_output)
+ print_noise(counter, 1.0);
+
+ print_running(run, ena);
+ fputc('\n', output);
+ }
+}
+
/*
* Print out the results of a single counter:
* aggregated counts in system-wide mode
@@ -876,6 +919,9 @@ static void print_interval(char *prefix, struct timespec *ts)
case AGGR_NONE:
fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit");
break;
+ case AGGR_THREAD:
+ fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit");
+ break;
case AGGR_GLOBAL:
default:
fprintf(output, "# time counts %*s events\n", unit_width, "unit");
@@ -944,6 +990,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
case AGGR_SOCKET:
print_aggr(prefix);
break;
+ case AGGR_THREAD:
+ evlist__for_each(evsel_list, counter)
+ print_aggr_thread(counter, prefix);
+ break;
case AGGR_GLOBAL:
evlist__for_each(evsel_list, counter)
print_counter_aggr(counter, prefix);
@@ -1031,6 +1081,7 @@ static int perf_stat_init_aggr_mode(void)
break;
case AGGR_NONE:
case AGGR_GLOBAL:
+ case AGGR_THREAD:
default:
break;
}
@@ -1255,6 +1306,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
"aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-core", &aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE),
+ OPT_SET_UINT(0, "per-thread", &aggr_mode,
+ "aggregate counts per thread", AGGR_THREAD),
OPT_UINTEGER('D', "delay", &initial_delay,
"ms to wait before starting measurement after program start"),
OPT_END()
@@ -1346,8 +1399,19 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
run_count = 1;
}

- /* no_aggr, cgroup are for system-wide only */
- if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) &&
+ if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
+ fprintf(stderr, "The --per-thread option is only available "
+ "when monitoring via -p -t options.\n");
+ parse_options_usage(NULL, options, "p", 1);
+ parse_options_usage(NULL, options, "t", 1);
+ goto out;
+ }
+
+ /*
+ * no_aggr, cgroup are for system-wide only
+ * --per-thread is aggregated per thread, we dont mix it with cpu mode
+ */
+ if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) &&
!target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n");
@@ -1375,6 +1439,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
}
goto out;
}
+
+ /*
+ * Initialize thread_map with comm names,
+ * so we could print it out on output.
+ */
+ if (aggr_mode == AGGR_THREAD)
+ thread_map__read_comms(evsel_list->threads);
+
if (interval && interval < 100) {
pr_err("print interval must be >= 100ms\n");
parse_options_usage(stat_usage, options, "I", 1);
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 9f05c571befe..1cfbe0a980ac 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -30,6 +30,7 @@ enum aggr_mode {
AGGR_GLOBAL,
AGGR_SOCKET,
AGGR_CORE,
+ AGGR_THREAD,
};

struct perf_counts_values {
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/