[PATCH 46/64] perf stat: Fix metrics calculation with event qualifiers

From: Arnaldo Carvalho de Melo
Date: Tue Apr 28 2015 - 09:34:59 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

Currently in perf IPC and other metrics cannot be directly shown
separately for both user and kernel in a single run. The problem was
that the metrics matching code did not check event qualifiers.

With this patch the following case works correctly.

% perf stat -e cycles:k,cycles:u,instructions:k,instructions:u true

Performance counter stats for 'true':

531,718 cycles:k
203,895 cycles:u
338,151 instructions:k # 0.64 insns per cycle
105,961 instructions:u # 0.52 insns per cycle

0.002989739 seconds time elapsed

Previously it would misreport the ratios because they were matching the
wrong value.

The patch is fairly big, but quite mechanic as it just adds context
indexes everywhere.

Reported-by: William Cohen <wcohen@xxxxxxxxxx>
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Acked-by: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Andi Kleen <andi@xxxxxxxxxxxxxx>
Cc: David Ahern <dsahern@xxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: William Cohen <wcohen@xxxxxxxxxx>
Link: http://lkml.kernel.org/r/1428441919-23099-3-git-send-email-jolsa@xxxxxxxxxx
Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx>
Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
---
tools/perf/builtin-stat.c | 129 +++++++++++++++++++++++++++-------------------
1 file changed, 77 insertions(+), 52 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3dbd8c59efc5..52f433084779 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -247,21 +247,35 @@ out_free:
return -1;
}

+#define NUM_CTX 3
+
+enum { CTX_USER, CTX_KERNEL, CTX_ALL };
+
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
-static struct stats runtime_branches_stats[MAX_NR_CPUS];
-static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
-static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;
-static struct stats runtime_transaction_stats[MAX_NR_CPUS];
-static struct stats runtime_elision_stats[MAX_NR_CPUS];
+static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+
+static int evsel_context(struct perf_evsel *evsel)
+{
+ if (evsel->attr.exclude_kernel)
+ return CTX_USER;
+ if (evsel->attr.exclude_user)
+ return CTX_KERNEL;
+ /* Handle hypervisor too? */
+ return CTX_ALL;
+}

static void perf_stat__reset_stats(struct perf_evlist *evlist)
{
@@ -356,37 +370,39 @@ static struct perf_evsel *nth_evsel(int n)
static void update_shadow_stats(struct perf_evsel *counter, u64 *count,
int cpu)
{
+ int ctx = evsel_context(counter);
+
if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
update_stats(&runtime_nsecs_stats[cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
- update_stats(&runtime_cycles_stats[cpu], count[0]);
+ update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
else if (transaction_run &&
perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
- update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]);
+ update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
else if (transaction_run &&
perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
- update_stats(&runtime_transaction_stats[cpu], count[0]);
+ update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
else if (transaction_run &&
perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
- update_stats(&runtime_elision_stats[cpu], count[0]);
+ update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
- update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]);
+ update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
- update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]);
+ update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
- update_stats(&runtime_branches_stats[cpu], count[0]);
+ update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
- update_stats(&runtime_cacherefs_stats[cpu], count[0]);
+ update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
- update_stats(&runtime_l1_dcache_stats[cpu], count[0]);
+ update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
- update_stats(&runtime_l1_icache_stats[cpu], count[0]);
+ update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
- update_stats(&runtime_ll_cache_stats[cpu], count[0]);
+ update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
- update_stats(&runtime_dtlb_cache_stats[cpu], count[0]);
+ update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
- update_stats(&runtime_itlb_cache_stats[cpu], count[0]);
+ update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
}

static void zero_per_pkg(struct perf_evsel *counter)
@@ -908,8 +924,9 @@ static void print_stalled_cycles_frontend(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_cycles_stats[cpu]);
+ total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -927,8 +944,9 @@ static void print_stalled_cycles_backend(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_cycles_stats[cpu]);
+ total = avg_stats(&runtime_cycles_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -946,8 +964,9 @@ static void print_branch_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_branches_stats[cpu]);
+ total = avg_stats(&runtime_branches_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -965,8 +984,9 @@ static void print_l1_dcache_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_l1_dcache_stats[cpu]);
+ total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -984,8 +1004,9 @@ static void print_l1_icache_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_l1_icache_stats[cpu]);
+ total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -1003,8 +1024,9 @@ static void print_dtlb_cache_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
+ total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -1022,8 +1044,9 @@ static void print_itlb_cache_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_itlb_cache_stats[cpu]);
+ total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -1041,8 +1064,9 @@ static void print_ll_cache_misses(int cpu,
{
double total, ratio = 0.0;
const char *color;
+ int ctx = evsel_context(evsel);

- total = avg_stats(&runtime_ll_cache_stats[cpu]);
+ total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);

if (total)
ratio = avg / total * 100.0;
@@ -1060,6 +1084,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
double sc = evsel->scale;
const char *fmt;
int cpu = cpu_map__id_to_cpu(id);
+ int ctx = evsel_context(evsel);

if (csv_output) {
fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s";
@@ -1091,15 +1116,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
return;

if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
- total = avg_stats(&runtime_cycles_stats[cpu]);
+ total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
if (total) {
ratio = avg / total;
fprintf(output, " # %5.2f insns per cycle ", ratio);
} else {
fprintf(output, " ");
}
- total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
- total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
+ total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
+ total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));

if (total && avg) {
ratio = total / avg;
@@ -1110,46 +1135,46 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
}

} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
- runtime_branches_stats[cpu].n != 0) {
+ runtime_branches_stats[ctx][cpu].n != 0) {
print_branch_misses(cpu, evsel, avg);
} else if (
evsel->attr.type == PERF_TYPE_HW_CACHE &&
evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
- runtime_l1_dcache_stats[cpu].n != 0) {
+ runtime_l1_dcache_stats[ctx][cpu].n != 0) {
print_l1_dcache_misses(cpu, evsel, avg);
} else if (
evsel->attr.type == PERF_TYPE_HW_CACHE &&
evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
- runtime_l1_icache_stats[cpu].n != 0) {
+ runtime_l1_icache_stats[ctx][cpu].n != 0) {
print_l1_icache_misses(cpu, evsel, avg);
} else if (
evsel->attr.type == PERF_TYPE_HW_CACHE &&
evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
- runtime_dtlb_cache_stats[cpu].n != 0) {
+ runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
print_dtlb_cache_misses(cpu, evsel, avg);
} else if (
evsel->attr.type == PERF_TYPE_HW_CACHE &&
evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
- runtime_itlb_cache_stats[cpu].n != 0) {
+ runtime_itlb_cache_stats[ctx][cpu].n != 0) {
print_itlb_cache_misses(cpu, evsel, avg);
} else if (
evsel->attr.type == PERF_TYPE_HW_CACHE &&
evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
- runtime_ll_cache_stats[cpu].n != 0) {
+ runtime_ll_cache_stats[ctx][cpu].n != 0) {
print_ll_cache_misses(cpu, evsel, avg);
} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
- runtime_cacherefs_stats[cpu].n != 0) {
- total = avg_stats(&runtime_cacherefs_stats[cpu]);
+ runtime_cacherefs_stats[ctx][cpu].n != 0) {
+ total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);

if (total)
ratio = avg * 100 / total;
@@ -1171,15 +1196,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
}
} else if (transaction_run &&
perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
- total = avg_stats(&runtime_cycles_stats[cpu]);
+ total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
if (total)
fprintf(output,
" # %5.2f%% transactional cycles ",
100.0 * (avg / total));
} else if (transaction_run &&
perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
- total = avg_stats(&runtime_cycles_stats[cpu]);
- total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+ total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
if (total2 < avg)
total2 = avg;
if (total)
@@ -1189,8 +1214,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
} else if (transaction_run &&
perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
avg > 0 &&
- runtime_cycles_in_tx_stats[cpu].n != 0) {
- total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

if (total)
ratio = total / avg;
@@ -1199,8 +1224,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
} else if (transaction_run &&
perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
avg > 0 &&
- runtime_cycles_in_tx_stats[cpu].n != 0) {
- total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);

if (total)
ratio = total / avg;
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/