[PATCH v2 2/3] perf metrics: Compute unmerged uncore metrics individually

From: Ian Rogers
Date: Mon Feb 05 2024 - 23:32:26 EST


When merging counts from multiple uncore PMUs the metric is only
computed for the metric leader. When merging/aggregation is disabled,
prior to this patch just the leader's metric would be computed. Fix
this by computing the metric for each PMU.

On a SkylakeX:
Before:
```
$ perf stat -A -M memory_bandwidth_total -a sleep 1

Performance counter stats for 'system wide':

CPU0 82,217 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 9.2 MB/s memory_bandwidth_total
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 0.0 MB/s memory_bandwidth_total
CPU0 61,395 UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU0 81,570 UNC_M_CAS_COUNT.RD [uncore_imc_2]
CPU18 113,886 UNC_M_CAS_COUNT.RD [uncore_imc_2]
CPU0 62,330 UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU18 66,942 UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU0 75,489 UNC_M_CAS_COUNT.RD [uncore_imc_3]
CPU18 27,958 UNC_M_CAS_COUNT.RD [uncore_imc_3]
CPU0 55,864 UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU18 38,727 UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU0 75,423 UNC_M_CAS_COUNT.RD [uncore_imc_5]
CPU18 104,527 UNC_M_CAS_COUNT.RD [uncore_imc_5]
CPU0 57,596 UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU18 56,777 UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU0 1,003,440,851 ns duration_time

1.003440851 seconds time elapsed
```

After:
```
$ perf stat -A -M memory_bandwidth_total -a sleep 1

Performance counter stats for 'system wide':

CPU0 88,968 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 9.5 MB/s memory_bandwidth_total
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 0.0 MB/s memory_bandwidth_total
CPU0 59,498 UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 0.0 MB/s memory_bandwidth_total
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 0.0 MB/s memory_bandwidth_total
CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU0 88,635 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 9.5 MB/s memory_bandwidth_total
CPU18 117,975 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 11.5 MB/s memory_bandwidth_total
CPU0 60,829 UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU18 62,105 UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU0 82,238 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 8.7 MB/s memory_bandwidth_total
CPU18 22,906 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 3.6 MB/s memory_bandwidth_total
CPU0 53,959 UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU18 32,990 UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 0.0 MB/s memory_bandwidth_total
CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 0.0 MB/s memory_bandwidth_total
CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU0 83,595 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 8.9 MB/s memory_bandwidth_total
CPU18 110,151 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 10.5 MB/s memory_bandwidth_total
CPU0 56,540 UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU18 53,816 UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU0 1,003,353,416 ns duration_time
```

Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
---
tools/perf/util/metricgroup.c | 2 ++
tools/perf/util/stat-shadow.c | 31 +++++++++++++++++++++++++++----
2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index ca3e0404f187..c33ffee837ca 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -44,6 +44,8 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
if (!metric_events)
return NULL;

+ if (evsel->metric_leader)
+ me.evsel = evsel->metric_leader;
nd = rblist__find(metric_events, &me);
if (nd)
return container_of(nd, struct metric_event, nd);
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index b3a25659b9e6..c745337f50a4 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -356,6 +356,7 @@ static void print_nsecs(struct perf_stat_config *config,
}

static int prepare_metric(const struct metric_expr *mexp,
+ const struct evsel *evsel,
struct expr_parse_ctx *pctx,
int aggr_idx)
{
@@ -399,8 +400,29 @@ static int prepare_metric(const struct metric_expr *mexp,
source_count = 1;
} else {
struct perf_stat_evsel *ps = metric_events[i]->stats;
- struct perf_stat_aggr *aggr = &ps->aggr[aggr_idx];
+ struct perf_stat_aggr *aggr;

+ /*
+ * If there are multiple uncore PMUs and we're not
+ * reading the leader's stats, determine the stats for
+ * the appropriate uncore PMU.
+ */
+ if (evsel && evsel->metric_leader &&
+ evsel->pmu != evsel->metric_leader->pmu &&
+ mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
+ struct evsel *pos;
+
+ evlist__for_each_entry(evsel->evlist, pos) {
+ if (pos->pmu != evsel->pmu)
+ continue;
+ if (pos->metric_leader != mexp->metric_events[i])
+ continue;
+ ps = pos->stats;
+ source_count = 1;
+ break;
+ }
+ }
+ aggr = &ps->aggr[aggr_idx];
if (!aggr)
break;

@@ -421,7 +443,8 @@ static int prepare_metric(const struct metric_expr *mexp,
* metric.
*/
val = aggr->counts.val * (1.0 / metric_events[i]->scale);
- source_count = evsel__source_count(metric_events[i]);
+ if (!source_count)
+ source_count = evsel__source_count(metric_events[i]);
}
}
n = strdup(evsel__metric_id(metric_events[i]));
@@ -468,7 +491,7 @@ static void generic_metric(struct perf_stat_config *config,
pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
pctx->sctx.runtime = runtime;
pctx->sctx.system_wide = config->system_wide;
- i = prepare_metric(mexp, pctx, aggr_idx);
+ i = prepare_metric(mexp, evsel, pctx, aggr_idx);
if (i < 0) {
expr__ctx_free(pctx);
return;
@@ -529,7 +552,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
if (!pctx)
return NAN;

- if (prepare_metric(mexp, pctx, aggr_idx) < 0)
+ if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
goto out;

if (expr__parse(&ratio, pctx, mexp->metric_expr))
--
2.43.0.594.gd9cf4e227d-goog