Re: [PATCH v1 2/3] perf metrics: Compute unmerged uncore metrics individually

From: Namhyung Kim
Date: Mon Feb 05 2024 - 21:03:11 EST


On Thu, Feb 1, 2024 at 6:25 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
>
> When merging counts from multiple uncore PMUs the metric is only
> computed for the metric leader. When merging/aggregation is disabled,
> prior to this patch just the leader's metric would be computed. Fix
> this by computing the metric for each PMU.
>
> On a SkylakeX:
> Before:
> ```
> $ perf stat -A -M memory_bandwidth_total -a sleep 1
>
> Performance counter stats for 'system wide':
>
> CPU0 82,217 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 92 MB/s memory_bandwidth_total
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 00 MB/s memory_bandwidth_total
> CPU0 61,395 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
> CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> CPU0 81,570 UNC_M_CAS_COUNT.RD [uncore_imc_2]
> CPU18 113,886 UNC_M_CAS_COUNT.RD [uncore_imc_2]
> CPU0 62,330 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> CPU18 66,942 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> CPU0 75,489 UNC_M_CAS_COUNT.RD [uncore_imc_3]
> CPU18 27,958 UNC_M_CAS_COUNT.RD [uncore_imc_3]
> CPU0 55,864 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> CPU18 38,727 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
> CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> CPU0 75,423 UNC_M_CAS_COUNT.RD [uncore_imc_5]
> CPU18 104,527 UNC_M_CAS_COUNT.RD [uncore_imc_5]
> CPU0 57,596 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> CPU18 56,777 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> CPU0 1,003,440,851 ns duration_time
>
> 1.003440851 seconds time elapsed
> ```
>
> After:
> ```
> $ perf stat -A -M memory_bandwidth_total -a sleep 1
>
> Performance counter stats for 'system wide':
>
> CPU0 88,968 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 95 MB/s memory_bandwidth_total
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 00 MB/s memory_bandwidth_total
> CPU0 59,498 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 00 MB/s memory_bandwidth_total
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 00 MB/s memory_bandwidth_total
> CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> CPU0 88,635 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 95 MB/s memory_bandwidth_total
> CPU18 117,975 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 115 MB/s memory_bandwidth_total
> CPU0 60,829 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> CPU18 62,105 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> CPU0 82,238 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 87 MB/s memory_bandwidth_total
> CPU18 22,906 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 36 MB/s memory_bandwidth_total
> CPU0 53,959 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> CPU18 32,990 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 00 MB/s memory_bandwidth_total
> CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 00 MB/s memory_bandwidth_total
> CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> CPU0 83,595 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 89 MB/s memory_bandwidth_total
> CPU18 110,151 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 105 MB/s memory_bandwidth_total
> CPU0 56,540 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> CPU18 53,816 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> CPU0 1,003,353,416 ns duration_time
> ```
>
> Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
> ---
> tools/perf/util/metricgroup.c | 2 ++
> tools/perf/util/stat-shadow.c | 31 +++++++++++++++++++++++++++----
> 2 files changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> index ca3e0404f187..c33ffee837ca 100644
> --- a/tools/perf/util/metricgroup.c
> +++ b/tools/perf/util/metricgroup.c
> @@ -44,6 +44,8 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> if (!metric_events)
> return NULL;
>
> + if (evsel->metric_leader)
> + me.evsel = evsel->metric_leader;
> nd = rblist__find(metric_events, &me);
> if (nd)
> return container_of(nd, struct metric_event, nd);
> diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
> index f6c9d2f98835..1be23b0eee2f 100644
> --- a/tools/perf/util/stat-shadow.c
> +++ b/tools/perf/util/stat-shadow.c
> @@ -356,6 +356,7 @@ static void print_nsecs(struct perf_stat_config *config,
> }
>
> static int prepare_metric(const struct metric_expr *mexp,
> + const struct evsel *evsel,
> struct expr_parse_ctx *pctx,
> int aggr_idx)
> {
> @@ -398,8 +399,29 @@ static int prepare_metric(const struct metric_expr *mexp,
> source_count = 1;
> } else {
> struct perf_stat_evsel *ps = mexp->metric_events[i]->stats;
> - struct perf_stat_aggr *aggr = &ps->aggr[aggr_idx];
> + struct perf_stat_aggr *aggr;
>
> + /*
> + * If there are multiple uncore PMUs and we're not
> + * reading the leader's stats, determine the stats for
> + * the appropriate uncore PMU.
> + */
> + if (evsel && evsel->metric_leader &&
> + evsel->pmu != evsel->metric_leader->pmu &&
> + mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {

Is it just to check we're in --no-aggr (or --no-merge)?
Then it'd be simpler to use stat_config->aggr_mode.

Thanks,
Namhyung


> + struct evsel *pos;
> +
> + evlist__for_each_entry(evsel->evlist, pos) {
> + if (pos->pmu != evsel->pmu)
> + continue;
> + if (pos->metric_leader != mexp->metric_events[i])
> + continue;
> + ps = pos->stats;
> + source_count = 1;
> + break;
> + }
> + }
> + aggr = &ps->aggr[aggr_idx];
> if (!aggr)
> break;
>
> @@ -420,7 +442,8 @@ static int prepare_metric(const struct metric_expr *mexp,
> * metric.
> */
> val = aggr->counts.val * (1.0 / mexp->metric_events[i]->scale);
> - source_count = evsel__source_count(mexp->metric_events[i]);
> + if (!source_count)
> + source_count = evsel__source_count(mexp->metric_events[i]);
> }
> }
> n = strdup(evsel__metric_id(mexp->metric_events[i]));
> @@ -461,7 +484,7 @@ static void generic_metric(struct perf_stat_config *config,
> pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
> pctx->sctx.runtime = mexp->runtime;
> pctx->sctx.system_wide = config->system_wide;
> - i = prepare_metric(mexp, pctx, aggr_idx);
> + i = prepare_metric(mexp, evsel, pctx, aggr_idx);
> if (i < 0) {
> expr__ctx_free(pctx);
> return;
> @@ -522,7 +545,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
> if (!pctx)
> return NAN;
>
> - if (prepare_metric(mexp, pctx, aggr_idx) < 0)
> + if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
> goto out;
>
> if (expr__parse(&ratio, pctx, mexp->metric_expr))
> --
> 2.43.0.594.gd9cf4e227d-goog
>