Re: [PATCH v1 2/3] perf metrics: Compute unmerged uncore metrics individually

From: Ian Rogers
Date: Mon Feb 05 2024 - 21:21:26 EST


On Mon, Feb 5, 2024 at 6:02 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
>
> On Thu, Feb 1, 2024 at 6:25 PM Ian Rogers <irogers@xxxxxxxxxx> wrote:
> >
> > When merging counts from multiple uncore PMUs the metric is only
> > computed for the metric leader. When merging/aggregation is disabled,
> > prior to this patch just the leader's metric would be computed. Fix
> > this by computing the metric for each PMU.
> >
> > On a SkylakeX:
> > Before:
> > ```
> > $ perf stat -A -M memory_bandwidth_total -a sleep 1
> >
> > Performance counter stats for 'system wide':
> >
> > CPU0 82,217 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 9.2 MB/s memory_bandwidth_total
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 0.0 MB/s memory_bandwidth_total
> > CPU0 61,395 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> > CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1]
> > CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> > CPU0 81,570 UNC_M_CAS_COUNT.RD [uncore_imc_2]
> > CPU18 113,886 UNC_M_CAS_COUNT.RD [uncore_imc_2]
> > CPU0 62,330 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> > CPU18 66,942 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> > CPU0 75,489 UNC_M_CAS_COUNT.RD [uncore_imc_3]
> > CPU18 27,958 UNC_M_CAS_COUNT.RD [uncore_imc_3]
> > CPU0 55,864 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> > CPU18 38,727 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> > CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4]
> > CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> > CPU0 75,423 UNC_M_CAS_COUNT.RD [uncore_imc_5]
> > CPU18 104,527 UNC_M_CAS_COUNT.RD [uncore_imc_5]
> > CPU0 57,596 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> > CPU18 56,777 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> > CPU0 1,003,440,851 ns duration_time
> >
> > 1.003440851 seconds time elapsed
> > ```
> >
> > After:
> > ```
> > $ perf stat -A -M memory_bandwidth_total -a sleep 1
> >
> > Performance counter stats for 'system wide':
> >
> > CPU0 88,968 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 9.5 MB/s memory_bandwidth_total
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_0] # 0.0 MB/s memory_bandwidth_total
> > CPU0 59,498 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_0]
> > CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 0.0 MB/s memory_bandwidth_total
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_1] # 0.0 MB/s memory_bandwidth_total
> > CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_1]
> > CPU0 88,635 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 9.5 MB/s memory_bandwidth_total
> > CPU18 117,975 UNC_M_CAS_COUNT.RD [uncore_imc_2] # 11.5 MB/s memory_bandwidth_total
> > CPU0 60,829 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> > CPU18 62,105 UNC_M_CAS_COUNT.WR [uncore_imc_2]
> > CPU0 82,238 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 8.7 MB/s memory_bandwidth_total
> > CPU18 22,906 UNC_M_CAS_COUNT.RD [uncore_imc_3] # 3.6 MB/s memory_bandwidth_total
> > CPU0 53,959 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> > CPU18 32,990 UNC_M_CAS_COUNT.WR [uncore_imc_3]
> > CPU0 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 0.0 MB/s memory_bandwidth_total
> > CPU18 0 UNC_M_CAS_COUNT.RD [uncore_imc_4] # 0.0 MB/s memory_bandwidth_total
> > CPU0 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> > CPU18 0 UNC_M_CAS_COUNT.WR [uncore_imc_4]
> > CPU0 83,595 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 8.9 MB/s memory_bandwidth_total
> > CPU18 110,151 UNC_M_CAS_COUNT.RD [uncore_imc_5] # 10.5 MB/s memory_bandwidth_total
> > CPU0 56,540 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> > CPU18 53,816 UNC_M_CAS_COUNT.WR [uncore_imc_5]
> > CPU0 1,003,353,416 ns duration_time
> > ```
> >
> > Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
> > ---
> > tools/perf/util/metricgroup.c | 2 ++
> > tools/perf/util/stat-shadow.c | 31 +++++++++++++++++++++++++++----
> > 2 files changed, 29 insertions(+), 4 deletions(-)
> >
> > diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> > index ca3e0404f187..c33ffee837ca 100644
> > --- a/tools/perf/util/metricgroup.c
> > +++ b/tools/perf/util/metricgroup.c
> > @@ -44,6 +44,8 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> > if (!metric_events)
> > return NULL;
> >
> > + if (evsel->metric_leader)
> > + me.evsel = evsel->metric_leader;
> > nd = rblist__find(metric_events, &me);
> > if (nd)
> > return container_of(nd, struct metric_event, nd);
> > diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
> > index f6c9d2f98835..1be23b0eee2f 100644
> > --- a/tools/perf/util/stat-shadow.c
> > +++ b/tools/perf/util/stat-shadow.c
> > @@ -356,6 +356,7 @@ static void print_nsecs(struct perf_stat_config *config,
> > }
> >
> > static int prepare_metric(const struct metric_expr *mexp,
> > + const struct evsel *evsel,
> > struct expr_parse_ctx *pctx,
> > int aggr_idx)
> > {
> > @@ -398,8 +399,29 @@ static int prepare_metric(const struct metric_expr *mexp,
> > source_count = 1;
> > } else {
> > struct perf_stat_evsel *ps = mexp->metric_events[i]->stats;
> > - struct perf_stat_aggr *aggr = &ps->aggr[aggr_idx];
> > + struct perf_stat_aggr *aggr;
> >
> > + /*
> > + * If there are multiple uncore PMUs and we're not
> > + * reading the leader's stats, determine the stats for
> > + * the appropriate uncore PMU.
> > + */
> > + if (evsel && evsel->metric_leader &&
> > + evsel->pmu != evsel->metric_leader->pmu &&
> > + mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
>
> Is it just to check we're in --no-aggr (or --no-merge)?
> Then it'd be simpler to use stat_config->aggr_mode.

For most metrics the events will be on the same PMU, but there is
nothing stopping mixing events from different PMUs (grouping can be
disabled). There may also be software and tool evsels.

Thanks,
Ian

> Thanks,
> Namhyung
>
>
> > + struct evsel *pos;
> > +
> > + evlist__for_each_entry(evsel->evlist, pos) {
> > + if (pos->pmu != evsel->pmu)
> > + continue;
> > + if (pos->metric_leader != mexp->metric_events[i])
> > + continue;
> > + ps = pos->stats;
> > + source_count = 1;
> > + break;
> > + }
> > + }
> > + aggr = &ps->aggr[aggr_idx];
> > if (!aggr)
> > break;
> >
> > @@ -420,7 +442,8 @@ static int prepare_metric(const struct metric_expr *mexp,
> > * metric.
> > */
> > val = aggr->counts.val * (1.0 / mexp->metric_events[i]->scale);
> > - source_count = evsel__source_count(mexp->metric_events[i]);
> > + if (!source_count)
> > + source_count = evsel__source_count(mexp->metric_events[i]);
> > }
> > }
> > n = strdup(evsel__metric_id(mexp->metric_events[i]));
> > @@ -461,7 +484,7 @@ static void generic_metric(struct perf_stat_config *config,
> > pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
> > pctx->sctx.runtime = mexp->runtime;
> > pctx->sctx.system_wide = config->system_wide;
> > - i = prepare_metric(mexp, pctx, aggr_idx);
> > + i = prepare_metric(mexp, evsel, pctx, aggr_idx);
> > if (i < 0) {
> > expr__ctx_free(pctx);
> > return;
> > @@ -522,7 +545,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
> > if (!pctx)
> > return NAN;
> >
> > - if (prepare_metric(mexp, pctx, aggr_idx) < 0)
> > + if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
> > goto out;
> >
> > if (expr__parse(&ratio, pctx, mexp->metric_expr))
> > --
> > 2.43.0.594.gd9cf4e227d-goog
> >