[PATCH v1 7/9] perf stat: Use affinity for opening events

From: Andi Kleen
Date: Sun Oct 20 2019 - 12:14:14 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

Restructure the event opening in perf stat to cycle through
the events by CPU after setting affinity to that CPU.
This eliminates IPI overhead in the perf API.

We have to loop through the CPUs in the outer builtin-stat
code instead of leaving that to the low-level functions.
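
In outline, the open loop in __run_perf_stat() becomes (a simplified
sketch of the hunks below; weak group handling, error recovery and the
reset_group/errored checks are omitted):

	cpus = evlist__cpu_iter_start(evsel_list);
	for (i = 0; i < cpus->nr; i++) {
		int cpu = cpus->map[i];

		/* pin perf stat itself to the target CPU so the
		 * perf_event_open() below needs no cross-CPU IPI */
		affinity__set(&affinity, cpu);
		evlist__for_each_entry(evsel_list, counter) {
			if (evlist__cpu_iter_skip(counter, cpu))
				continue;
			evlist__cpu_iter_next(counter);
			/* open only this event's index for this CPU */
			create_perf_stat_counter(counter, &stat_config,
						 &target,
						 counter->cpu_index - 1);
		}
	}
	affinity__cleanup(&affinity);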

This requires changing the weak group fallback strategy slightly:
since we cannot easily undo the opens already done for other CPUs,
the weak group retry is moved to a separate loop.
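
The fallback thus becomes a flag-and-retry scheme, roughly (again a
simplified excerpt of the hunks below, with the cpu iteration details
left out):

	/* first pass: don't close anything here, we may hold fds opened
	 * under a different affinity; just break up the group and flag it */
	if ((errno == EINVAL || errno == EBADF) &&
	    counter->leader != counter && counter->weak_group) {
		perf_evlist__reset_weak_group(evsel_list, counter, false);
		second_pass = true;
		continue;
	}

	/* second pass, again pinned to each CPU: close the flagged or
	 * errored counters, then reopen the reset ones ungrouped */
	evlist__for_each_entry(evsel_list, counter) {
		if (counter->reset_group || counter->errored)
			perf_evsel__close_cpu(&counter->core,
					      counter->cpu_index);
	}
	evlist__for_each_entry(evsel_list, counter) {
		if (counter->reset_group && !counter->errored)
			create_perf_stat_counter(counter, &stat_config,
						 &target,
						 counter->cpu_index - 1);
	}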

Before, with a large test case on 94 CPUs:

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 42.75    4.050910          67     60046       110 perf_event_open

After:

 26.86    0.944396          16     58069       110 perf_event_open

(the call counts change slightly because the weak group retries
work differently and the test case relies on weak groups)

Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
tools/perf/builtin-record.c | 2 +-
tools/perf/builtin-stat.c | 194 +++++++++++++++++++++++++--------
tools/perf/tests/event-times.c | 4 +-
tools/perf/util/evlist.c | 6 +-
tools/perf/util/evlist.h | 3 +-
tools/perf/util/evsel.c | 18 ++-
tools/perf/util/evsel.h | 5 +-
tools/perf/util/stat.c | 5 +-
tools/perf/util/stat.h | 3 +-
9 files changed, 180 insertions(+), 60 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1447004eee8a..b6faa43b170e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -773,7 +773,7 @@ static int record__open(struct record *rec)
if ((errno == EINVAL || errno == EBADF) &&
pos->leader != pos &&
pos->weak_group) {
- pos = perf_evlist__reset_weak_group(evlist, pos);
+ pos = perf_evlist__reset_weak_group(evlist, pos, true);
goto try_again;
}
rc = -errno;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7e17bf9f700a..651ee9ddf0a3 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -64,6 +64,7 @@
#include "util/target.h"
#include "util/time-utils.h"
#include "util/top.h"
+#include "util/affinity.h"
#include "asm/bug.h"

#include <linux/time64.h>
@@ -418,6 +419,58 @@ static bool is_target_alive(struct target *_target,
return false;
}

+enum counter_recovery {
+ COUNTER_SKIP,
+ COUNTER_RETRY,
+ COUNTER_FATAL,
+};
+
+static enum counter_recovery stat_handle_error(struct evsel *counter)
+{
+ char msg[BUFSIZ];
+ /*
+ * PPC returns ENXIO for HW counters until 2.6.37
+ * (behavior changed with commit b0a873e).
+ */
+ if (errno == EINVAL || errno == ENOSYS ||
+ errno == ENOENT || errno == EOPNOTSUPP ||
+ errno == ENXIO) {
+ if (verbose > 0)
+ ui__warning("%s event is not supported by the kernel.\n",
+ perf_evsel__name(counter));
+ counter->supported = false;
+ counter->errored = true;
+
+ if ((counter->leader != counter) ||
+ !(counter->leader->core.nr_members > 1))
+ return COUNTER_SKIP;
+ } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
+ if (verbose > 0)
+ ui__warning("%s\n", msg);
+ return COUNTER_RETRY;
+ } else if (target__has_per_thread(&target) &&
+ evsel_list->core.threads &&
+ evsel_list->core.threads->err_thread != -1) {
+ /*
+ * For global --per-thread case, skip current
+ * error thread.
+ */
+ if (!thread_map__remove(evsel_list->core.threads,
+ evsel_list->core.threads->err_thread)) {
+ evsel_list->core.threads->err_thread = -1;
+ return COUNTER_RETRY;
+ }
+ }
+
+ perf_evsel__open_strerror(counter, &target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
+
+ if (child_pid != -1)
+ kill(child_pid, SIGTERM);
+ return COUNTER_FATAL;
+}
+
static int __run_perf_stat(int argc, const char **argv, int run_idx)
{
int interval = stat_config.interval;
@@ -426,11 +479,15 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
char msg[BUFSIZ];
unsigned long long t0, t1;
struct evsel *counter;
+ struct perf_cpu_map *cpus;
struct timespec ts;
size_t l;
int status = 0;
const bool forks = (argc > 0);
bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
+ struct affinity affinity;
+ int i;
+ bool second_pass = false;

if (interval) {
ts.tv_sec = interval / USEC_PER_MSEC;
@@ -455,66 +512,113 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
if (group)
perf_evlist__set_leader(evsel_list);

- evlist__for_each_entry(evsel_list, counter) {
+ if (affinity__setup(&affinity) < 0)
+ return -1;
+
+ cpus = evlist__cpu_iter_start(evsel_list);
+
+ for (i = 0; i < cpus->nr; i++) {
+ int cpu = cpus->map[i];
+
+ affinity__set(&affinity, cpu);
+ evlist__for_each_entry(evsel_list, counter) {
+ if (evlist__cpu_iter_skip(counter, cpu))
+ continue;
+ if (counter->reset_group || counter->errored)
+ continue;
+ evlist__cpu_iter_next(counter);
try_again:
- if (create_perf_stat_counter(counter, &stat_config, &target) < 0) {
-
- /* Weak group failed. Reset the group. */
- if ((errno == EINVAL || errno == EBADF) &&
- counter->leader != counter &&
- counter->weak_group) {
- counter = perf_evlist__reset_weak_group(evsel_list, counter);
- goto try_again;
- }
+ if (create_perf_stat_counter(counter, &stat_config, &target,
+ counter->cpu_index - 1) < 0) {

- /*
- * PPC returns ENXIO for HW counters until 2.6.37
- * (behavior changed with commit b0a873e).
- */
- if (errno == EINVAL || errno == ENOSYS ||
- errno == ENOENT || errno == EOPNOTSUPP ||
- errno == ENXIO) {
- if (verbose > 0)
- ui__warning("%s event is not supported by the kernel.\n",
- perf_evsel__name(counter));
- counter->supported = false;
-
- if ((counter->leader != counter) ||
- !(counter->leader->core.nr_members > 1))
- continue;
- } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
- if (verbose > 0)
- ui__warning("%s\n", msg);
- goto try_again;
- } else if (target__has_per_thread(&target) &&
- evsel_list->core.threads &&
- evsel_list->core.threads->err_thread != -1) {
/*
- * For global --per-thread case, skip current
- * error thread.
+ * Weak group failed. We cannot just undo this here
+ * because earlier CPUs might be in group mode, and the kernel
+ * doesn't support mixing group and non group reads. Defer
+ * it to later.
+ * Don't close here because we're in the wrong affinity.
*/
- if (!thread_map__remove(evsel_list->core.threads,
- evsel_list->core.threads->err_thread)) {
- evsel_list->core.threads->err_thread = -1;
+ if ((errno == EINVAL || errno == EBADF) &&
+ counter->leader != counter &&
+ counter->weak_group) {
+ perf_evlist__reset_weak_group(evsel_list, counter, false);
+ assert(counter->reset_group);
+ second_pass = true;
+ continue;
+ }
+
+ switch (stat_handle_error(counter)) {
+ case COUNTER_FATAL:
+ return -1;
+ case COUNTER_RETRY:
goto try_again;
+ case COUNTER_SKIP:
+ continue;
+ default:
+ break;
}
+
}
+ counter->supported = true;
+ }
+ }

- perf_evsel__open_strerror(counter, &target,
- errno, msg, sizeof(msg));
- ui__error("%s\n", msg);
+ if (second_pass) {
+ /*
+ * Now redo all the weak group after closing them,
+ * and also close errored counters.
+ */

- if (child_pid != -1)
- kill(child_pid, SIGTERM);
+ cpus = evlist__cpu_iter_start(evsel_list);
+ for (i = 0; i < cpus->nr; i++) {
+ int cpu = cpus->map[i];

- return -1;
+ affinity__set(&affinity, cpu);
+ /* First close errored or weak retry */
+ evlist__for_each_entry(evsel_list, counter) {
+ if (!counter->reset_group && !counter->errored)
+ continue;
+ if (evlist__cpu_iter_skip(counter, cpu))
+ continue;
+ perf_evsel__close_cpu(&counter->core, counter->cpu_index);
+ }
+ /* Now reopen */
+ evlist__for_each_entry(evsel_list, counter) {
+ if (!counter->reset_group || counter->errored)
+ continue;
+ if (evlist__cpu_iter_skip(counter, cpu))
+ continue;
+ evlist__cpu_iter_next(counter);
+try_again_reset:
+ pr_debug2("reopening weak %s\n", perf_evsel__name(counter));
+ if (create_perf_stat_counter(counter, &stat_config, &target,
+ counter->cpu_index - 1) < 0) {
+
+ switch (stat_handle_error(counter)) {
+ case COUNTER_FATAL:
+ return -1;
+ case COUNTER_RETRY:
+ goto try_again_reset;
+ case COUNTER_SKIP:
+ continue;
+ default:
+ break;
+ }
+ }
+ counter->supported = true;
+ }
}
- counter->supported = true;
+ }
+ affinity__cleanup(&affinity);

+ evlist__for_each_entry(evsel_list, counter) {
+ if (!counter->supported) {
+ perf_evsel__free_fd(&counter->core);
+ continue;
+ }
l = strlen(counter->unit);
if (l > stat_config.unit_width)
stat_config.unit_width = l;
-
if (perf_evsel__should_store_id(counter) &&
perf_evsel__store_ids(counter, evsel_list))
return -1;
diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c
index d824a726906c..b6d5103e5b30 100644
--- a/tools/perf/tests/event-times.c
+++ b/tools/perf/tests/event-times.c
@@ -126,7 +126,7 @@ static int attach__cpu_disabled(struct evlist *evlist)

evsel->core.attr.disabled = 1;

- err = perf_evsel__open_per_cpu(evsel, cpus);
+ err = perf_evsel__open_per_cpu(evsel, cpus, -1);
if (err) {
if (err == -EACCES)
return TEST_SKIP;
@@ -153,7 +153,7 @@ static int attach__cpu_enabled(struct evlist *evlist)
return -1;
}

- err = perf_evsel__open_per_cpu(evsel, cpus);
+ err = perf_evsel__open_per_cpu(evsel, cpus, -1);
if (err == -EACCES)
return TEST_SKIP;

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index f36f7eb64091..d9da9fe13933 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1822,7 +1822,8 @@ void perf_evlist__force_leader(struct evlist *evlist)
}

struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list,
- struct evsel *evsel)
+ struct evsel *evsel,
+ bool close)
{
struct evsel *c2, *leader;
bool is_open = true;
@@ -1839,10 +1840,11 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list,
if (c2 == evsel)
is_open = false;
if (c2->leader == leader) {
- if (is_open)
+ if (is_open && close)
perf_evsel__close(&c2->core);
c2->leader = c2;
c2->core.nr_members = 0;
+ c2->reset_group = true;
}
}
return leader;
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 3f1268b9347b..5ef14004be62 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -324,5 +324,6 @@ bool perf_evlist__exclude_kernel(struct evlist *evlist);
void perf_evlist__force_leader(struct evlist *evlist);

struct evsel *perf_evlist__reset_weak_group(struct evlist *evlist,
- struct evsel *evsel);
+ struct evsel *evsel,
+ bool close);
#endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8cdf7c3d5f31..394fceb4bf31 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1735,8 +1735,9 @@ static int perf_event_open(struct evsel *evsel,
return fd;
}

-int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
- struct perf_thread_map *threads)
+static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
+ struct perf_thread_map *threads,
+ int start_cpu, int end_cpu)
{
int cpu, thread, nthreads;
unsigned long flags = PERF_FLAG_FD_CLOEXEC;
@@ -1813,7 +1814,7 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,

display_attr(&evsel->core.attr);

- for (cpu = 0; cpu < cpus->nr; cpu++) {
+ for (cpu = start_cpu; cpu < end_cpu; cpu++) {

for (thread = 0; thread < nthreads; thread++) {
int fd, group_fd;
@@ -1991,6 +1992,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
return err;
}

+int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
+ struct perf_thread_map *threads)
+{
+ return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1);
+}
+
void evsel__close(struct evsel *evsel)
{
perf_evsel__close(&evsel->core);
@@ -1998,9 +2005,10 @@ void evsel__close(struct evsel *evsel)
}

int perf_evsel__open_per_cpu(struct evsel *evsel,
- struct perf_cpu_map *cpus)
+ struct perf_cpu_map *cpus,
+ int cpu)
{
- return evsel__open(evsel, cpus, NULL);
+ return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1);
}

int perf_evsel__open_per_thread(struct evsel *evsel,
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 169bb896caf3..28692abe1d42 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -171,6 +171,8 @@ struct evsel {
struct evsel *metric_leader;
bool collect_stat;
bool weak_group;
+ bool reset_group;
+ bool errored;
bool percore;
int cpu_index;
const char *pmu_name;
@@ -307,7 +309,8 @@ int evsel__enable(struct evsel *evsel);
int evsel__disable(struct evsel *evsel);

int perf_evsel__open_per_cpu(struct evsel *evsel,
- struct perf_cpu_map *cpus);
+ struct perf_cpu_map *cpus,
+ int cpu);
int perf_evsel__open_per_thread(struct evsel *evsel,
struct perf_thread_map *threads);
int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 8f1ea27f976f..e1c4c8c3c998 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -445,7 +445,8 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp)

int create_perf_stat_counter(struct evsel *evsel,
struct perf_stat_config *config,
- struct target *target)
+ struct target *target,
+ int cpu)
{
struct perf_event_attr *attr = &evsel->core.attr;
struct evsel *leader = evsel->leader;
@@ -489,7 +490,7 @@ int create_perf_stat_counter(struct evsel *evsel,
}

if (target__has_cpu(target) && !target__has_per_thread(target))
- return perf_evsel__open_per_cpu(evsel, evsel__cpus(evsel));
+ return perf_evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu);

return perf_evsel__open_per_thread(evsel, evsel->core.threads);
}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 14fe3e548229..0554b17b7ee0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -209,7 +209,8 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp);

int create_perf_stat_counter(struct evsel *evsel,
struct perf_stat_config *config,
- struct target *target);
+ struct target *target,
+ int cpu);
int perf_stat_synthesize_config(struct perf_stat_config *config,
struct perf_tool *tool,
struct evlist *evlist,
--
2.21.0