[RFC][PATCH v1] perf stat: FD based pause/resume for counting mode
From: Alexey Budankov
Date: Wed Sep 27 2017 - 11:25:39 EST
Hi,
Here is FD based concept for perf stat pause/resume functionality.
The patch implements asynchronous pause/resume commands for configured counters
thru file descriptors passed over new cmd-fd, cmd-ack-fd options.
Commands are sent over the file descriptor passed via the cmd-fd option. The file
descriptor passed via the cmd-ack-fd option is used to confirm command completion.
Handling of signals and the timer for processing intervals is also
implemented thru file descriptors.
Thus all sources of async events are switched to file descriptors and
multiplexed at the single place using poll system call.
initial_delay option is changed to signed int and negative values (-1) mean to
start paused so that switching state would be controlled thru cmd-fd, cmd-ack-fd
file descriptors.
I tested this both in launch mode, where the perf tool starts the workload itself,
and in attach mode via the -p option. See attached logs and straces for more details.
Attached pause_test.cpp demonstrates one of the usage models for
this functionality.
Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
tools/perf/builtin-stat.c | 434 +++++++++++++++++++++++++++++++++-------------
tools/perf/util/evlist.c | 8 +-
2 files changed, 321 insertions(+), 121 deletions(-)
Sources of async events: enum perf_async_event
Pause/resume protocol (perf_async_cmd_done is sent back on the completion):
enum perf_async_cmd
Global state specific to this change: struct perf_async
Initialization: int perf_async_init(const int interval)
Finalization: int perf_async_fini(void)
poll() loop calling handlers below:
int perf_async_handle_events(void *param)
For signals:
int perf_async_signal_handler(union perf_async_data *data, void *param)
For timer intervals:
int perf_async_timer_handler(union perf_async_data *data, void *param)
For pause/resume commands:
int perf_async_cmd_handler(union perf_async_data *data, void *param)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 866da7a..a03e30e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -80,6 +80,9 @@
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
+#include <poll.h>
+#include <sys/signalfd.h>
+#include <sys/timerfd.h>
#include "sane_ctype.h"
@@ -156,7 +159,7 @@ static bool group = false;
static const char *pre_cmd = NULL;
static const char *post_cmd = NULL;
static bool sync_run = false;
-static unsigned int initial_delay = 0;
+static int initial_delay = 0;
static unsigned int unit_width = 4; /* strlen("unit") */
static bool forever = false;
static bool metric_only = false;
@@ -185,13 +188,49 @@ struct perf_stat {
static struct perf_stat perf_stat;
#define STAT_RECORD perf_stat.record
-static volatile int done = 0;
-
static struct perf_stat_config stat_config = {
.aggr_mode = AGGR_GLOBAL,
.scale = true,
};
+/* Sources of async events multiplexed by poll() in perf_async_handle_events(). */
+enum perf_async_event
+{
+ perf_async_signal = 0,
+ perf_async_timer,
+ perf_async_cmd,
+ perf_async_cmd_ack,
+ perf_async_event_eof
+};
+
+/*
+ * Pause/resume protocol: pause/resume arrive on cmd-fd,
+ * perf_async_cmd_done is written back on cmd-ack-fd on completion.
+ */
+enum perf_async_cmd
+{
+ perf_async_cmd_pause = 0,
+ perf_async_cmd_resume,
+ perf_async_cmd_done,
+ perf_async_cmd_eof
+};
+
+/* Payload read from one of the event fds; the active member depends on the fd. */
+union perf_async_data {
+ struct signalfd_siginfo siginfo;
+ uint64_t timer_expired;
+ enum perf_async_cmd cmd;
+};
+
+/* Per-run context handed to the handlers (workload argv and exit status). */
+struct perf_async_handler_data {
+ int argc;
+ const char **argv;
+ int *status;
+};
+
+typedef int (*perf_async_handler_t)(union perf_async_data*, void *);
+
+/*
+ * Global async state: one pollfd and one handler per event source.
+ * NOTE(review): fds[] is zero-initialized, so an unset slot holds fd 0
+ * (stdin) rather than -1 until perf_async_init() runs — confirm the
+ * cmd/cmd-ack option defaults are intended.
+ */
+struct {
+ struct pollfd fds[perf_async_event_eof];
+ perf_async_handler_t handlers[perf_async_event_eof];
+ struct itimerspec timer_settings;
+ bool paused;
+} perf_async;
+
static inline void diff_timespec(struct timespec *r, struct timespec *a,
struct timespec *b)
{
@@ -410,31 +449,6 @@ static void process_interval(void)
print_counters(&rs, 0, NULL);
}
-static void enable_counters(void)
-{
- if (initial_delay)
- usleep(initial_delay * USEC_PER_MSEC);
-
- /*
- * We need to enable counters only if:
- * - we don't have tracee (attaching to task or cpu)
- * - we have initial delay configured
- */
- if (!target__none(&target) || initial_delay)
- perf_evlist__enable(evsel_list);
-}
-
-static void disable_counters(void)
-{
- /*
- * If we don't have tracee (attaching to task or cpu), counters may
- * still be running. To get accurate group ratios, we must stop groups
- * from counting before reading their constituent counters.
- */
- if (!target__none(&target))
- perf_evlist__disable(evsel_list);
-}
-
static volatile int workload_exec_errno;
/*
@@ -582,26 +596,257 @@ static bool perf_evsel__should_store_id(struct perf_evsel *counter)
return STAT_RECORD || counter->attr.read_format & PERF_FORMAT_ID;
}
+/*
+ * Dispatch one struct signalfd_siginfo read from the signal fd.
+ *
+ * Returns non-zero to tell perf_async_handle_events() to stop its
+ * poll() loop (workload exited, workload failed, or Ctrl-C while not
+ * forking a workload).
+ */
+static int perf_async_signal_handler(union perf_async_data *data, void *param)
+{
+	uint32_t signo = data->siginfo.ssi_signo;
+	int32_t code = data->siginfo.ssi_code;
+	struct perf_async_handler_data *handler_data = param;
+	char msg[BUFSIZ];
+
+	if (verbose > 1) {
+		fprintf(stat_config.output,
+			"%s: signo %d, code %d\n", __FUNCTION__, signo, code);
+	}
+
+	switch (signo) {
+	case SIGCHLD:
+		if (code == CLD_EXITED || code == CLD_KILLED || code == CLD_DUMPED) {
+			*handler_data->status = data->siginfo.ssi_status;
+
+			if (workload_exec_errno) {
+				const char *emsg =
+					str_error_r(workload_exec_errno, msg, sizeof(msg));
+				pr_err("Workload failed: %s\n", emsg);
+				return 1;
+			}
+
+			if (WIFSIGNALED(*handler_data->status))
+				psignal(WTERMSIG(*handler_data->status),
+					handler_data->argv[0]);
+
+			if (verbose > 1) {
+				fprintf(stat_config.output,
+					"%s: workload exited: status %d\n",
+					__FUNCTION__, *handler_data->status);
+			}
+
+			return 1;
+		}
+		/*
+		 * CLD_STOPPED/CLD_CONTINUED/CLD_TRAPPED are not termination:
+		 * keep polling instead of falling through into the SIGINT
+		 * handling below (the fallthrough was unintentional).
+		 */
+		break;
+	case SIGINT:
+		if (forever) {
+			forever = !forever;
+			return 1;
+		}
+		if (!(handler_data->argc > 0))
+			return 1;
+		/* fallthrough: with a forked workload Ctrl-C is left to the child */
+	case SIGABRT:
+	case SIGALRM:
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Interval timerfd expiration handler: prints the counters for the
+ * elapsed interval. Always returns 0 so the poll() loop keeps running.
+ */
+static int perf_async_timer_handler(union perf_async_data *data, void *param)
+{
+ /* Self-assignment only silences the unused-parameter warning. */
+ param = param;
+
+ if (verbose > 1) {
+ fprintf(stat_config.output,
+ "%s: timer_expired: %"PRIu64"\n",
+ __FUNCTION__, data->timer_expired);
+ }
+
+ process_interval();
+
+ return 0;
+}
+
+/*
+ * Execute a pause or resume command. Resume enables the counters and
+ * then re-arms the interval timer; pause disarms the timer and then
+ * disables the counters, so no interval fires while counters are off.
+ * Commands matching the current state are ignored. Always returns 0.
+ */
+static int perf_async_cmd_handler(union perf_async_data *data, void *param)
+{
+ /* Self-assignment only silences the unused-parameter warning. */
+ param = param;
+
+ if (verbose > 1) {
+ fprintf(stat_config.output, "%s: cmd %d\n",
+ __FUNCTION__, data->cmd);
+ }
+
+ if (data->cmd == perf_async_cmd_resume && perf_async.paused) {
+
+ perf_evlist__enable(evsel_list);
+
+ /* Re-arm the interval timer only if one was configured. */
+ if (perf_async.fds[perf_async_timer].fd != -1 &&
+ (perf_async.timer_settings.it_value.tv_sec ||
+ perf_async.timer_settings.it_value.tv_nsec))
+ timerfd_settime(perf_async.fds[perf_async_timer].fd,
+ 0, &perf_async.timer_settings, NULL);
+
+ perf_async.paused = !perf_async.paused;
+ if (verbose > 1) {
+ fprintf(stat_config.output,
+ "%s: events resumed\n", __FUNCTION__);
+ }
+ } else if (data->cmd == perf_async_cmd_pause && !perf_async.paused) {
+
+ /* An all-zero itimerspec disarms the timerfd. */
+ struct itimerspec timer_settings = {0};
+
+ if (perf_async.fds[perf_async_timer].fd != -1 &&
+ (perf_async.timer_settings.it_value.tv_sec ||
+ perf_async.timer_settings.it_value.tv_nsec))
+ timerfd_settime(perf_async.fds[perf_async_timer].fd,
+ 0, &timer_settings, NULL);
+
+ perf_evlist__disable(evsel_list);
+
+ perf_async.paused = !perf_async.paused;
+ if (verbose > 1) {
+ fprintf(stat_config.output,
+ "%s: events paused\n", __FUNCTION__);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Set up the fd-multiplexed event sources: block the handled signals,
+ * create the signalfd and, if @interval (milliseconds) is non-zero,
+ * the interval timerfd, and register their handlers.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static int perf_async_init(const int interval)
+{
+	int i = 0;
+	sigset_t mask;
+
+	sigemptyset(&mask);
+
+	for (i = 0; i < perf_async_event_eof; i++) {
+		/*
+		 * cmd-fd/cmd-ack-fd are parsed from the command line before
+		 * this runs, so reset only the descriptors owned by this
+		 * module; otherwise an unused slot would keep the
+		 * zero-initialized fd 0 and poll() would silently watch
+		 * stdin.
+		 */
+		if (i == perf_async_signal || i == perf_async_timer)
+			perf_async.fds[i].fd = -1;
+		perf_async.fds[i].events = POLLIN;
+		perf_async.fds[i].revents = 0;
+		perf_async.handlers[i] = NULL;
+	}
+
+	memset(&perf_async.timer_settings, 0,
+	       sizeof(perf_async.timer_settings));
+
+	sigaddset(&mask, SIGCHLD);
+	sigaddset(&mask, SIGINT);
+	sigaddset(&mask, SIGALRM);
+	sigaddset(&mask, SIGABRT);
+
+	/* Signals must be blocked for signalfd() to deliver them. */
+	if (sigprocmask(SIG_BLOCK, &mask, NULL))
+		return 0;
+
+	perf_async.fds[perf_async_signal].fd = signalfd(-1, &mask, SFD_CLOEXEC);
+	if (perf_async.fds[perf_async_signal].fd == -1)
+		return 0;
+
+	perf_async.handlers[perf_async_signal] = perf_async_signal_handler;
+
+	if (interval) {
+		/* @interval is in ms: split into seconds + nanoseconds. */
+		perf_async.timer_settings.it_interval.tv_sec =
+			interval / USEC_PER_MSEC;
+		perf_async.timer_settings.it_interval.tv_nsec =
+			(interval % USEC_PER_MSEC) * NSEC_PER_MSEC;
+		perf_async.timer_settings.it_value =
+			perf_async.timer_settings.it_interval;
+
+		perf_async.fds[perf_async_timer].fd =
+			timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+		if (perf_async.fds[perf_async_timer].fd == -1)
+			return 0;
+
+		perf_async.handlers[perf_async_timer] =
+			perf_async_timer_handler;
+	}
+
+	/* Counters start paused; a resume command (or -D >= 0) enables them. */
+	perf_async.paused = true;
+	perf_async.handlers[perf_async_cmd] = perf_async_cmd_handler;
+	perf_async.fds[perf_async_cmd_ack].events = POLLOUT;
+
+	return 1;
+}
+
+/*
+ * Tear down the async event sources: unblock the signals blocked in
+ * perf_async_init(), close every registered descriptor exactly once
+ * (the original code closed fds[perf_async_cmd] twice — a double
+ * close), and reset the table. Returns 0.
+ */
+static int perf_async_fini(void)
+{
+	int i = 0;
+	sigset_t mask;
+
+	sigemptyset(&mask);
+
+	sigaddset(&mask, SIGCHLD);
+	sigaddset(&mask, SIGINT);
+	sigaddset(&mask, SIGALRM);
+	sigaddset(&mask, SIGABRT);
+
+	/* Restore normal signal delivery. */
+	sigprocmask(SIG_UNBLOCK, &mask, NULL);
+
+	for (i = 0; i < perf_async_event_eof; i++) {
+		if (perf_async.fds[i].fd != -1)
+			close(perf_async.fds[i].fd);
+		perf_async.fds[i].fd = -1;
+		perf_async.fds[i].events = 0;
+		perf_async.fds[i].revents = 0;
+		perf_async.handlers[i] = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Central event loop: poll() every registered fd and dispatch POLLIN
+ * payloads to the matching handler; a handler returning non-zero stops
+ * the loop. A writable cmd-ack-fd gets a perf_async_cmd_done ack.
+ * Returns 0 (also when poll() itself fails).
+ */
+static int perf_async_handle_events(void *param)
+{
+ union perf_async_data data;
+ int ret, i = 0;
+ bool stop = false;
+
+ while (!stop) {
+
+ ret = poll(perf_async.fds, perf_async_event_eof, -1);
+ if (!(ret > 0))
+ break;
+
+ for (i = 0; i < perf_async_event_eof; i++) {
+ if (perf_async.fds[i].revents & POLLIN) {
+ memset(&data, 0, sizeof(data));
+ /* NOTE(review): a pipe writer may send fewer than
+  * sizeof(data) bytes (the test harness writes one int);
+  * any short read > 0 is accepted here — confirm the
+  * intended wire format. */
+ ret = read(perf_async.fds[i].fd, &data, sizeof(data));
+ if (!(ret > 0))
+ continue;
+ ret = perf_async.handlers[i](&data, param);
+ if (ret)
+ stop = true;
+ } else if (perf_async.fds[i].revents & POLLOUT) {
+ /* Acknowledge command completion on cmd-ack-fd. */
+ enum perf_async_cmd cmd = perf_async_cmd_done;
+ ret = write(perf_async.fds[i].fd, &cmd, sizeof(cmd));
+ if (!(ret > 0))
+ continue;
+ }
+ }
+ }
+
+ return 0;
+}
+
static int __run_perf_stat(int argc, const char **argv)
{
- int interval = stat_config.interval;
char msg[BUFSIZ];
unsigned long long t0, t1;
struct perf_evsel *counter;
- struct timespec ts;
size_t l;
int status = 0;
const bool forks = (argc > 0);
bool is_pipe = STAT_RECORD ? perf_stat.file.is_pipe : false;
struct perf_evsel_config_term *err_term;
-
- if (interval) {
- ts.tv_sec = interval / USEC_PER_MSEC;
- ts.tv_nsec = (interval % USEC_PER_MSEC) * NSEC_PER_MSEC;
- } else {
- ts.tv_sec = 1;
- ts.tv_nsec = 0;
- }
+ struct perf_async_handler_data handler_data = { argc, argv, &status};
if (forks) {
if (perf_evlist__prepare_workload(evsel_list, &target, argv, is_pipe,
@@ -697,36 +942,37 @@ static int __run_perf_stat(int argc, const char **argv)
t0 = rdclock();
clock_gettime(CLOCK_MONOTONIC, &ref_time);
- if (forks) {
+ if (forks)
perf_evlist__start_workload(evsel_list);
- enable_counters();
- if (interval) {
- while (!waitpid(child_pid, &status, WNOHANG)) {
- nanosleep(&ts, NULL);
- process_interval();
- }
- }
- wait(&status);
-
- if (workload_exec_errno) {
- const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
- pr_err("Workload failed: %s\n", emsg);
- return -1;
- }
+ if (initial_delay > 0)
+ usleep(initial_delay * USEC_PER_MSEC);
- if (WIFSIGNALED(status))
- psignal(WTERMSIG(status), argv[0]);
- } else {
- enable_counters();
- while (!done) {
- nanosleep(&ts, NULL);
- if (interval)
- process_interval();
+ /*
+ * We need to enable counters only if:
+ * - we don't have tracee (attaching to task or cpu)
+ * - we have initial delay configured
+ */
+ if (initial_delay >= 0) {
+ if (!target__none(&target) || initial_delay) {
+ union perf_async_data data;
+ data.cmd = perf_async_cmd_resume;
+ perf_async_cmd_handler(&data, NULL);
}
}
- disable_counters();
+ perf_async_handle_events(&handler_data);
+
+ /*
+ * If we don't have tracee (attaching to task or cpu), counters may
+ * still be running. To get accurate group ratios, we must stop groups
+ * from counting before reading their constituent counters.
+ */
+ if (!target__none(&target)) {
+ union perf_async_data data;
+ data.cmd = perf_async_cmd_pause;
+ perf_async_cmd_handler(&data, NULL);
+ }
t1 = rdclock();
@@ -1696,49 +1942,6 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
fflush(stat_config.output);
}
-static volatile int signr = -1;
-
-static void skip_signal(int signo)
-{
- if ((child_pid == -1) || stat_config.interval)
- done = 1;
-
- signr = signo;
- /*
- * render child_pid harmless
- * won't send SIGTERM to a random
- * process in case of race condition
- * and fast PID recycling
- */
- child_pid = -1;
-}
-
-static void sig_atexit(void)
-{
- sigset_t set, oset;
-
- /*
- * avoid race condition with SIGCHLD handler
- * in skip_signal() which is modifying child_pid
- * goal is to avoid send SIGTERM to a random
- * process
- */
- sigemptyset(&set);
- sigaddset(&set, SIGCHLD);
- sigprocmask(SIG_BLOCK, &set, &oset);
-
- if (child_pid != -1)
- kill(child_pid, SIGTERM);
-
- sigprocmask(SIG_SETMASK, &oset, NULL);
-
- if (signr == -1)
- return;
-
- signal(signr, SIG_DFL);
- kill(getpid(), signr);
-}
-
static int stat__set_big_num(const struct option *opt __maybe_unused,
const char *s __maybe_unused, int unset)
{
@@ -1811,8 +2014,10 @@ static const struct option stat_options[] = {
"aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
"aggregate counts per thread", AGGR_THREAD),
- OPT_UINTEGER('D', "delay", &initial_delay,
- "ms to wait before starting measurement after program start"),
+ OPT_INTEGER('D', "delay", &initial_delay,
+ "ms to wait (forever: -1) before starting measurement after program start"),
+ OPT_INTEGER(0, "cmd-fd", &(perf_async.fds[perf_async_cmd].fd), ""),
+ OPT_INTEGER(0, "cmd-ack-fd", &(perf_async.fds[perf_async_cmd_ack].fd), ""),
OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
"Only print computed metrics. No raw values", enable_metric_only),
OPT_BOOLEAN(0, "topdown", &topdown_run,
@@ -2745,18 +2950,7 @@ int cmd_stat(int argc, const char **argv)
if (perf_stat_init_aggr_mode())
goto out;
- /*
- * We dont want to block the signals - that would cause
- * child tasks to inherit that and Ctrl-C would not work.
- * What we want is for Ctrl-C to work in the exec()-ed
- * task, but being ignored by perf stat itself:
- */
- atexit(sig_atexit);
- if (!forever)
- signal(SIGINT, skip_signal);
- signal(SIGCHLD, skip_signal);
- signal(SIGALRM, skip_signal);
- signal(SIGABRT, skip_signal);
+ perf_async_init(interval);
status = 0;
for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
@@ -2771,6 +2965,8 @@ int cmd_stat(int argc, const char **argv)
}
}
+ perf_async_fini();
+
if (!forever && status != -1 && !interval)
print_counters(NULL, argc, argv);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 6a0d7ff..433e095 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1712,12 +1712,16 @@ int perf_evlist__prepare_workload(struct perf_evlist *evlist, struct target *tar
if (!evlist->workload.pid) {
int ret;
+ sigset_t mask;
- if (pipe_output)
- dup2(2, 1);
+ sigfillset(&mask);
+ sigprocmask(SIG_UNBLOCK, &mask, NULL);
signal(SIGTERM, SIG_DFL);
+ if (pipe_output)
+ dup2(2, 1);
+
close(child_ready_pipe[0]);
close(go_pipe[1]);
fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
Attachment:
pause_resume_fd.zip
Description: Zip compressed data
#define _DEBUG
#include <assert.h>
#include <sys/signalfd.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <poll.h>
#include <sys/timerfd.h>
#include <fcntl.h> /* Obtain O_* constant definitions */
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * pause_test: drives "perf stat --cmd-fd N --cmd-ack-fd M ..." over two
 * pipes, alternating pause(0)/resume(1) commands and waiting for each ack.
 *
 * Usage: pause_test <count> <delay_ms> <perf-path> stat [perf args...]
 *
 * Fixes vs. the original: validates argc before touching argv[1..4],
 * treats read/write EOF (return 0) as termination so the parent cannot
 * spin forever, avoids the --count underflow when count == 0, keeps the
 * command buffers const-correct, and reports execv failure.
 */
int
main(int argc, char *argv[])
{
	int wr_fds[2] = { -1, -1 };
	int rd_fds[2] = { -1, -1 };
	int status;

	if (argc < 5) {
		fprintf(stderr,
			"usage: %s <count> <delay_ms> <perf> stat [perf-args...]\n",
			argv[0]);
		return 1;
	}

	int res = pipe(rd_fds);
	assert(res != -1);
	res = pipe(wr_fds);
	assert(res != -1);

	/* Shrink the pipes so commands are delivered one int at a time. */
	fcntl(wr_fds[0], F_SETPIPE_SZ, sizeof(int));
	fcntl(wr_fds[1], F_SETPIPE_SZ, sizeof(int));
	fcntl(rd_fds[0], F_SETPIPE_SZ, sizeof(int));
	fcntl(rd_fds[1], F_SETPIPE_SZ, sizeof(int));

	int count = atoi(argv[1]);
	int delay = atoi(argv[2]);

	pid_t child = fork();
	if (child) {
		/* Parent: send commands, wait for acks. */
		sigset_t mask;
		sigfillset(&mask);
		sigprocmask(SIG_BLOCK, &mask, NULL);
		close(wr_fds[0]);
		close(rd_fds[1]);
		fprintf(stderr, "%s: STARTED\n", __FUNCTION__);
		while (count-- > 0) {
			if (waitpid(child, &status, WNOHANG))
				break;
			int cmd = count % 2; /* alternate resume(1)/pause(0) */
			if (write(wr_fds[1], &cmd, sizeof(cmd)) <= 0)
				break;
			printf("%s: sent %d\n", __FUNCTION__, cmd);
			if (read(rd_fds[0], &cmd, sizeof(cmd)) <= 0)
				break;
			printf("%s: ack %d\n", __FUNCTION__, cmd);
			usleep(delay * 1000);
		}
		fprintf(stderr, "%s: FINISHED\n", __FUNCTION__);
		wait(&status);
		return 0;
	}

	/* Child: exec perf with our pipe ends as cmd-fd / cmd-ack-fd. */
	close(wr_fds[1]); /* wr_fds[0] becomes --cmd-fd */
	close(rd_fds[0]); /* rd_fds[1] becomes --cmd-ack-fd */

	std::vector<char *> new_argv;
	new_argv.push_back(argv[3]); /* perf binary */
	new_argv.push_back(argv[4]); /* "stat" */

	char cmd_fd_str[64], ack_fd_str[64];
	snprintf(cmd_fd_str, sizeof(cmd_fd_str), "%d", wr_fds[0]);
	snprintf(ack_fd_str, sizeof(ack_fd_str), "%d", rd_fds[1]);
	new_argv.push_back(const_cast<char *>("--cmd-fd"));
	new_argv.push_back(cmd_fd_str);
	new_argv.push_back(const_cast<char *>("--cmd-ack-fd"));
	new_argv.push_back(ack_fd_str);

	for (int i = 5; argv[i]; i++)
		new_argv.push_back(argv[i]);
	new_argv.push_back(NULL);

	execv(argv[3], &new_argv[0]);
	perror("execv");
	return 1;
}