Re: [PATCH 5/5] perf record: Allow multiple recording time ranges
From: Ian Rogers
Date: Wed Aug 24 2022 - 11:53:18 EST
On Wed, Aug 24, 2022 at 12:28 AM Adrian Hunter <adrian.hunter@xxxxxxxxx> wrote:
>
> AUX area traces can produce too much data to record successfully or
> analyze subsequently. Add another means to reduce data collection by
> allowing multiple recording time ranges.
>
> This is useful, for instance, in cases where a workload produces
> predictably reproducible events in specific time ranges.
>
> Today we only have perf record -D <msecs> to start at a specific region, or
> some complicated approach using snapshot mode and external scripts sending
> signals or using the fifos. But these approaches are difficult to set up
> compared with simply having perf do it.
>
> Extend perf record option -D/--delay option to specifying relative time
> stamps for start stop controlled by perf with the right time offset, for
> instance:
>
> perf record -e intel_pt// -D 10-20,30-40
>
> to record 10ms to 20ms into the trace and 30ms to 40ms.
Could you add a test like this in:
https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/tools/perf/tests/shell/record.sh?h=perf/core
If for no other reason than code coverage. It would also be
interesting to see what happens if floating point values are passed,
ranges are incomplete, etc. As the functionality is generic the event
could be cycles or instructions rather than intel_pt.
Thanks,
Ian
> Example:
>
> The example workload is:
>
> $ cat repeat-usleep.c
>
> int usleep(useconds_t usec);
>
> int usage(int ret, const char *msg)
> {
> if (msg)
> fprintf(stderr, "%s\n", msg);
>
> fprintf(stderr, "Usage is: repeat-usleep <microseconds>\n");
>
> return ret;
> }
>
> int main(int argc, char *argv[])
> {
> unsigned long usecs;
> char *end_ptr;
>
> if (argc != 2)
> return usage(1, "Error: Wrong number of arguments!");
>
> errno = 0;
> usecs = strtoul(argv[1], &end_ptr, 0);
> if (errno || *end_ptr || usecs > UINT_MAX)
> return usage(1, "Error: Invalid argument!");
>
> while (1) {
> int ret = usleep(usecs);
>
> if (ret & errno != EINTR)
> return usage(1, "Error: usleep() failed!");
> }
>
> return 0;
> }
>
> $ perf record -e intel_pt//u --delay 10-20,40-70,110-160 -- ./repeat-usleep 500
> Events disabled
> Events enabled
> Events disabled
> Events enabled
> Events disabled
> Events enabled
> Events disabled
> [ perf record: Woken up 5 times to write data ]
> [ perf record: Captured and wrote 0.204 MB perf.data ]
> Terminated
>
> A dlfilter is used to determine continuous data collection (timestamps
> less than 1ms apart):
>
> $ cat dlfilter-show-delays.c
>
> static __u64 start_time;
> static __u64 last_time;
>
> int start(void **data, void *ctx)
> {
> printf("%-17s\t%-9s\t%-6s\n", " Time", " Duration", " Delay");
> return 0;
> }
>
> int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
> {
> __u64 delta;
>
> if (!sample->time)
> return 1;
> if (!last_time)
> goto out;
> delta = sample->time - last_time;
> if (delta < 1000000)
> goto out2;;
> printf("%17.9f\t%9.1f\t%6.1f\n", start_time / 1000000000.0, (last_time - start_time) / 1000000.0, delta / 1000000.0);
> out:
> start_time = sample->time;
> out2:
> last_time = sample->time;
> return 1;
> }
>
> int stop(void *data, void *ctx)
> {
> printf("%17.9f\t%9.1f\n", start_time / 1000000000.0, (last_time - start_time) / 1000000.0);
> return 0;
> }
>
> The result shows the times roughly match the --delay option:
>
> $ perf script --itrace=qb --dlfilter dlfilter-show-delays.so
> Time Duration Delay
> 39215.302317300 9.7 20.5
> 39215.332480217 30.4 40.9
> 39215.403837717 49.8
>
> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> ---
> tools/perf/Documentation/perf-record.txt | 6 +-
> tools/perf/builtin-record.c | 24 ++-
> tools/perf/util/evlist.c | 234 +++++++++++++++++++++++
> tools/perf/util/evlist.h | 9 +
> 4 files changed, 269 insertions(+), 4 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> index 099817ef5150..953b9663522a 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -430,8 +430,10 @@ if combined with -a or -C options.
> -D::
> --delay=::
> After starting the program, wait msecs before measuring (-1: start with events
> -disabled). This is useful to filter out the startup phase of the program, which
> -is often very different.
> +disabled), or enable events only for specified ranges of msecs (e.g.
> +-D 10-20,30-40 means wait 10 msecs, enable for 10 msecs, wait 10 msecs, enable
> +for 10 msecs, then stop). Note, delaying enabling of events is useful to filter
> +out the startup phase of the program, which is often very different.
>
> -I::
> --intr-regs::
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index cefb3028f565..7fdb1dd9a0a8 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -2498,6 +2498,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
> }
> }
>
> + err = event_enable_timer__start(rec->evlist->eet);
> + if (err)
> + goto out_child;
> +
> trigger_ready(&auxtrace_snapshot_trigger);
> trigger_ready(&switch_output_trigger);
> perf_hooks__invoke_record_start();
> @@ -2621,6 +2625,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
> }
> }
>
> + err = event_enable_timer__process(rec->evlist->eet);
> + if (err < 0)
> + goto out_child;
> + if (err) {
> + err = 0;
> + done = 1;
> + }
> +
> /*
> * When perf is starting the traced process, at the end events
> * die with the process and we wait for that. Thus no need to
> @@ -2842,6 +2854,12 @@ static int perf_record_config(const char *var, const char *value, void *cb)
> return 0;
> }
>
> +static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
> +{
> + struct record *rec = (struct record *)opt->value;
> +
> + return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
> +}
>
> static int record__parse_affinity(const struct option *opt, const char *str, int unset)
> {
> @@ -3303,8 +3321,10 @@ static struct option __record_options[] = {
> OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
> "monitor event in cgroup name only",
> parse_cgroups),
> - OPT_INTEGER('D', "delay", &record.opts.initial_delay,
> - "ms to wait before starting measurement after program start (-1: start with events disabled)"),
> + OPT_CALLBACK('D', "delay", &record, "ms",
> + "ms to wait before starting measurement after program start (-1: start with events disabled), "
> + "or ranges of time to enable events e.g. '-D 10-20,30-40'",
> + record__parse_event_enable_time),
> OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
> OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
> "user to profile"),
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index 3cfe730c12b8..fcfe5bcc0bcf 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -15,6 +15,7 @@
> #include "target.h"
> #include "evlist.h"
> #include "evsel.h"
> +#include "record.h"
> #include "debug.h"
> #include "units.h"
> #include "bpf_counter.h"
> @@ -40,12 +41,14 @@
> #include <sys/ioctl.h>
> #include <sys/mman.h>
> #include <sys/prctl.h>
> +#include <sys/timerfd.h>
>
> #include <linux/bitops.h>
> #include <linux/hash.h>
> #include <linux/log2.h>
> #include <linux/err.h>
> #include <linux/string.h>
> +#include <linux/time64.h>
> #include <linux/zalloc.h>
> #include <perf/evlist.h>
> #include <perf/evsel.h>
> @@ -147,6 +150,7 @@ static void evlist__purge(struct evlist *evlist)
>
> void evlist__exit(struct evlist *evlist)
> {
> + event_enable_timer__exit(&evlist->eet);
> zfree(&evlist->mmap);
> zfree(&evlist->overwrite_mmap);
> perf_evlist__exit(&evlist->core);
> @@ -2167,6 +2171,236 @@ int evlist__ctlfd_process(struct evlist *evlist, enum evlist_ctl_cmd *cmd)
> return err;
> }
>
> +/**
> + * struct event_enable_time - perf record -D/--delay single time range.
> + * @start: start of time range to enable events in milliseconds
> + * @end: end of time range to enable events in milliseconds
> + *
> + * N.B. this structure is also accessed as an array of int.
> + */
> +struct event_enable_time {
> + int start;
> + int end;
> +};
> +
> +static int parse_event_enable_time(const char *str, struct event_enable_time *range, bool first)
> +{
> + const char *fmt = first ? "%u - %u %n" : " , %u - %u %n";
> + int ret, start, end, n;
> +
> + ret = sscanf(str, fmt, &start, &end, &n);
> + if (ret != 2 || end <= start)
> + return -EINVAL;
> + if (range) {
> + range->start = start;
> + range->end = end;
> + }
> + return n;
> +}
> +
> +static ssize_t parse_event_enable_times(const char *str, struct event_enable_time *range)
> +{
> + int incr = !!range;
> + bool first = true;
> + ssize_t ret, cnt;
> +
> + for (cnt = 0; *str; cnt++) {
> + ret = parse_event_enable_time(str, range, first);
> + if (ret < 0)
> + return ret;
> + /* Check no overlap */
> + if (!first && range && range->start <= range[-1].end)
> + return -EINVAL;
> + str += ret;
> + range += incr;
> + first = false;
> + }
> + return cnt;
> +}
> +
> +/**
> + * struct event_enable_timer - control structure for perf record -D/--delay.
> + * @evlist: event list
> + * @times: time ranges that events are enabled (N.B. this is also accessed as an
> + * array of int)
> + * @times_cnt: number of time ranges
> + * @timerfd: timer file descriptor
> + * @pollfd_pos: position in @evlist array of file descriptors to poll (fdarray)
> + * @times_step: current position in (int *)@times)[],
> + * refer event_enable_timer__process()
> + *
> + * Note, this structure is only used when there are time ranges, not when there
> + * is only an initial delay.
> + */
> +struct event_enable_timer {
> + struct evlist *evlist;
> + struct event_enable_time *times;
> + size_t times_cnt;
> + int timerfd;
> + int pollfd_pos;
> + size_t times_step;
> +};
> +
> +static int str_to_delay(const char *str)
> +{
> + char *endptr;
> + long d;
> +
> + d = strtol(str, &endptr, 10);
> + if (*endptr || d > INT_MAX || d < -1)
> + return 0;
> + return d;
> +}
> +
> +int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *opts,
> + const char *str, int unset)
> +{
> + enum fdarray_flags flags = fdarray_flag__nonfilterable | fdarray_flag__non_perf_event;
> + struct event_enable_timer *eet;
> + ssize_t times_cnt;
> + ssize_t ret;
> + int err;
> +
> + if (unset)
> + return 0;
> +
> + opts->initial_delay = str_to_delay(str);
> + if (opts->initial_delay)
> + return 0;
> +
> + ret = parse_event_enable_times(str, NULL);
> + if (ret < 0)
> + return ret;
> +
> + times_cnt = ret;
> + if (times_cnt == 0)
> + return -EINVAL;
> +
> + eet = zalloc(sizeof(*eet));
> + if (!eet)
> + return -ENOMEM;
> +
> + eet->times = calloc(times_cnt, sizeof(*eet->times));
> + if (!eet->times) {
> + err = -ENOMEM;
> + goto free_eet;
> + }
> +
> + if (parse_event_enable_times(str, eet->times) != times_cnt) {
> + err = -EINVAL;
> + goto free_eet_times;
> + }
> +
> + eet->times_cnt = times_cnt;
> +
> + eet->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
> + if (eet->timerfd == -1) {
> + err = -errno;
> + pr_err("timerfd_create failed: %s\n", strerror(errno));
> + goto free_eet_times;
> + }
> +
> + eet->pollfd_pos = perf_evlist__add_pollfd(&evlist->core, eet->timerfd, NULL, POLLIN, flags);
> + if (eet->pollfd_pos < 0) {
> + err = eet->pollfd_pos;
> + goto close_timerfd;
> + }
> +
> + eet->evlist = evlist;
> + evlist->eet = eet;
> + opts->initial_delay = eet->times[0].start;
> +
> + return 0;
> +
> +close_timerfd:
> + close(eet->timerfd);
> +free_eet_times:
> + free(eet->times);
> +free_eet:
> + free(eet);
> + return err;
> +}
> +
> +static int event_enable_timer__set_timer(struct event_enable_timer *eet, int ms)
> +{
> + struct itimerspec its = {
> + .it_value.tv_sec = ms / MSEC_PER_SEC,
> + .it_value.tv_nsec = (ms % MSEC_PER_SEC) * NSEC_PER_MSEC,
> + };
> + int err = 0;
> +
> + if (timerfd_settime(eet->timerfd, 0, &its, NULL) < 0) {
> + err = -errno;
> + pr_err("timerfd_settime failed: %s\n", strerror(errno));
> + }
> + return err;
> +}
> +
> +int event_enable_timer__start(struct event_enable_timer *eet)
> +{
> + int ms;
> +
> + if (!eet)
> + return 0;
> +
> + ms = eet->times[0].end - eet->times[0].start;
> + eet->times_step = 1;
> +
> + return event_enable_timer__set_timer(eet, ms);
> +}
> +
> +int event_enable_timer__process(struct event_enable_timer *eet)
> +{
> + struct pollfd *entries;
> + short revents;
> +
> + if (!eet)
> + return 0;
> +
> + entries = eet->evlist->core.pollfd.entries;
> + revents = entries[eet->pollfd_pos].revents;
> + entries[eet->pollfd_pos].revents = 0;
> +
> + if (revents & POLLIN) {
> + size_t step = eet->times_step;
> + size_t pos = step / 2;
> +
> + if (step & 1) {
> + evlist__disable_non_dummy(eet->evlist);
> + pr_info(EVLIST_DISABLED_MSG);
> + if (pos >= eet->times_cnt - 1) {
> + /* Disarm timer */
> + event_enable_timer__set_timer(eet, 0);
> + return 1; /* Stop */
> + }
> + } else {
> + evlist__enable_non_dummy(eet->evlist);
> + pr_info(EVLIST_ENABLED_MSG);
> + }
> +
> + step += 1;
> + pos = step / 2;
> +
> + if (pos < eet->times_cnt) {
> + int *times = (int *)eet->times; /* Accessing 'times' as array of int */
> + int ms = times[step] - times[step - 1];
> +
> + eet->times_step = step;
> + return event_enable_timer__set_timer(eet, ms);
> + }
> + }
> +
> + return 0;
> +}
> +
> +void event_enable_timer__exit(struct event_enable_timer **ep)
> +{
> + if (!ep || !*ep)
> + return;
> + free((*ep)->times);
> + zfree(ep);
> +}
> +
> struct evsel *evlist__find_evsel(struct evlist *evlist, int idx)
> {
> struct evsel *evsel;
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index 3a8474406738..9d967fe3953a 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -48,6 +48,8 @@ enum bkw_mmap_state {
> BKW_MMAP_EMPTY,
> };
>
> +struct event_enable_timer;
> +
> struct evlist {
> struct perf_evlist core;
> bool enabled;
> @@ -79,6 +81,7 @@ struct evlist {
> int ack; /* ack file descriptor for control commands */
> int pos; /* index at evlist core object to check signals */
> } ctl_fd;
> + struct event_enable_timer *eet;
> };
>
> struct evsel_str_handler {
> @@ -426,6 +429,12 @@ int evlist__ctlfd_ack(struct evlist *evlist);
> #define EVLIST_ENABLED_MSG "Events enabled\n"
> #define EVLIST_DISABLED_MSG "Events disabled\n"
>
> +int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *opts,
> + const char *str, int unset);
> +int event_enable_timer__start(struct event_enable_timer *eet);
> +void event_enable_timer__exit(struct event_enable_timer **ep);
> +int event_enable_timer__process(struct event_enable_timer *eet);
> +
> struct evsel *evlist__find_evsel(struct evlist *evlist, int idx);
>
> int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf);
> --
> 2.25.1
>