Re: [PATCH 01/10] perf record: new interfaces to read ring buffer to file

From: Arnaldo Carvalho de Melo
Date: Tue Oct 10 2017 - 14:24:27 EST


On Tue, Oct 10, 2017 at 10:20:14AM -0700, kan.liang@xxxxxxxxx wrote:
> From: Kan Liang <kan.liang@xxxxxxxxx>
>
> Factor out the mmap_read-related code into separate functions and move
> them to evlist.c.
> struct perf_mmap_read is introduced to store the specific range of the
> ring buffer to be read.
>
> The sequence for reading a specific range of the ring buffer into a
> file is:
> perf_mmap__read_init
> perf_mmap__read_to_file
> perf_mmap__consume
>
> There is no functional change.

So, I started doing some changes in this direction as well, thinking
that 'struct perf_mmap' is the right place to dump the ring buffer into
a file, at some point doing that in a thread per 'struct perf_mmap'
instance (which for 'perf record -a' means one per cpu).

But my intention was to do the very simple case first, i.e.:

perf record -a

Without the backward ring buffer, snapshotting, etc.; those would come
later, step by step.

I.e., when some of these features are requested, we use the current
code, generating a simple perf.data file, etc.

In the simple case, we then would use one thread per 'struct perf_mmap'
and dump to separate files.
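
Something along these lines, roughly (just a sketch, the 'struct
mmap_dumper' wrapper and the threading details here are made up,
polling/signalling elided; it only reuses the interfaces introduced in
this patch):

	struct mmap_dumper {
		struct perf_mmap	*md;
		struct perf_data_file	*file;	/* one output file per mmap/cpu */
		pthread_t		 thread;
	};

	static void *mmap_dumper__run(void *arg)
	{
		struct mmap_dumper *dumper = arg;
		struct perf_mmap_read read;

		while (!done) {
			if (perf_mmap__read_init(dumper->md, &read, false, false))
				break;
			if (read.start == read.end)
				continue;	/* nothing new, would poll() here */
			if (perf_mmap__read_to_file(&read, dumper->file) < 0)
				break;
			perf_mmap__consume(dumper->md, false);
		}
		return NULL;
	}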

When 'perf report' or other tools such as 'perf script' noticed that
new format, with a directory per session and one file per CPU, they
would consider those files to be the 'frozen state' of the 'struct
perf_mmap' instances and would read from them just like they would from
real mmapped memory, again via some more 'struct perf_mmap' instances,
etc.
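
I.e., on disk it could look something like this (layout completely
hypothetical at this point, file names invented):

	perf.data/
	 |- header	/* session info, features, etc. */
	 |- cpu0.data	/* one 'frozen' ring buffer per CPU */
	 |- cpu1.data
	 `- ...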

Please take a look at my perf/core branch; it has just the beginning of
this. Perhaps we can collaborate on getting what you did on top of it,
or you can help me realize why this is not the way to go :-)

But I think that before getting to the more complicated bits (backward,
snapshotting, etc.), 'perf top' should use threads so that, instead of
dumping the ring buffer contents to files for later processing, it
consumes them straight away, creating hist_entry instances in parallel,
using the refcounts and locks already in place for threads, maps and
hist_entry, and adding some more along the way to make it possible to
process samples in parallel and update the struct
machine/thread/map/symbol/dso instances.
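
Roughly like this (again just a sketch: 'struct hists_feeder',
read_event() and deliver_event() are invented stand-ins for the real
mmap reading and sample delivery paths, and the locking is very much
handwaved):

	static void *hists_feeder__run(void *arg)
	{
		struct hists_feeder *feeder = arg; /* one mmap + its hists */
		union perf_event *event;

		while (!done) {
			while ((event = read_event(feeder->md)) != NULL) {
				pthread_mutex_lock(&feeder->hists->lock);
				/* creates/updates hist_entry instances */
				deliver_event(feeder, event);
				pthread_mutex_unlock(&feeder->hists->lock);
			}
		}
		return NULL;
	}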

Meanwhile I'll continue reading your patches, thanks for working on it!

- Arnaldo

> Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
> ---
> tools/perf/builtin-record.c | 111 ++++++++------------------------------------
> tools/perf/util/evlist.c | 111 ++++++++++++++++++++++++++++++++++++++++++++
> tools/perf/util/evlist.h | 15 ++++++
> 3 files changed, 146 insertions(+), 91 deletions(-)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 234fdf4..df555e9 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -105,6 +105,17 @@ static bool switch_output_time(struct record *rec)
> trigger_is_ready(&switch_output_trigger);
> }
>
> +static void update_output_size(struct record *rec, size_t size)
> +{
> + if (size == 0)
> + return;
> +
> + rec->bytes_written += size;
> +
> + if (switch_output_size(rec))
> + trigger_hit(&switch_output_trigger);
> +}
> +
> static int record__write(struct record *rec, void *bf, size_t size)
> {
> if (perf_data_file__write(rec->session->file, bf, size) < 0) {
> @@ -112,10 +123,7 @@ static int record__write(struct record *rec, void *bf, size_t size)
> return -1;
> }
>
> - rec->bytes_written += size;
> -
> - if (switch_output_size(rec))
> - trigger_hit(&switch_output_trigger);
> + update_output_size(rec, size);
>
> return 0;
> }
> @@ -130,106 +138,27 @@ static int process_synthesized_event(struct perf_tool *tool,
> }
>
> static int
> -backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
> -{
> - struct perf_event_header *pheader;
> - u64 evt_head = head;
> - int size = mask + 1;
> -
> - pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
> - pheader = (struct perf_event_header *)(buf + (head & mask));
> - *start = head;
> - while (true) {
> - if (evt_head - head >= (unsigned int)size) {
> - pr_debug("Finished reading backward ring buffer: rewind\n");
> - if (evt_head - head > (unsigned int)size)
> - evt_head -= pheader->size;
> - *end = evt_head;
> - return 0;
> - }
> -
> - pheader = (struct perf_event_header *)(buf + (evt_head & mask));
> -
> - if (pheader->size == 0) {
> - pr_debug("Finished reading backward ring buffer: get start\n");
> - *end = evt_head;
> - return 0;
> - }
> -
> - evt_head += pheader->size;
> - pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
> - }
> - WARN_ONCE(1, "Shouldn't get here\n");
> - return -1;
> -}
> -
> -static int
> -rb_find_range(void *data, int mask, u64 head, u64 old,
> - u64 *start, u64 *end, bool backward)
> -{
> - if (!backward) {
> - *start = old;
> - *end = head;
> - return 0;
> - }
> -
> - return backward_rb_find_range(data, mask, head, start, end);
> -}
> -
> -static int
> record__mmap_read(struct record *rec, struct perf_mmap *md,
> bool overwrite, bool backward)
> {
> - u64 head = perf_mmap__read_head(md);
> - u64 old = md->prev;
> - u64 end = head, start = old;
> - unsigned char *data = md->base + page_size;
> - unsigned long size;
> - void *buf;
> - int rc = 0;
> + struct perf_mmap_read read;
>
> - if (rb_find_range(data, md->mask, head,
> - old, &start, &end, backward))
> + if (perf_mmap__read_init(md, &read, overwrite, backward))
> return -1;
>
> - if (start == end)
> + if (read.start == read.end)
> return 0;
>
> rec->samples++;
>
> - size = end - start;
> - if (size > (unsigned long)(md->mask) + 1) {
> - WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
> -
> - md->prev = head;
> - perf_mmap__consume(md, overwrite || backward);
> - return 0;
> - }
> -
> - if ((start & md->mask) + size != (end & md->mask)) {
> - buf = &data[start & md->mask];
> - size = md->mask + 1 - (start & md->mask);
> - start += size;
> -
> - if (record__write(rec, buf, size) < 0) {
> - rc = -1;
> - goto out;
> - }
> - }
> + if (perf_mmap__read_to_file(&read, rec->session->file) < 0)
> + return -1;
>
> - buf = &data[start & md->mask];
> - size = end - start;
> - start += size;
> + update_output_size(rec, read.size);
>
> - if (record__write(rec, buf, size) < 0) {
> - rc = -1;
> - goto out;
> - }
> -
> - md->prev = head;
> perf_mmap__consume(md, overwrite || backward);
> -out:
> - return rc;
> +
> + return 0;
> }
>
> static volatile int done;
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index 6a0d7ff..33b8837 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -704,6 +704,117 @@ static int perf_evlist__resume(struct perf_evlist *evlist)
> return perf_evlist__set_paused(evlist, false);
> }
>
> +static int
> +backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
> +{
> + struct perf_event_header *pheader;
> + u64 evt_head = head;
> + int size = mask + 1;
> +
> + pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
> + pheader = (struct perf_event_header *)(buf + (head & mask));
> + *start = head;
> + while (true) {
> + if (evt_head - head >= (unsigned int)size) {
> + pr_debug("Finished reading backward ring buffer: rewind\n");
> + if (evt_head - head > (unsigned int)size)
> + evt_head -= pheader->size;
> + *end = evt_head;
> + return 0;
> + }
> +
> + pheader = (struct perf_event_header *)(buf + (evt_head & mask));
> +
> + if (pheader->size == 0) {
> + pr_debug("Finished reading backward ring buffer: get start\n");
> + *end = evt_head;
> + return 0;
> + }
> +
> + evt_head += pheader->size;
> + pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
> + }
> + WARN_ONCE(1, "Shouldn't get here\n");
> + return -1;
> +}
> +
> +static int
> +rb_find_range(void *data, int mask, u64 head, u64 old,
> + u64 *start, u64 *end, bool backward)
> +{
> + if (!backward) {
> + *start = old;
> + *end = head;
> + return 0;
> + }
> +
> + return backward_rb_find_range(data, mask, head, start, end);
> +}
> +
> +/*
> + * Initialize struct perf_mmap_read
> + * Calculate the range of ring buffer to read
> + */
> +int perf_mmap__read_init(struct perf_mmap *md, struct perf_mmap_read *read,
> + bool overwrite, bool backward)
> +{
> + unsigned char *data = md->base + page_size;
> +
> + read->md = md;
> + read->head = perf_mmap__read_head(md);
> + read->overwrite = overwrite;
> + read->backward = backward;
> + read->size = 0;
> + return rb_find_range(data, md->mask, read->head, md->prev,
> + &read->start, &read->end, backward);
> +}
> +
> +/*
> + * Read the ring buffer in the range which specified in struct perf_mmap_read,
> + * and write to file.
> + * Used by perf record.
> + */
> +int perf_mmap__read_to_file(struct perf_mmap_read *read,
> + struct perf_data_file *file)
> +{
> + struct perf_mmap *md = read->md;
> + unsigned char *data = md->base + page_size;
> + unsigned long size;
> + void *buf;
> +
> + size = read->end - read->start;
> + if (size > (unsigned long)(md->mask) + 1) {
> + WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
> + read->size = 0;
> + goto out;
> + }
> +
> + if ((read->start & md->mask) + size != (read->end & md->mask)) {
> + buf = &data[read->start & md->mask];
> + size = md->mask + 1 - (read->start & md->mask);
> + read->start += size;
> + read->size += size;
> +
> + if (perf_data_file__write(file, buf, size) < 0) {
> + pr_err("failed to write %s, error: %m\n", file->path);
> + return -1;
> + }
> + }
> +
> + buf = &data[read->start & md->mask];
> + size = read->end - read->start;
> + read->start += size;
> + read->size += size;
> +
> + if (perf_data_file__write(file, buf, size) < 0) {
> + pr_err("failed to write %s, error: %m\n", file->path);
> + return -1;
> + }
> +out:
> + md->prev = read->head;
> + return 0;
> +}
> +
> /* When check_messup is true, 'end' must point to a good entry */
> static union perf_event *
> perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start,
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index b1c14f1..1ce4857 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -39,6 +39,16 @@ struct perf_mmap {
> char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
> };
>
> +struct perf_mmap_read {
> + struct perf_mmap *md;
> + u64 head;
> + u64 start;
> + u64 end;
> + bool overwrite;
> + bool backward;
> + unsigned long size;
> +};
> +
> static inline size_t
> perf_mmap__mmap_len(struct perf_mmap *map)
> {
> @@ -193,6 +203,11 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
>
> void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
>
> +int perf_mmap__read_init(struct perf_mmap *md, struct perf_mmap_read *read,
> + bool overwrite, bool backward);
> +int perf_mmap__read_to_file(struct perf_mmap_read *read,
> + struct perf_data_file *file);
> +
> int perf_evlist__open(struct perf_evlist *evlist);
> void perf_evlist__close(struct perf_evlist *evlist);
>
> --
> 2.5.5