RE: [PATCH 3/4] perf record: event synthesization multithreading support

From: Liang, Kan
Date: Fri Oct 13 2017 - 10:58:49 EST


> Em Fri, Oct 13, 2017 at 07:09:26AM -0700, kan.liang@xxxxxxxxx escreveu:
> > From: Kan Liang <Kan.liang@xxxxxxxxx>
> >
> > The process function process_synthesized_event writes the process
> > result to perf.data, which is not multithreading friendly.
> >
> > Realloc buffer for each thread to temporarily keep the processing
> > result. Write them to the perf.data at the end of event synthesization.
> > The new method doesn't impact the final result, because
> > - The order of the synthesized event is not important.
> > - The number of synthesized event is limited. Usually, it only needs
> > hundreds of Kilobyte to store all the synthesized event.
> > It's unlikly failed because of lack of memory.
>
> Why not write to a per cpu file and then at the end merge them?

I just thought merging from memory should be faster than merging from files.

> Leave
> the memory management to the kernel, i.e. in most cases you may even not
> end up touching the disk, when memory is plentiful, just rewind the per
> event files and go on dumping to the main perf.data file.
>

Agree. I will do it.

> At some point we may just don't do this merging, and keep per cpu files
> all the way to perf report, etc. This would be a first foray into
> that...
>

Yes, but it's a little bit complex.
I think I will do it in a separate improvement patch series later.
For now, I will do the file merge.

Thanks,
Kan

> - Arnaldo
>
> > The threads number hard code to online CPU number. The following patch
> > will introduce an option to set it.
> >
> > The multithreading synthesize is only available for per cpu monitoring.
> >
> > Signed-off-by: Kan Liang <Kan.liang@xxxxxxxxx>
> > ---
> > tools/perf/builtin-record.c | 86
> ++++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 81 insertions(+), 5 deletions(-)
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index 4ede9bf..1978021 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -62,6 +62,11 @@ struct switch_output {
> > bool set;
> > };
> >
> > +struct synthesize_buf {
> > + void *entry;
> > + size_t size;
> > +};
> > +
> > struct record {
> > struct perf_tool tool;
> > struct record_opts opts;
> > @@ -80,6 +85,7 @@ struct record {
> > bool timestamp_filename;
> > struct switch_output switch_output;
> > unsigned long long samples;
> > + struct synthesize_buf *buf;
> > };
> >
> > static volatile int auxtrace_record__snapshot_started;
> > @@ -120,14 +126,35 @@ static int record__write(struct record *rec, void
> *bf, size_t size)
> > return 0;
> > }
> >
> > +static int buf__write(struct record *rec, int idx, void *bf, size_t size)
> > +{
> > + size_t target_size = rec->buf[idx].size;
> > + void *target;
> > +
> > + target = realloc(rec->buf[idx].entry, target_size + size);
> > + if (target == NULL)
> > + return -ENOMEM;
> > +
> > + memcpy(target + target_size, bf, size);
> > +
> > + rec->buf[idx].entry = target;
> > + rec->buf[idx].size += size;
> > +
> > + return 0;
> > +}
> > +
> > static int process_synthesized_event(struct perf_tool *tool,
> > union perf_event *event,
> > struct perf_sample *sample
> __maybe_unused,
> > struct machine *machine
> __maybe_unused,
> > - struct thread_info *thread
> __maybe_unused)
> > + struct thread_info *thread)
> > {
> > struct record *rec = container_of(tool, struct record, tool);
> > - return record__write(rec, event, event->header.size);
> > +
> > + if (!perf_singlethreaded && thread)
> > + return buf__write(rec, thread->idx, event, event-
> >header.size);
> > + else
> > + return record__write(rec, event, event->header.size);
> > }
> >
> > static int record__pushfn(void *to, void *bf, size_t size)
> > @@ -690,6 +717,48 @@ static const struct perf_event_mmap_page
> *record__pick_pc(struct record *rec)
> > return NULL;
> > }
> >
> > +static int record__multithread_synthesize(struct record *rec,
> > + struct machine *machine,
> > + struct perf_tool *tool,
> > + struct record_opts *opts)
> > +{
> > + int i, err, nr_thread = sysconf(_SC_NPROCESSORS_ONLN);
> > +
> > + if (nr_thread > 1) {
> > + perf_set_multithreaded();
> > +
> > + rec->buf = calloc(nr_thread, sizeof(struct synthesize_buf));
> > + if (rec->buf == NULL) {
> > + pr_err("Could not do multithread synthesize\n");
> > + nr_thread = 1;
> > + perf_set_singlethreaded();
> > + }
> > + }
> > +
> > + err = __machine__synthesize_threads(machine, tool, &opts->target,
> > + rec->evlist->threads,
> > + process_synthesized_event,
> > + opts->sample_address,
> > + opts->proc_map_timeout,
> nr_thread);
> > + if (err < 0)
> > + goto free;
> > +
> > + if (nr_thread > 1) {
> > + for (i = 0; i < nr_thread; i++)
> > + record__write(rec, rec->buf[i].entry, rec->buf[i].size);
> > + }
> > +
> > +free:
> > + if (nr_thread > 1) {
> > + for (i = 0; i < nr_thread; i++)
> > + free(rec->buf[i].entry);
> > + free(rec->buf);
> > + perf_set_singlethreaded();
> > + }
> > +
> > + return err;
> > +}
> > +
> > static int record__synthesize(struct record *rec, bool tail)
> > {
> > struct perf_session *session = rec->session;
> > @@ -766,9 +835,16 @@ static int record__synthesize(struct record *rec,
> bool tail)
> > perf_event__synthesize_guest_os,
> tool);
> > }
> >
> > - err = __machine__synthesize_threads(machine, tool, &opts->target,
> rec->evlist->threads,
> > - process_synthesized_event, opts-
> >sample_address,
> > - opts->proc_map_timeout, 1);
> > + /* multithreading synthesize is only available for cpu monitoring */
> > + if (target__has_cpu(&opts->target))
> > + err = record__multithread_synthesize(rec, machine, tool,
> opts);
> > + else
> > + err = __machine__synthesize_threads(machine, tool,
> > + &opts->target,
> > + rec->evlist->threads,
> > + process_synthesized_event,
> > + opts->sample_address,
> > + opts->proc_map_timeout,
> 1);
> > out:
> > return err;
> > }
> > --
> > 2.7.4