Re: [PATCH v2 3/4] perf-report: add --max-stack option to limit callchain stack scan

From: Arnaldo Carvalho de Melo
Date: Fri Oct 18 2013 - 13:17:42 EST


Em Fri, Oct 18, 2013 at 10:38:48AM -0400, Waiman Long escreveu:
> When callgraph data is included in the perf data file, it may take a
> long time to scan all that data and merge it together, especially
> if the stored callchains are long and the perf data file itself is
> large, like a Gbyte or so.
>
> The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
> This is a large value. Usually the callgraph data that developers are
> most interested in are the first few levels; the rest are usually
> not looked at.
>
> This patch adds a new --max-stack option to perf-report to limit the
> depth of callchain stack data to look at to reduce the time it takes
> for perf-report to finish its processing. It trades the presence of
> trailing stack information for faster speed.
>
> The following table shows the elapsed time of doing perf-report on a
> perf.data file of size 985,531,828 bytes.
>
> --max-stack Elapsed Time Output data size
> ----------- ------------ ----------------

Please prefix lines like this (------) with a space, otherwise 'git am'
will chop off everything from that line onwards. Fixing it up now.

- Arnaldo

> not set 88.0s 124,422,651
> 64 87.5s 116,303,213
> 32 87.2s 112,023,804
> 16 86.6s 94,326,380
> 8 59.9s 33,697,248
> 4 40.7s 10,116,637
> -g none 27.1s 2,555,810
>
> Signed-off-by: Waiman Long <Waiman.Long@xxxxxx>
> ---
> tools/perf/Documentation/perf-report.txt | 8 ++++++++
> tools/perf/builtin-report.c | 22 +++++++++++++++++-----
> tools/perf/builtin-top.c | 3 ++-
> tools/perf/util/machine.c | 14 +++++++++-----
> tools/perf/util/machine.h | 3 ++-
> tools/perf/util/session.c | 3 ++-
> 6 files changed, 40 insertions(+), 13 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
> index 2b8097e..be3f196 100644
> --- a/tools/perf/Documentation/perf-report.txt
> +++ b/tools/perf/Documentation/perf-report.txt
> @@ -135,6 +135,14 @@ OPTIONS
>
> Default: fractal,0.5,callee,function.
>
> +--max-stack::
> + Set the stack depth limit when parsing the callchain, anything
> + beyond the specified depth will be ignored. This is a trade-off
> + between information loss and faster processing especially for
> + workloads that can have a very long callchain stack.
> +
> + Default: 127
> +
> -G::
> --inverted::
> alias for inverted caller based call graph.
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 72eae74..d0c9504 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -47,6 +47,7 @@ struct perf_report {
> bool show_threads;
> bool inverted_callchain;
> bool mem_mode;
> + int max_stack;
> struct perf_read_values show_threads_values;
> const char *pretty_printing_style;
> const char *cpu_list;
> @@ -88,7 +89,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
> if ((sort__has_parent || symbol_conf.use_callchain) &&
> sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -179,7 +181,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
> if ((sort__has_parent || symbol_conf.use_callchain)
> && sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -242,18 +245,21 @@ out:
> return err;
> }
>
> -static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
> +static int perf_evsel__add_hist_entry(struct perf_tool *tool,
> + struct perf_evsel *evsel,
> struct addr_location *al,
> struct perf_sample *sample,
> struct machine *machine)
> {
> + struct perf_report *rep = container_of(tool, struct perf_report, tool);
> struct symbol *parent = NULL;
> int err = 0;
> struct hist_entry *he;
>
> if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
> err = machine__resolve_callchain(machine, evsel, al->thread,
> - sample, &parent, al);
> + sample, &parent, al,
> + rep->max_stack);
> if (err)
> return err;
> }
> @@ -330,7 +336,8 @@ static int process_sample_event(struct perf_tool *tool,
> if (al.map != NULL)
> al.map->dso->hit = 1;
>
> - ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
> + ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
> + machine);
> if (ret < 0)
> pr_debug("problem incrementing symbol period, skipping event\n");
> }
> @@ -757,6 +764,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> .ordered_samples = true,
> .ordering_requires_timestamps = true,
> },
> + .max_stack = PERF_MAX_STACK_DEPTH,
> .pretty_printing_style = "normal",
> };
> const struct option options[] = {
> @@ -797,6 +805,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
> "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
> "Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
> + OPT_INTEGER(0, "max-stack", &report.max_stack,
> + "Set the maximum stack depth when parsing the callchain, "
> + "anything beyond the specified depth will be ignored. "
> + "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
> OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
> "alias for inverted call graph"),
> OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 2122141..2725aca 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -771,7 +771,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
> sample->callchain) {
> err = machine__resolve_callchain(machine, evsel,
> al.thread, sample,
> - &parent, &al);
> + &parent, &al,
> + PERF_MAX_STACK_DEPTH);
> if (err)
> return;
> }
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 6188d28..9617c4a 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1267,10 +1267,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> struct thread *thread,
> struct ip_callchain *chain,
> struct symbol **parent,
> - struct addr_location *root_al)
> + struct addr_location *root_al,
> + int max_stack)
> {
> u8 cpumode = PERF_RECORD_MISC_USER;
> - unsigned int i;
> + int chain_nr = min(max_stack, (int)chain->nr);
> + int i;
> int err;
>
> callchain_cursor_reset(&callchain_cursor);
> @@ -1280,7 +1282,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
> return 0;
> }
>
> - for (i = 0; i < chain->nr; i++) {
> + for (i = 0; i < chain_nr; i++) {
> u64 ip;
> struct addr_location al;
>
> @@ -1352,12 +1354,14 @@ int machine__resolve_callchain(struct machine *machine,
> struct thread *thread,
> struct perf_sample *sample,
> struct symbol **parent,
> - struct addr_location *root_al)
> + struct addr_location *root_al,
> + int max_stack)
> {
> int ret;
>
> ret = machine__resolve_callchain_sample(machine, thread,
> - sample->callchain, parent, root_al);
> + sample->callchain, parent,
> + root_al, max_stack);
> if (ret)
> return ret;
>
> diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
> index 58a6be1..d09cce0 100644
> --- a/tools/perf/util/machine.h
> +++ b/tools/perf/util/machine.h
> @@ -91,7 +91,8 @@ int machine__resolve_callchain(struct machine *machine,
> struct thread *thread,
> struct perf_sample *sample,
> struct symbol **parent,
> - struct addr_location *root_al);
> + struct addr_location *root_al,
> + int max_stack);
>
> /*
> * Default guest kernel is defined by parameter --guestkallsyms
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 568b750..96e5449 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1525,7 +1525,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
> if (symbol_conf.use_callchain && sample->callchain) {
>
> if (machine__resolve_callchain(machine, evsel, al.thread,
> - sample, NULL, NULL) != 0) {
> + sample, NULL, NULL,
> + PERF_MAX_STACK_DEPTH) != 0) {
> if (verbose)
> error("Failed to resolve callchain. Skipping\n");
> return;
> --
> 1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/