Re: [PATCH 1/5] perf trace: add support for pagefault tracing

From: Arnaldo Carvalho de Melo
Date: Wed Jun 18 2014 - 11:48:01 EST


First: thanks for working on this!

Em Wed, Jun 18, 2014 at 06:59:21PM +0400, Stanislav Fomichev escreveu:
> Signed-off-by: Stanislav Fomichev <stfomichev@xxxxxxxxxxxxxx>

No single line on the changelog? Please add a description here, with
examples of it in use, etc.

Also you're doing various things on the same patch, please read on to
see how it should be broken into multiple changesets.

> ---
> tools/perf/Documentation/perf-trace.txt | 12 ++
> tools/perf/builtin-trace.c | 198 ++++++++++++++++++++++++++------
> 2 files changed, 176 insertions(+), 34 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index fae38d9a44a4..7da5f75a45f1 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -107,6 +107,18 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
> Show tool stats such as number of times fd->pathname was discovered thru
> hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
>
> +-f::
> +--pgfaults::
> + Trace major pagefaults. To also trace minor pagefaults, specify this
> + option twice.

I try, when possible, to not use short options that are used in
'strace', so what if we use 'F' here?

And:

trace --pgfaults --pgfaults

for major and min page faults looks ugly, what if we instead use --pf
for both, and allow passing min or maj as args?

I.e.:

For both major and minor:

trace --pf

for just major page faults:

trace --pf maj

> +
> +EXAMPLES
> +--------
> +
> +Trace syscalls, major and minor pagefaults:
> +
> + $ perf trace -f -f
> +
> SEE ALSO
> --------
> linkperf:perf-record[1], linkperf:perf-script[1]
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index f954c26de231..6c7ae048db59 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -1211,6 +1211,7 @@ struct trace {
> bool summary_only;
> bool show_comm;
> bool show_tool_stats;
> + int trace_pgfaults;

uint8_t should be enough

> };
>
> static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
> @@ -1534,7 +1535,9 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
> return printed;
> }
>
> -typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
> +typedef int (*tracepoint_handler)(struct trace *trace,
> + union perf_event *event,
> + struct perf_evsel *evsel,
> struct perf_sample *sample);

You'll reduce patch size if you leave the first line alone and just add
the new parameter (event) after evsel.

It'll become then:

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
+ union perf_event *event,
struct perf_sample *sample);

Then please make one separate patch adding this new parameter, stating
that it will be used by pagefault tracing, that will come in a followup
patch in this series.

>
> static struct syscall *trace__syscall_info(struct trace *trace,
> @@ -1606,7 +1609,9 @@ static void thread__update_stats(struct thread_trace *ttrace,
> update_stats(stats, duration);
> }
>
> -static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
> +static int trace__sys_enter(struct trace *trace,
> + union perf_event *event __maybe_unused,
> + struct perf_evsel *evsel,
> struct perf_sample *sample)
> {
> char *msg;
> @@ -1655,7 +1660,9 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
> return 0;
> }
>
> -static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
> +static int trace__sys_exit(struct trace *trace,
> + union perf_event *event __maybe_unused,
> + struct perf_evsel *evsel,
> struct perf_sample *sample)
> {
> int ret;
> @@ -1734,14 +1741,18 @@ out:
> return 0;
> }
>
> -static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
> +static int trace__vfs_getname(struct trace *trace,
> + union perf_event *event __maybe_unused,
> + struct perf_evsel *evsel,
> struct perf_sample *sample)
> {
> trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
> return 0;
> }
>
> -static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
> +static int trace__sched_stat_runtime(struct trace *trace,
> + union perf_event *event __maybe_unused,
> + struct perf_evsel *evsel,
> struct perf_sample *sample)
> {
> u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
> @@ -1768,6 +1779,69 @@ out_dump:
> return 0;
> }
>
> +static bool valid_dso(struct addr_location *al, struct perf_sample *sample)

Humm, can't this be reduced to just:

return al->map && al->map->dso;

? I.e. if the helper returned a dso, it is because the address that was
looked up is in that dso, no?

I even guess that if there is al->map, that should be enough to check,
byt haven't thought this thru nor looked at the relevant sources, in a
hurry now.

> +{
> + if (al->map && al->map->dso &&
> + al->map->start <= sample->addr &&
> + al->map->end > sample->addr)
> + return true;
> + else
> + return false;
> +}
> +
> +static int trace__pgfault(struct trace *trace,
> + union perf_event *event,
> + struct perf_evsel *evsel,
> + struct perf_sample *sample)
> +{
> + struct thread *thread;
> + u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
> + struct addr_location al;
> + char map_type = 'd';
> +
> + thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
> +
> + thread__find_addr_location(thread, trace->host, cpumode, MAP__FUNCTION,
> + sample->ip, &al);
> +
> + trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
> +
> + fprintf(trace->output, "%sfault ",
> + evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
> + "maj" : "min");
> +
> + if (al.sym)
> + fprintf(trace->output, "[%s+0x%lx]",
> + al.sym->name, al.addr - al.sym->start);
> + else
> + fprintf(trace->output, "[0x%lx]", sample->ip);
> +
> + fprintf(trace->output, " => ");
> +
> + thread__find_addr_map(thread, trace->host, cpumode, MAP__VARIABLE,
> + sample->addr, &al);
> +
> + if (!valid_dso(&al, sample)) {
> + thread__find_addr_map(thread, trace->host, cpumode,
> + MAP__FUNCTION, sample->addr, &al);
> +
> + if (valid_dso(&al, sample))
> + map_type = 'x';
> + }
> +
> + if (valid_dso(&al, sample)) {
> + fprintf(trace->output, "%s+0x%lx",
> + al.map->dso->long_name, sample->addr - al.map->start);
> + } else {
> + map_type = '?';
> + fprintf(trace->output, "0x%lx", sample->addr);
> + }
> +
> + fprintf(trace->output, " (%c%c)\n", map_type, al.level);
> +
> + return 0;
> +}
> +
> static bool skip_sample(struct trace *trace, struct perf_sample *sample)
> {
> if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
> @@ -1781,7 +1855,7 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample)
> }
>
> static int trace__process_sample(struct perf_tool *tool,
> - union perf_event *event __maybe_unused,
> + union perf_event *event,
> struct perf_sample *sample,
> struct perf_evsel *evsel,
> struct machine *machine __maybe_unused)
> @@ -1799,7 +1873,7 @@ static int trace__process_sample(struct perf_tool *tool,
>
> if (handler) {
> ++trace->nr_events;
> - handler(trace, evsel, sample);
> + handler(trace, event, evsel, sample);
> }
>
> return err;
> @@ -1826,7 +1900,7 @@ static int parse_target_str(struct trace *trace)
> return 0;
> }
>
> -static int trace__record(int argc, const char **argv)
> +static int trace__record(struct trace *trace, int argc, const char **argv)
> {
> unsigned int rec_argc, i, j;
> const char **rec_argv;
> @@ -1835,34 +1909,52 @@ static int trace__record(int argc, const char **argv)
> "-R",
> "-m", "1024",
> "-c", "1",
> - "-e",
> };
>
> + const char * const sc_args[] = { "-e", };
> + unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
> + const char * const majpf_args[] = { "-e", "major-faults" };
> + unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
> + const char * const minpf_args[] = { "-e", "minor-faults" };
> + unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
> +

Please separate adding page fault tracing recording on a file in a
separate patch from adding it to live mode, then the changelog message
can concentrate on examples for each feature.

> /* +1 is for the event string below */
> - rec_argc = ARRAY_SIZE(record_args) + 1 + argc;
> + rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
> + majpf_args_nr + minpf_args_nr + argc;
> rec_argv = calloc(rec_argc + 1, sizeof(char *));
>
> if (rec_argv == NULL)
> return -ENOMEM;
>
> + j = 0;
> for (i = 0; i < ARRAY_SIZE(record_args); i++)
> - rec_argv[i] = record_args[i];
> + rec_argv[j++] = record_args[i];
> +
> + for (i = 0; i < sc_args_nr; i++)
> + rec_argv[j++] = sc_args[i];
>
> /* event string may be different for older kernels - e.g., RHEL6 */
> if (is_valid_tracepoint("raw_syscalls:sys_enter"))
> - rec_argv[i] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
> + rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
> else if (is_valid_tracepoint("syscalls:sys_enter"))
> - rec_argv[i] = "syscalls:sys_enter,syscalls:sys_exit";
> + rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
> else {
> pr_err("Neither raw_syscalls nor syscalls events exist.\n");
> return -1;
> }
> - i++;
>
> - for (j = 0; j < (unsigned int)argc; j++, i++)
> - rec_argv[i] = argv[j];
> + if (trace->trace_pgfaults)
> + for (i = 0; i < majpf_args_nr; i++)
> + rec_argv[j++] = majpf_args[i];
> +
> + if (trace->trace_pgfaults > 1)
> + for (i = 0; i < minpf_args_nr; i++)
> + rec_argv[j++] = minpf_args[i];
>
> - return cmd_record(i, rec_argv, NULL);
> + for (i = 0; i < (unsigned int)argc; i++)
> + rec_argv[j++] = argv[i];
> +
> + return cmd_record(j, rec_argv, NULL);
> }
>
> static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
> @@ -1882,6 +1974,30 @@ static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
> perf_evlist__add(evlist, evsel);
> }
>
> +static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
> + u64 config)
> +{
> + struct perf_evsel *evsel;
> + struct perf_event_attr attr = {
> + .type = PERF_TYPE_SOFTWARE,
> + .mmap_data = 1,
> + .sample_period = 1,
> + };
> +
> + attr.config = config;
> +
> + event_attr_init(&attr);
> +
> + evsel = perf_evsel__new(&attr);
> + if (!evsel)
> + return -ENOMEM;
> +
> + evsel->handler = trace__pgfault;
> + perf_evlist__add(evlist, evsel);
> +
> + return 0;
> +}
> +
> static int trace__run(struct trace *trace, int argc, const char **argv)
> {
> struct perf_evlist *evlist = perf_evlist__new();
> @@ -1902,6 +2018,14 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
> perf_evlist__add_vfs_getname(evlist);
>
> + if (trace->trace_pgfaults &&
> + perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ))
> + goto out_error_tp;
> +
> + if (trace->trace_pgfaults > 1 &&
> + perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
> + goto out_error_tp;
> +
> if (trace->sched &&
> perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
> trace__sched_stat_runtime))
> @@ -1982,7 +2106,8 @@ again:
> goto next_event;
> }
>
> - if (sample.raw_data == NULL) {
> + if (evsel->attr.type != PERF_TYPE_SOFTWARE &&
> + sample.raw_data == NULL) {

This looks like a separate patch with relevant associated changelog
message explaining why this is needed.

> fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
> perf_evsel__name(evsel), sample.tid,
> sample.cpu, sample.raw_size);
> @@ -1990,7 +2115,7 @@ again:
> }
>
> handler = evsel->handler;
> - handler(trace, evsel, &sample);
> + handler(trace, event, evsel, &sample);
> next_event:
> perf_evlist__mmap_consume(evlist, i);
>
> @@ -2093,13 +2218,10 @@ static int trace__replay(struct trace *trace)
> if (evsel == NULL)
> evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
> "syscalls:sys_enter");
> - if (evsel == NULL) {
> - pr_err("Data file does not have raw_syscalls:sys_enter event\n");
> - goto out;
> - }
>
> - if (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
> - perf_evsel__init_sc_tp_ptr_field(evsel, args)) {
> + if (evsel &&
> + (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
> + perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
> pr_err("Error during initialize raw_syscalls:sys_enter event\n");
> goto out;

the above can be ditched, not needed. Care to explain if you think
otherwise?

> }
> @@ -2109,15 +2231,19 @@ static int trace__replay(struct trace *trace)
> if (evsel == NULL)
> evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
> "syscalls:sys_exit");
> - if (evsel == NULL) {
> - pr_err("Data file does not have raw_syscalls:sys_exit event\n");
> + if (evsel &&
> + (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
> + perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
> + pr_err("Error during initialize raw_syscalls:sys_exit event\n");
> goto out;

Dittoi

> }
>
> - if (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
> - perf_evsel__init_sc_tp_uint_field(evsel, ret)) {
> - pr_err("Error during initialize raw_syscalls:sys_exit event\n");
> - goto out;
> + evlist__for_each(session->evlist, evsel) {
> + if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
> + (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
> + evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
> + evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
> + evsel->handler = trace__pgfault;

the above looks ugly, can't we set the handler when adding the events?
Or is this just for the perf.data handling case? Have to dig deeper,
just a first feeling.

> }
>
> err = parse_target_str(trace);
> @@ -2290,6 +2416,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> .user_interval = ULLONG_MAX,
> .no_buffering = true,
> .mmap_pages = 1024,
> + .sample_address = true,
> + .sample_time = true,

The above should be made conditional, i.e. if --pf is used?

> },
> .output = stdout,
> .show_comm = true,
> @@ -2330,15 +2458,17 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> "Show only syscall summary with statistics"),
> OPT_BOOLEAN('S', "with-summary", &trace.summary,
> "Show all syscalls and summary with statistics"),
> + OPT_INCR('f', "pgfaults", &trace.trace_pgfaults, "Trace pagefaults"),
> OPT_END()
> };
> int err;
> char bf[BUFSIZ];
>
> - if ((argc > 1) && (strcmp(argv[1], "record") == 0))
> - return trace__record(argc-2, &argv[2]);
> + argc = parse_options(argc, argv, trace_options, trace_usage,
> + PARSE_OPT_STOP_AT_NON_OPTION);
>
> - argc = parse_options(argc, argv, trace_options, trace_usage, 0);
> + if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
> + return trace__record(&trace, argc-1, &argv[1]);
>
> /* summary_only implies summary option, but don't overwrite summary if set */
> if (trace.summary_only)
> --
> 1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/