Re: [PATCH] perf/sdt: Directly record cached SDT events

From: Masami Hiramatsu
Date: Sat Apr 30 2016 - 08:36:45 EST


Hi Hemant,

On Fri, 29 Apr 2016 19:10:41 +0530
Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx> wrote:

> This patch adds support for directly recording SDT events which are
> present in the probe cache. This patch is based on current SDT
> enablement patchset (v5) by Masami :
> https://lkml.org/lkml/2016/4/27/828
> and it implements two points in the TODO list mentioned in the
> cover note :
> "- (perf record) Support SDT event recording directly"
> "- (perf record) Try to unregister SDT events after record."
>
> Without this patch, we could probe into SDT events using
> "perf probe" and "perf record". With this patch, we can probe
> the SDT events directly using "perf record".

Thanks! However, before looking over each part of this patch,
I think this is not enough for supporting SDT for perf record.

If there are several SDTs which have same eventname but differnt
addresses (e.g. libc:memory_memalign_retry), how are those handled?
Currently, to support this, we'll need to enable those events
in different names, or just pick one of them. It could confuse
users in each case.

To solve this issue, we need to introduce multiple SDTs on single
ftrace event. Please read my comment on v3 patch (https://lkml.org/lkml/2015/8/15/52)

So, at this point, I've decided to introduce only perf-probe side.
If user want to record SDT, they can use perf-probe to add SDT events
and see what events are generated by SDT, so that they can specify those
events via perf-record.
e.g.

# perf probe -a %sdt_libc:*
...
sdt_libc:lll_futex_wake_1 (on %* in /usr/lib64/libc-2.20.so)
sdt_libc:lll_lock_wait_private (on %* in /usr/lib64/libc-2.20.so)

You can now use it in all perf tools, such as:

perf record -e sdt_libc:lll_lock_wait_private -aR sleep 1

# perf record -e sdt_libc:* -a
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.461 MB perf.data (6195 samples) ]

# perf script
plugin_audio_th 11119 [000] 16059.864654: sdt_libc:setjmp: (7f37bf55b6c1)
chrome 4650 [000] 16059.881345: sdt_libc:setjmp: (7f37bf55b6c1)
chrome 4650 [000] 16059.881350: sdt_libc:setjmp: (7f37bf55b6c1)
chrome 4650 [000] 16059.881411: sdt_libc:setjmp: (7f37bf55b6c1)
...

BTW, see below comments on the code for implementation issues.

[...]
> diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
> index 4a86aea..560cce3 100644
> --- a/tools/perf/builtin-probe.c
> +++ b/tools/perf/builtin-probe.c
> @@ -322,7 +322,7 @@ static int perf_add_probe_events(struct perf_probe_event *pevs, int npevs)
> if (ret < 0)
> return ret;
>
> - ret = convert_perf_probe_events(pevs, npevs);
> + ret = convert_perf_probe_events(pevs, npevs, false);
> if (ret < 0)
> goto out_cleanup;
>
> @@ -387,7 +387,7 @@ static int del_perf_probe_caches(struct strfilter *filter)
> return 0;
> }
>
> -static int perf_del_probe_events(struct strfilter *filter)
> +static int perf_del_probe_events(struct strfilter *filter, bool show)
> {
> int ret, ret2, ufd = -1, kfd = -1;
> char *str = strfilter__string(filter);
> @@ -417,7 +417,8 @@ static int perf_del_probe_events(struct strfilter *filter)
> ret = probe_file__get_events(kfd, filter, klist);
> if (ret == 0) {
> strlist__for_each(ent, klist)
> - pr_info("Removed event: %s\n", ent->s);
> + if (show)
> + pr_info("Removed event: %s\n", ent->s);

No, please don't do this. we have del_perf_probe_events(struct strfilter *filter)
for internal and silent version.


> ret = probe_file__del_strlist(kfd, klist);
> if (ret < 0)
> @@ -427,7 +428,8 @@ static int perf_del_probe_events(struct strfilter *filter)
> ret2 = probe_file__get_events(ufd, filter, ulist);
> if (ret2 == 0) {
> strlist__for_each(ent, ulist)
> - pr_info("Removed event: %s\n", ent->s);
> + if (show)
> + pr_info("Removed event: %s\n", ent->s);
>
> ret2 = probe_file__del_strlist(ufd, ulist);
> if (ret2 < 0)
> @@ -452,6 +454,38 @@ out:
> return ret;
> }
>
> +/*
> + * Record session for SDT events has ended. Delete the SDT events
> + * from uprobe_events file that were created initially.
> + */
> +void remove_sdt_event_list(struct list_head *sdt_events)
> +{
> + struct sdt_event_list *event;
> + struct strfilter *filter = NULL;
> + const char *err = NULL;
> + int ret = 0;
> +
> + if (list_empty(sdt_events))
> + return;
> +
> + list_for_each_entry(event, sdt_events, list) {
> + if (!filter) {
> + filter = strfilter__new(event->event_name, &err);
> + if (!filter)
> + goto free_list;
> + } else {
> + ret = strfilter__or(filter, event->event_name, &err);
> + }
> + }
> +
> + ret = perf_del_probe_events(filter, false);
> + if (ret)
> + pr_err("Error in deleting the SDT list\n");
> +
> +free_list:
> + free_sdt_list(sdt_events);
> +}

And, this has to move to util/probe-event.c

[...]
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 4033dce..cc017a5 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -1687,7 +1687,41 @@ int parse_events_option(const struct option *opt, const char *str,
> {
> struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
> struct parse_events_error err = { .idx = 0, };
> - int ret = parse_events(evlist, str, &err);
> + int ret = 0;
> +
> + if (*str == '%' && is_cmd_record()) {
> + char *ptr = NULL;
> + struct list_head *sdt_list;
> +
> + ptr = zalloc(strlen(str));
> + if (ptr == NULL)
> + return -ENOMEM;
> + strcpy(ptr, str);
> +
> + sdt_list = zalloc(sizeof(*sdt_list));
> + if (!sdt_list) {
> + free(ptr);
> + pr_err("Error in sdt_list memory allocation\n");
> + return -ENOMEM;
> + }
> + INIT_LIST_HEAD(sdt_list);
> +
> + /*
> + * If there is an error in this call, no need to free
> + * up sdt_list, its already free'ed up in the previous
> + * call. Free up 'ptr' though.
> + */
> + ret = add_sdt_event(ptr, sdt_list);
> + if (!ret)
> + sdt_event_list__add(sdt_list);
> + free(ptr);
> + }

This part should be another function, like parse_sdt_events().


> + if (!ret) {
> + ret = parse_events(evlist, str + 1, &err);
> + if (ret > 0)
> + ret = 0;
> + } else
> + ret = parse_events(evlist, str, &err);
>
> if (ret)
> parse_events_print_error(&err, str);
> diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> index c08daa9..98503c6 100644
> --- a/tools/perf/util/parse-events.h
> +++ b/tools/perf/util/parse-events.h
> @@ -189,4 +189,6 @@ int is_valid_tracepoint(const char *event_string);
> int valid_event_mount(const char *eventfs);
> char *parse_events_formats_error_string(char *additional_terms);
>
> +void sdt_event_list__add(struct list_head *sdt_event_list);
> +bool is_cmd_record(void);
> #endif /* __PERF_PARSE_EVENTS_H */
> diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
> index bb9fc34..34f6bab 100644
> --- a/tools/perf/util/probe-event.c
> +++ b/tools/perf/util/probe-event.c
> @@ -1144,7 +1144,7 @@ err:
> return err;
> }
>
> -static int parse_perf_probe_event_name(char **arg, struct perf_probe_event *pev)
> +int parse_perf_probe_event_name(char **arg, struct perf_probe_event *pev)
> {
> char *ptr;
>
> @@ -3008,7 +3008,8 @@ out:
> }
>
> static int convert_to_probe_trace_events(struct perf_probe_event *pev,
> - struct probe_trace_event **tevs)
> + struct probe_trace_event **tevs,
> + bool rec_enabled)
> {
> int ret;
>
> @@ -3033,6 +3034,17 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
> ret = find_probe_trace_events_from_cache(pev, tevs);
> if (ret > 0)
> return ret; /* Found in probe cache */
> + else if (rec_enabled) {
> + /*
> + * Can't find in probe cache, and, since,
> + * recording for sdt events is only enabled for
> + * cached events, we need to return throwing an error.
> + */
> + pr_err("Can't find %s in cache.\n", pev->event);
> + pr_err("Use \"perf list sdt\" to see the available events\n");
> + return -EINVAL;
> + }
> +

No, please don't add this flag. Instead, you can show this message in the caller.
And also, you should use find_cached_events_all() for this purpose, since
this is not only for SDT.

[...]
> +/*
> + * Find the SDT event from the cache and if found add it/them
> + * to the uprobe_events file
> + */
> +int add_sdt_event(char *event, struct list_head *sdt_events)
> +{
> + struct perf_probe_event *pev;
> + int ret, i;
> + char *str = event + 1;
> + struct sdt_event_list *tmp;
> +
> + pev = zalloc(sizeof(*pev));
> + if (!pev)
> + return -ENOMEM;
> +
> + pev->sdt = true;
> + pev->uprobes = true;
> +
> + /*
> + * Parse str to find the group name and event name of
> + * the sdt event.
> + */
> + ret = parse_perf_probe_event_name(&str, pev);
> + if (ret) {
> + pr_err("Error in parsing sdt event %s\n", str);
> + free(pev);
> + return ret;
> + }
> +
> + probe_conf.max_probes = MAX_PROBES;
> + probe_conf.force_add = 1;
> +
> + /*
> + * Find the sdt event from the cache, only cached SDT
> + * events can be directly recorded.
> + */
> + ret = convert_perf_probe_events(pev, 1, true);
> + if (ret < 0) {
> + goto free_pev;
> + } else if (!ret) {
> + ret = apply_perf_probe_events(pev, 1);
> + if (ret) {
> + pr_err("Error in adding SDT event : %s\n", event);
> + goto free_pev;
> + }
> + }
> +
> + if (pev->ntevs) {
> + /* Add the event name to "sdt_events" */

I think this should be done before applying probe events, so that
easy to roll it back.

> + for (i = 0; i < pev->ntevs; i++) {
> + tmp = zalloc(sizeof(*tmp));
> + if (!tmp) {
> + ret = -ENOMEM;
> + goto free_pev;
> + }
> + INIT_LIST_HEAD(&tmp->list);
> + tmp->event_name = strdup(pev->tevs[i].event);

What? would you only care the event name?

> + if (!tmp->event_name) {
> + free_sdt_list(sdt_events);
> + ret = -ENOMEM;
> + goto free_pev;
> + }
> + list_add(&tmp->list, sdt_events);
> + }
> + }
> +
> +free_pev:
> + cleanup_perf_probe_events(pev, 1);
> + return 0;
> +}
> diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h
> index a02bbbd..f120614 100644
> --- a/tools/perf/util/probe-file.h
> +++ b/tools/perf/util/probe-file.h
> @@ -32,6 +32,11 @@ struct probe_cache {
> struct list_head list;
> };
>
> +struct sdt_event_list {
> + char *event_name;

This should hold the filter pattern itself.
If possible, it should be the actual command which you've write to
probe-file (kprobe_events/uprobe_events in ftrace)

Thank you,

--
Masami Hiramatsu <mhiramat@xxxxxxxxxx>