Re: [PATCH v4] [RFC] trace: Add kprobe on tracepoint

From: Masami Hiramatsu
Date: Thu Aug 12 2021 - 11:44:53 EST


Hi,

Here is my code review.

On Wed, 11 Aug 2021 17:14:33 +0300
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@xxxxxxxxx> wrote:
[...]
> +
> +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
> +{
> + return container_of(ev, struct trace_eprobe, devent);
> +}
> +
> +static int trace_eprobe_find(struct trace_eprobe *ep)

This function name is a bit easy to mislead. If I were you,
I call it 'trace_eprobe_setup_event()'.
Or, I'll make it returns 'struct trace_event_call *' and set
ep->event outside of this function. Moreover, I recommend you
to call this before alloc_event_probe() and pass the result to it.
e.g.

... /* parse the target system and event */

event_call = find_and_get_event(sys_name, sys_event);
if (!event_call)
goto error;

ep = alloc_event_probe(group, event, sys_name, sys_event, event_call, argc - 2);
if (IS_ERR(ep))
...

> +{
> + struct trace_event_call *tp_event;
> + int ret = -ENOENT;
> + const char *name;
> +
> + mutex_lock(&event_mutex);
> + list_for_each_entry(tp_event, &ftrace_events, list) {
> + if (tp_event->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
> + continue;
> + if (!tp_event->class->system ||
> + strcmp(ep->event_system, tp_event->class->system))
> + continue;
> + name = trace_event_name(tp_event);
> + if (!name ||
> + strcmp(ep->event_name, name))
> + continue;
> + if (!try_module_get(tp_event->mod)) {
> + ret = -ENODEV;
> + break;
> + }

BTW, this can lock the static events (because the module in where the
event is locked), but can not lock the other dynamic events.
Maybe we need 2 more patches.

1) introduce TRACE_EVENT_FL_PROBE and set the flag in alloc_trace_k/uprobe().
(and eprobe will skip it)
2) introduce refcount for the trace_event_call, which prevent removing
synthetic event.


> + ep->event = tp_event;
> + ret = 0;
> + break;
> + }
> + mutex_unlock(&event_mutex);
> +
> + return ret;
> +}
> +
> +static int eprobe_dyn_event_create(const char *raw_command)
> +{
> + return trace_probe_create(raw_command, __trace_eprobe_create);
> +}
> +
[...]
> +
> +static struct trace_eprobe *alloc_event_probe(const char *group,
> + const char *event,
> + const char *sys_name,
> + const char *sys_event,
> + int nargs)
> +{
> + struct trace_eprobe *ep;
> + int ret = -ENOMEM;
> +
> + ep = kzalloc(SIZEOF_TRACE_EPROBE(nargs), GFP_KERNEL);
> + if (!ep)
> + goto error;
> + ep->event_name = kstrdup(sys_event, GFP_KERNEL);
> + if (!ep->event_name)
> + goto error;
> + ep->event_system = kstrdup(sys_name, GFP_KERNEL);
> + if (!ep->event_system)
> + goto error;
> +
> + ret = trace_probe_init(&ep->tp, event, group, false);
> + if (ret < 0)
> + goto error;
> +
> + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
> + return ep;
> +error:
> + trace_event_probe_cleanup(ep);
> + return ERR_PTR(ret);
> +}
> +
> +static int trace_eprobe_tp_arg_find(struct trace_eprobe *ep, int i)

I think 'trace_eprobe_tp_arg_update()' will be better name.
Also, 'if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG)' check
is moved inside in this function for hiding inside of the tp.args.

> +{
> + struct probe_arg *parg = &ep->tp.args[i];
> + struct ftrace_event_field *field;
> + struct list_head *head;
> +
> + head = trace_get_fields(ep->event);
> + list_for_each_entry(field, head, link) {
> + if (!strcmp(parg->code->data, field->name)) {
> + kfree(parg->code->data);
> + parg->code->data = field;
> + return 0;
> + }
> + }
> + kfree(parg->code->data);
> + parg->code->data = NULL;
> + return -ENOENT;
> +}
> +
> +static int kprobe_event_define_fields(struct trace_event_call *event_call)

you meant 'eprobe_event_define_fields()' ? :)

> +{
> + int ret;
> + struct kprobe_trace_entry_head field;
> + struct trace_probe *tp;
> +
> + tp = trace_probe_primary_from_call(event_call);
> + if (WARN_ON_ONCE(!tp))
> + return -ENOENT;
> +
> + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);

Would you really need this 'ip' field? I think you can record the
original event ID (call->event.type) instead of ip.

> +
> + return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
> +}
> +
> +static struct trace_event_fields eprobe_fields_array[] = {
> + { .type = TRACE_FUNCTION_TYPE,
> + .define_fields = kprobe_event_define_fields },

Ditto.

> + {}
> +};
> +
> +/* Event entry printers */
> +static enum print_line_t
> +print_eprobe_event(struct trace_iterator *iter, int flags,
> + struct trace_event *event)
> +{
> + struct kprobe_trace_entry_head *field;
> + struct trace_seq *s = &iter->seq;
> + struct trace_probe *tp;
> +
> + field = (struct kprobe_trace_entry_head *)iter->ent;
> + tp = trace_probe_primary_from_call(
> + container_of(event, struct trace_event_call, event));
> + if (WARN_ON_ONCE(!tp))
> + goto out;
> +
> + trace_seq_printf(s, "%s: (", trace_probe_name(tp));
> +
> + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
> + goto out;

Here, you can show the original event name from the event ID.

> +
> + trace_seq_putc(s, ')');
> +
> + if (print_probe_args(s, tp->args, tp->nr_args,
> + (u8 *)&field[1], field) < 0)
> + goto out;
> +
> + trace_seq_putc(s, '\n');
> + out:
> + return trace_handle_return(s);
> +}
> +
> +static unsigned long get_event_field(struct fetch_insn *code, void *rec)
> +{
> + struct ftrace_event_field *field = code->data;
> + unsigned long val;
> + void *addr;
> +
> + addr = rec + field->offset;
> +
> + switch (field->size) {
> + case 1:
> + if (field->is_signed)
> + val = *(char *)addr;
> + else
> + val = *(unsigned char *)addr;
> + break;
> + case 2:
> + if (field->is_signed)
> + val = *(short *)addr;
> + else
> + val = *(unsigned short *)addr;
> + break;
> + case 4:
> + if (field->is_signed)
> + val = *(int *)addr;
> + else
> + val = *(unsigned int *)addr;
> + break;
> + default:
> + if (field->is_signed)
> + val = *(long *)addr;
> + else
> + val = *(unsigned long *)addr;
> + break;
> + }
> + return val;
> +}
> +
> +static int get_eprobe_size(struct trace_probe *tp, void *rec)
> +{
> + struct probe_arg *arg;
> + int i, len, ret = 0;
> +
> + for (i = 0; i < tp->nr_args; i++) {
> + arg = tp->args + i;
> + if (unlikely(arg->dynamic)) {
> + unsigned long val;
> +
> + val = get_event_field(arg->code, rec);
> + len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
> + if (len > 0)
> + ret += len;
> + }
> + }
> +
> + return ret;
> +}
> +
> +/* Kprobe specific fetch functions */
[...]
> +static nokprobe_inline int
> +probe_mem_read(void *dest, void *src, size_t size)
> +{
> +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> + if ((unsigned long)src < TASK_SIZE)
> + return probe_mem_read_user(dest, src, size);
> +#endif
> + return copy_from_kernel_nofault(dest, src, size);
> +}

Hmm, these "fetch_args for kernel" APIs should finally unified with
kprobe events. But at this step, this is good.

> +
> +/* eprobe handler */
> +static inline void
> +__eprobe_trace_func(struct eprobe_data *edata, void *rec)
> +{
> + struct kprobe_trace_entry_head *entry;
> + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp);
> + struct trace_event_buffer fbuffer;
> + int dsize;
> +
> + WARN_ON(call != edata->file->event_call);
> +
> + if (trace_trigger_soft_disabled(edata->file))
> + return;
> +
> + fbuffer.trace_ctx = tracing_gen_ctx();
> + fbuffer.trace_file = edata->file;
> +
> + dsize = get_eprobe_size(&edata->ep->tp, rec);
> + fbuffer.regs = NULL;
> +
> + fbuffer.event =
> + trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file,
> + call->event.type,
> + sizeof(*entry) + edata->ep->tp.size + dsize,
> + fbuffer.trace_ctx);
> + if (!fbuffer.event)
> + return;
> +
> + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> + entry->ip = 0;

Here, you can trace edata->ep->event->event.type instead of 0.

> + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
> +
> + trace_event_buffer_commit(&fbuffer);
> +}
> +
> +/*
> + * The event probe implementation uses event triggers to get access to
> + * the event it is attached to, but is not an actual trigger. The below
> + * functions are just stubs to fulfill what is needed to use the trigger
> + * infrastructure.
> + */

OK, I got it. So eprobe is implemented on the trigger action framework.

[...]
> +
> +static int enable_eprobe(struct trace_eprobe *ep,
> + struct trace_event_file *eprobe_file)
> +{
> + struct event_trigger_data *trigger;
> + struct trace_event_file *file;
> + struct trace_array *tr = eprobe_file->tr;
> +
> + file = find_event_file(tr, ep->event_system, ep->event_name);
> + if (!file)
> + return -ENOENT;
> + trigger = new_eprobe_trigger(ep, eprobe_file);
> + if (IS_ERR(trigger))
> + return PTR_ERR(trigger);
> +
> + list_add_tail_rcu(&trigger->list, &file->triggers);
> +
> + trace_event_trigger_enable_disable(file, 1);
> + update_cond_flag(file);
> +
> + return 0;
> +}
> +
> +static struct trace_event_functions eprobe_funcs = {
> + .trace = print_eprobe_event
> +};
> +
> +static int disable_eprobe(struct trace_eprobe *ep,
> + struct trace_array *tr)
> +{
> + struct event_trigger_data *trigger;
> + struct trace_event_file *file;
> + struct eprobe_data *edata;
> +
> + file = find_event_file(tr, ep->event_system, ep->event_name);
> + if (!file)
> + return -ENOENT;
> +
> + list_for_each_entry(trigger, &file->triggers, list) {
> + if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
> + continue;
> + edata = trigger->private_data;
> + if (edata->ep == ep)
> + break;
> + }
> + if (list_entry_is_head(trigger, &file->triggers, list))
> + return -ENODEV;
> +
> + list_del_rcu(&trigger->list);
> +
> + trace_event_trigger_enable_disable(file, 0);
> + update_cond_flag(file);
> + return 0;
> +}
> +
> +static int enable_trace_eprobe(struct trace_event_call *call,
> + struct trace_event_file *file)
> +{
> + struct trace_probe *pos, *tp;
> + struct trace_eprobe *ep;
> + bool enabled;
> + int ret = 0;
> +
> + tp = trace_probe_primary_from_call(call);
> + if (WARN_ON_ONCE(!tp))
> + return -ENODEV;
> + enabled = trace_probe_is_enabled(tp);
> +
> + /* This also changes "enabled" state */
> + if (file) {
> + ret = trace_probe_add_file(tp, file);
> + if (ret)
> + return ret;
> + } else
> + trace_probe_set_flag(tp, TP_FLAG_PROFILE);
> +
> + if (enabled)
> + return 0;
> +
> + list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
> + ep = container_of(pos, struct trace_eprobe, tp);
> + ret = enable_eprobe(ep, file);
> + if (ret)
> + break;
> + enabled = true;
> + }
> +
> + if (ret) {
> + /* Failed to enable one of them. Roll back all */
> + if (enabled)
> + disable_eprobe(ep, file->tr);
> + if (file)
> + trace_probe_remove_file(tp, file);
> + else
> + trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
> + }
> +
> + return ret;
> +}
> +
> +static int disable_trace_eprobe(struct trace_event_call *call,
> + struct trace_event_file *file)
> +{
> + struct trace_probe *pos, *tp;
> + struct trace_eprobe *ep;
> +
> + tp = trace_probe_primary_from_call(call);
> + if (WARN_ON_ONCE(!tp))
> + return -ENODEV;
> +
> + if (file) {
> + if (!trace_probe_get_file_link(tp, file))
> + return -ENOENT;
> + if (!trace_probe_has_single_file(tp))
> + goto out;
> + trace_probe_clear_flag(tp, TP_FLAG_TRACE);
> + } else
> + trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
> +
> + if (!trace_probe_is_enabled(tp)) {
> + list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
> + ep = container_of(pos, struct trace_eprobe, tp);
> + disable_eprobe(ep, file->tr);
> + }
> + }
> +
> + out:
> + if (file)
> + /*
> + * Synchronization is done in below function. For perf event,
> + * file == NULL and perf_trace_event_unreg() calls
> + * tracepoint_synchronize_unregister() to ensure synchronize
> + * event. We don't need to care about it.
> + */
> + trace_probe_remove_file(tp, file);
> +
> + return 0;
> +}
> +
> +static int eprobe_register(struct trace_event_call *event,
> + enum trace_reg type, void *data)
> +{
> + struct trace_event_file *file = data;
> +
> + switch (type) {
> + case TRACE_REG_REGISTER:
> + return enable_trace_eprobe(event, file);
> + case TRACE_REG_UNREGISTER:
> + return disable_trace_eprobe(event, file);
> +#ifdef CONFIG_PERF_EVENTS
> + case TRACE_REG_PERF_REGISTER:
> + case TRACE_REG_PERF_UNREGISTER:

Doesn't this support perf? In that case, you can simplify enable_trace_eprobe()
and disable_trace_eprobe(), because 'file' is always not NULL.

> + case TRACE_REG_PERF_OPEN:
> + case TRACE_REG_PERF_CLOSE:
> + case TRACE_REG_PERF_ADD:
> + case TRACE_REG_PERF_DEL:
> + return 0;
> +#endif
> + }
> + return 0;
> +}
> +
> +static inline void init_trace_eprobe_call(struct trace_eprobe *ep)
> +{
> + struct trace_event_call *call = trace_probe_event_call(&ep->tp);
> +
> + call->flags = TRACE_EVENT_FL_EPROBE;
> + call->event.funcs = &eprobe_funcs;
> + call->class->fields_array = eprobe_fields_array;
> + call->class->reg = eprobe_register;
> +}
> +
> +static int __trace_eprobe_create(int argc, const char *argv[])
> +{
> + /*
> + * Argument syntax:
> + * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS]

Ah, OK. ENAME is also omittable.


> + * Fetch args:
> + * <name>=$<field>[:TYPE]
> + */
> + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
> + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
> + const char *sys_event = NULL, *sys_name = NULL;
> + struct trace_eprobe *ep = NULL;
> + char buf1[MAX_EVENT_NAME_LEN];
> + char buf2[MAX_EVENT_NAME_LEN];
> + char *tmp = NULL;
> + int ret = 0;
> + int i;
> +
> + if (argc < 2)
> + return -ECANCELED;
> +
> + trace_probe_log_init("event_probe", argc, argv);
> +
> + event = strchr(&argv[0][1], ':');
> + if (event) {
> + event++;
> + ret = traceprobe_parse_event_name(&event, &group, buf1,
> + event - argv[0], '/');
> + if (ret)
> + goto parse_error;
> + } else {
> + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
> + sanitize_event_name(buf1);
> + event = buf1;
> + }
> + if (!is_good_name(event) || !is_good_name(group))
> + goto parse_error;
> +
> + sys_event = argv[1];
> + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2,
> + sys_event - argv[1], '.');
> + if (ret || !sys_name)
> + goto parse_error;
> + if (!is_good_name(sys_event) || !is_good_name(sys_name))
> + goto parse_error;
> + ep = alloc_event_probe(group, event, sys_name, sys_event, argc - 2);
> + if (IS_ERR(ep)) {
> + ret = PTR_ERR(ep);
> + /* This must return -ENOMEM, else there is a bug */
> + WARN_ON_ONCE(ret != -ENOMEM);
> + goto error; /* We know ep is not allocated */
> + }
> + ret = trace_eprobe_find(ep);
> + if (ret)
> + goto error;

As I said above, this can be called before "alloc_event_probe()".

> +
> + argc -= 2; argv += 2;
> + /* parse arguments */
> + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
> + tmp = kstrdup(argv[i], GFP_KERNEL);
> + if (!tmp) {
> + ret = -ENOMEM;
> + goto error;
> + }
> + ret = traceprobe_parse_probe_arg(&ep->tp, i, tmp, flags);
> + if (ret == -EINVAL)
> + kfree(tmp);
> + if (ret)
> + goto error; /* This can be -ENOMEM */
> + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) {
> + ret = trace_eprobe_tp_arg_find(ep, i);

Here, as I said above too, below code will be better encapsulated.

/* (code->op check is done inside below function) */
ret = trace_eprobe_tp_arg_update(ep, i);
if (ret)
goto error;

> + if (ret)
> + goto error;
> + }
> + }
> + ret = traceprobe_set_print_fmt(&ep->tp, false);
> + if (ret < 0)
> + goto error;
> + init_trace_eprobe_call(ep);
> + mutex_lock(&event_mutex);
> + ret = trace_probe_register_event_call(&ep->tp);
> + if (ret)
> + goto out_unlock;
> + ret = dyn_event_add(&ep->devent);
> +out_unlock:
> + mutex_unlock(&event_mutex);
> + return ret;
> +
> +parse_error:
> + ret = -EINVAL;
> +error:
> + trace_event_probe_cleanup(ep);
> + return ret;
> +}
> +
> +/*
> + * Register dynevent at core_initcall. This allows kernel to setup eprobe
> + * events in postcore_initcall without tracefs.
> + */
> +static __init int trace_events_eprobe_init_early(void)
> +{
> + int err = 0;
> +
> + err = dyn_event_register(&eprobe_dyn_event_ops);
> + if (err)
> + pr_warn("Could not register eprobe_dyn_event_ops\n");
> +
> + return err;
> +}
> +core_initcall(trace_events_eprobe_init_early);
> diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
> index 949ef09dc537..45f5392fb35c 100644
> --- a/kernel/trace/trace_events_hist.c
> +++ b/kernel/trace/trace_events_hist.c
> @@ -66,7 +66,8 @@
> C(EMPTY_SORT_FIELD, "Empty sort field"), \
> C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
> C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
> - C(INVALID_STR_OPERAND, "String type can not be an operand in expression"),
> + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
> + C(SYNTH_ON_EPROBE, "Synthetic event on eprobe is not supported"),

As I and Steve discussed, I think this can be allowed and loops should be
detected and avoided in the other way. Anyway, this part will be better in
the separated patch.
[...]

> diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> index 15413ad7cef2..5a97317e91fb 100644
> --- a/kernel/trace/trace_probe.c
> +++ b/kernel/trace/trace_probe.c
> @@ -227,12 +227,12 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
>
> /* @buf must has MAX_EVENT_NAME_LEN size */
> int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
> - char *buf, int offset)
> + char *buf, int offset, int delim)
> {
> const char *slash, *event = *pevent;
> int len;
>
> - slash = strchr(event, '/');
> + slash = strchr(event, delim);

As I pointed another mail, I'm OK to use both '/' and '.' as delimiter for
kprobes/uprobes too. But it must be a separated patch.

Thank you,


--
Masami Hiramatsu <mhiramat@xxxxxxxxxx>