Re: [PATCH v13 03/12] tracing/probes: Add fprobe events for tracing function entry and exit.

From: Masami Hiramatsu (Google)
Date: Sun May 28 2023 - 23:16:53 EST


On Fri, 26 May 2023 21:38:44 +0200
Florent Revest <revest@xxxxxxxxxxxx> wrote:

> On Fri, May 26, 2023 at 6:18 AM Masami Hiramatsu (Google)
> <mhiramat@xxxxxxxxxx> wrote:
> >
> > [...] Since
> > CONFIG_KPROBES_ON_FTRACE requires the CONFIG_DYNAMIC_FTRACE_WITH_REGS,
> > it is not available if the architecture only supports
> > CONFIG_DYNAMIC_FTRACE_WITH_ARGS. And that means kprobe events can not
> > probe function entry/exit effectively on such architecture.
> > But this can be solved if the dynamic events supports fprobe events.
>
> Currently CONFIG_FPROBE also requires CONFIG_DYNAMIC_FTRACE_WITH_REGS
> so iiuc this will only be true when we'll have migrated fprobe to use
> ftrace_regs instead of pt_regs right ?

Sorry for the confusion, yes, that's right. It currently does, but I will
remove that dependency.

>
> We discussed having fprobe use ftrace_regs instead of pt_regs in the
> past, and I even had a proof-of-concept branch at one point, but this
> patch seems to make that transition quite a bit harder. Have you tried
> to make fprobe work on ftrace_regs on top of this patch?

No, not yet, but that should not be too hard. Let me try.
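
For reference, here is a minimal sketch of the direction (not part of this
patch, and only an assumption about the final form): the fprobe callbacks
would take a struct ftrace_regs instead of pt_regs, e.g. for the entry
handler:

	/* Sketch only: ftrace_regs-based entry callback type. */
	typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip,
				       unsigned long ret_ip,
				       struct ftrace_regs *fregs,
				       void *entry_data);

and the handlers would then go through the generic ftrace_regs accessors
(ftrace_regs_get_argument(), ftrace_regs_return_value(), ...) instead of
dereferencing pt_regs directly.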

Thank you!

>
> > diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
> > new file mode 100644
> > index 000000000000..48dbbc72b7dd
> > --- /dev/null
> > +++ b/kernel/trace/trace_fprobe.c
> > @@ -0,0 +1,1053 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Fprobe-based tracing events
> > + * Copyright (C) 2022 Google LLC.
> > + */
> > +#define pr_fmt(fmt) "trace_fprobe: " fmt
> > +
> > +#include <linux/fprobe.h>
> > +#include <linux/module.h>
> > +#include <linux/rculist.h>
> > +#include <linux/security.h>
> > +#include <linux/uaccess.h>
> > +
> > +#include "trace_dynevent.h"
> > +#include "trace_probe.h"
> > +#include "trace_probe_kernel.h"
> > +#include "trace_probe_tmpl.h"
> > +
> > +#define FPROBE_EVENT_SYSTEM "fprobes"
> > +#define RETHOOK_MAXACTIVE_MAX 4096
> > +
> > +static int trace_fprobe_create(const char *raw_command);
> > +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
> > +static int trace_fprobe_release(struct dyn_event *ev);
> > +static bool trace_fprobe_is_busy(struct dyn_event *ev);
> > +static bool trace_fprobe_match(const char *system, const char *event,
> > + int argc, const char **argv, struct dyn_event *ev);
> > +
> > +static struct dyn_event_operations trace_fprobe_ops = {
> > + .create = trace_fprobe_create,
> > + .show = trace_fprobe_show,
> > + .is_busy = trace_fprobe_is_busy,
> > + .free = trace_fprobe_release,
> > + .match = trace_fprobe_match,
> > +};
> > +
> > +/*
> > + * Fprobe event core functions
> > + */
> > +struct trace_fprobe {
> > + struct dyn_event devent;
> > + struct fprobe fp;
> > + const char *symbol;
> > + struct trace_probe tp;
> > +};
> > +
> > +static bool is_trace_fprobe(struct dyn_event *ev)
> > +{
> > + return ev->ops == &trace_fprobe_ops;
> > +}
> > +
> > +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev)
> > +{
> > + return container_of(ev, struct trace_fprobe, devent);
> > +}
> > +
> > +/**
> > + * for_each_trace_fprobe - iterate over the trace_fprobe list
> > + * @pos: the struct trace_fprobe * for each entry
> > + * @dpos: the struct dyn_event * to use as a loop cursor
> > + */
> > +#define for_each_trace_fprobe(pos, dpos) \
> > + for_each_dyn_event(dpos) \
> > + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos)))
> > +
> > +static bool trace_fprobe_is_return(struct trace_fprobe *tf)
> > +{
> > + return tf->fp.exit_handler != NULL;
> > +}
> > +
> > +static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
> > +{
> > + return tf->symbol ? tf->symbol : "unknown";
> > +}
> > +
> > +static bool trace_fprobe_is_busy(struct dyn_event *ev)
> > +{
> > + struct trace_fprobe *tf = to_trace_fprobe(ev);
> > +
> > + return trace_probe_is_enabled(&tf->tp);
> > +}
> > +
> > +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf,
> > + int argc, const char **argv)
> > +{
> > + char buf[MAX_ARGSTR_LEN + 1];
> > +
> > + if (!argc)
> > + return true;
> > +
> > + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf));
> > + if (strcmp(buf, argv[0]))
> > + return false;
> > + argc--; argv++;
> > +
> > + return trace_probe_match_command_args(&tf->tp, argc, argv);
> > +}
> > +
> > +static bool trace_fprobe_match(const char *system, const char *event,
> > + int argc, const char **argv, struct dyn_event *ev)
> > +{
> > + struct trace_fprobe *tf = to_trace_fprobe(ev);
> > +
> > + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event))
> > + return false;
> > +
> > + if (system && strcmp(trace_probe_group_name(&tf->tp), system))
> > + return false;
> > +
> > + return trace_fprobe_match_command_head(tf, argc, argv);
> > +}
> > +
> > +static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
> > +{
> > + return fprobe_is_registered(&tf->fp);
> > +}
> > +
> > +/*
> > + * Note that we don't verify the fetch_insn code, since it does not come
> > + * from user space.
> > + */
> > +static int
> > +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
> > + void *base)
> > +{
> > + struct pt_regs *regs = rec;
>
> I gave it a try this week and it was mostly a matter of replacing
> pt_regs with ftrace_regs in this file. Like here for example. Not too
> bad so far.
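
Just to illustrate that direction with an untested sketch (assuming the
generic ftrace_regs accessors from <linux/ftrace.h> are available here),
the first stage of process_fetch_insn() could become something like:

	static int
	process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
			   void *base)
	{
		struct ftrace_regs *fregs = rec;
		unsigned long val;
		int ret;

	retry:
		/* 1st stage: get value from context */
		switch (code->op) {
		case FETCH_OP_STACKP:
			val = ftrace_regs_get_stack_pointer(fregs);
			break;
		case FETCH_OP_RETVAL:
			val = ftrace_regs_return_value(fregs);
			break;
	#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
		case FETCH_OP_ARG:
			val = ftrace_regs_get_argument(fregs, code->param);
			break;
	#endif
		case FETCH_NOP_SYMBOL:	/* Ignore a place holder */
			code++;
			goto retry;
		default:
			ret = process_common_fetch_insn(code, &val);
			if (ret < 0)
				return ret;
		}
		code++;

		return process_fetch_insn_bottom(code, val, dest, base);
	}

(FETCH_OP_STACK is left out of this sketch; see the note below about the
missing regs_get_kernel_stack_nth() equivalent.)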
>
> > + unsigned long val;
> > + int ret;
> > +
> > +retry:
> > + /* 1st stage: get value from context */
> > + switch (code->op) {
> > + case FETCH_OP_STACK:
> > + val = regs_get_kernel_stack_nth(regs, code->param);
>
> This does not have a ftrace_regs equivalent at the moment. I suppose
> we could introduce one without too much effort so that's probably ok.
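
A possible shape for that helper, as an untested sketch (assuming the
generic ftrace_regs_get_stack_pointer() accessor is sufficient here, and
with the helper name made up for illustration):

	/*
	 * Sketch of an ftrace_regs counterpart to regs_get_kernel_stack_nth():
	 * return the @nth entry above the stack pointer if it still lies
	 * within the current kernel stack.
	 */
	static nokprobe_inline unsigned long
	ftrace_regs_get_kernel_stack_nth(struct ftrace_regs *fregs,
					 unsigned int nth)
	{
		unsigned long *stackp;

		stackp = (unsigned long *)ftrace_regs_get_stack_pointer(fregs);
		if (((unsigned long)(stackp + nth) & ~(THREAD_SIZE - 1)) ==
		    ((unsigned long)stackp & ~(THREAD_SIZE - 1)))
			return *(stackp + nth);

		return 0;
	}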
>
> > + break;
> > + case FETCH_OP_STACKP:
> > + val = kernel_stack_pointer(regs);
> > + break;
> > + case FETCH_OP_RETVAL:
> > + val = regs_return_value(regs);
> > + break;
> > +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
> > + case FETCH_OP_ARG:
> > + val = regs_get_kernel_argument(regs, code->param);
> > + break;
> > +#endif
> > + case FETCH_NOP_SYMBOL: /* Ignore a place holder */
> > + code++;
> > + goto retry;
> > + default:
> > + ret = process_common_fetch_insn(code, &val);
> > + if (ret < 0)
> > + return ret;
> > + }
> > + code++;
> > +
> > + return process_fetch_insn_bottom(code, val, dest, base);
> > +}
> > +NOKPROBE_SYMBOL(process_fetch_insn)
> > +
> > +/* function entry handler */
> > +static nokprobe_inline void
> > +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + struct pt_regs *regs,
> > + struct trace_event_file *trace_file)
> > +{
> > + struct fentry_trace_entry_head *entry;
> > + struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> > + struct trace_event_buffer fbuffer;
> > + int dsize;
> > +
> > + if (WARN_ON_ONCE(call != trace_file->event_call))
> > + return;
> > +
> > + if (trace_trigger_soft_disabled(trace_file))
> > + return;
> > +
> > + dsize = __get_data_size(&tf->tp, regs);
> > +
> > + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> > + sizeof(*entry) + tf->tp.size + dsize);
> > + if (!entry)
> > + return;
> > +
> > + fbuffer.regs = regs;
> > + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> > + entry->ip = entry_ip;
> > + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> > +
> > + trace_event_buffer_commit(&fbuffer);
> > +}
> > +
> > +static void
> > +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + struct pt_regs *regs)
> > +{
> > + struct event_file_link *link;
> > +
> > + trace_probe_for_each_link_rcu(link, &tf->tp)
> > + __fentry_trace_func(tf, entry_ip, regs, link->file);
> > +}
> > +NOKPROBE_SYMBOL(fentry_trace_func);
> > +
> > +/* function exit handler */
> > +static nokprobe_inline void
> > +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + unsigned long ret_ip, struct pt_regs *regs,
> > + struct trace_event_file *trace_file)
> > +{
> > + struct fexit_trace_entry_head *entry;
> > + struct trace_event_buffer fbuffer;
> > + struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> > + int dsize;
> > +
> > + if (WARN_ON_ONCE(call != trace_file->event_call))
> > + return;
> > +
> > + if (trace_trigger_soft_disabled(trace_file))
> > + return;
> > +
> > + dsize = __get_data_size(&tf->tp, regs);
> > +
> > + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> > + sizeof(*entry) + tf->tp.size + dsize);
> > + if (!entry)
> > + return;
> > +
> > + fbuffer.regs = regs;
> > + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> > + entry->func = entry_ip;
> > + entry->ret_ip = ret_ip;
> > + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> > +
> > + trace_event_buffer_commit(&fbuffer);
> > +}
> > +
> > +static void
> > +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + unsigned long ret_ip, struct pt_regs *regs)
> > +{
> > + struct event_file_link *link;
> > +
> > + trace_probe_for_each_link_rcu(link, &tf->tp)
> > + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
> > +}
> > +NOKPROBE_SYMBOL(fexit_trace_func);
> > +
> > +#ifdef CONFIG_PERF_EVENTS
> > +
> > +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + struct pt_regs *regs)
> > +{
> > + struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> > + struct fentry_trace_entry_head *entry;
> > + struct hlist_head *head;
> > + int size, __size, dsize;
> > + int rctx;
> > +
> > + head = this_cpu_ptr(call->perf_events);
> > + if (hlist_empty(head))
> > + return 0;
> > +
> > + dsize = __get_data_size(&tf->tp, regs);
> > + __size = sizeof(*entry) + tf->tp.size + dsize;
> > + size = ALIGN(__size + sizeof(u32), sizeof(u64));
> > + size -= sizeof(u32);
> > +
> > + entry = perf_trace_buf_alloc(size, NULL, &rctx);
> > + if (!entry)
> > + return 0;
> > +
> > + entry->ip = entry_ip;
> > + memset(&entry[1], 0, dsize);
> > + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> > + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
>
> However, that call concerns me. Perf requires a pt_regs pointer here
> and it expects certain specific fields of that pt_regs to be set (the
> exact requirements don't seem to be explicitly stated anywhere).
>
> For example, on arm64 (the architecture without
> CONFIG_DYNAMIC_FTRACE_WITH_REGS on which I'd like to have fprobe...
> :)) perf calls the user_mode(regs) macro which expects the pstate
> register to be set in pt_regs. However, pstate is invalid outside of
> an exception entry so ftrace_regs on arm64 does not have a pstate
> field at all.
>
> If we migrate fprobe to ftrace_regs and try to construct a sparse
> pt_regs out of a ftrace_regs here, it wouldn't be enough to just copy
> all the registers we know from ftrace_regs into the pt_regs: we would
> also need to make up the pstate register with the knowledge that it
> has to be set in a certain way specifically to please perf... Arch code
> wouldn't only have to provide an
> "expand_ftrace_regs_into_sparse_pt_regs" macro but also an
> "invent_registers_for_perf" macro similar to the current
> perf_arch_fetch_caller_regs macro. This seems rough...
>
> It sounds to me like we should have avoided the use of sparse and
> "made up" pt_regs a long time back, and the more users of fprobe that
> expect pt_regs we add, the more difficult we make it to get fprobe
> working on CONFIG_DYNAMIC_FTRACE_WITH_ARGS.
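
To make that concrete, here is a purely hypothetical sketch reusing the
two made-up helper names from above (their signatures are invented here
just for illustration):

	/*
	 * Hypothetical only: build a pt_regs that perf can consume from
	 * an ftrace_regs, e.g. before calling perf_trace_buf_submit().
	 */
	static struct pt_regs *fprobe_perf_regs(struct ftrace_regs *fregs,
						struct pt_regs *storage)
	{
		struct pt_regs *regs;

		/* Copy the registers that ftrace_regs actually carries. */
		regs = expand_ftrace_regs_into_sparse_pt_regs(fregs, storage);
		/* Make up whatever else perf relies on (e.g. pstate on arm64). */
		invent_registers_for_perf(regs);

		return regs;
	}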
>
> > + head, NULL);
> > + return 0;
> > +}
> > +NOKPROBE_SYMBOL(fentry_perf_func);
> > +
> > +static void
> > +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > + unsigned long ret_ip, struct pt_regs *regs)
> > +{
> > + struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> > + struct fexit_trace_entry_head *entry;
> > + struct hlist_head *head;
> > + int size, __size, dsize;
> > + int rctx;
> > +
> > + head = this_cpu_ptr(call->perf_events);
> > + if (hlist_empty(head))
> > + return;
> > +
> > + dsize = __get_data_size(&tf->tp, regs);
> > + __size = sizeof(*entry) + tf->tp.size + dsize;
> > + size = ALIGN(__size + sizeof(u32), sizeof(u64));
> > + size -= sizeof(u32);
> > +
> > + entry = perf_trace_buf_alloc(size, NULL, &rctx);
> > + if (!entry)
> > + return;
> > +
> > + entry->func = entry_ip;
> > + entry->ret_ip = ret_ip;
> > + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> > + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
> > + head, NULL);
> > +}
> > +NOKPROBE_SYMBOL(fexit_perf_func);
> > +#endif /* CONFIG_PERF_EVENTS */
> > +
> > +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
> > + unsigned long ret_ip, struct pt_regs *regs,
> > + void *entry_data)
> > +{
> > + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
> > + int ret = 0;
> > +
> > + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
> > + fentry_trace_func(tf, entry_ip, regs);
> > +#ifdef CONFIG_PERF_EVENTS
> > + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
> > + ret = fentry_perf_func(tf, entry_ip, regs);
> > +#endif
> > + return ret;
> > +}
> > +NOKPROBE_SYMBOL(fentry_dispatcher);
> > +
> > +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
> > + unsigned long ret_ip, struct pt_regs *regs,
> > + void *entry_data)
> > +{
> > + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
> > +
> > + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
> > + fexit_trace_func(tf, entry_ip, ret_ip, regs);
> > +#ifdef CONFIG_PERF_EVENTS
> > + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
> > + fexit_perf_func(tf, entry_ip, ret_ip, regs);
> > +#endif
> > +}
> > +NOKPROBE_SYMBOL(fexit_dispatcher);


--
Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>