Re: [PATCH v2 bpf-next 1/4] tracing/probe: Add PERF_EVENT_IOC_QUERY_PROBE ioctl
From: Andrii Nakryiko
Date: Mon Aug 12 2019 - 11:57:00 EST
On Fri, Aug 9, 2019 at 2:47 PM Daniel Xu <dxu@xxxxxxxxx> wrote:
>
> It's useful to know [uk]probe's nmissed and nhit stats. For example with
> tracing tools, it's important to know when events may have been lost.
> debugfs currently exposes a control file to get this information, but
> it is not compatible with probes registered with the perf API.
>
> While bpf programs may be able to manually count nhit, there is no way
> to gather nmissed. In other words, it is currently not possible to
> retrieve information about FD-based probes.
>
> This patch adds a new ioctl that lets users query nmissed (as well as
> nhit for completeness). We currently only add support for [uk]probes
> but leave the possibility open for other probes like tracepoint.
>
> Signed-off-by: Daniel Xu <dxu@xxxxxxxxx>
> ---
> include/linux/trace_events.h | 12 ++++++++++++
> include/uapi/linux/perf_event.h | 19 +++++++++++++++++++
> kernel/events/core.c | 20 ++++++++++++++++++++
> kernel/trace/trace_kprobe.c | 23 +++++++++++++++++++++++
> kernel/trace/trace_uprobe.c | 23 +++++++++++++++++++++++
> 5 files changed, 97 insertions(+)
>
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 5150436783e8..61558f19696a 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -586,6 +586,12 @@ extern int bpf_get_kprobe_info(const struct perf_event *event,
> u32 *fd_type, const char **symbol,
> u64 *probe_offset, u64 *probe_addr,
> bool perf_type_tracepoint);
> +extern int perf_kprobe_event_query(struct perf_event *event, void __user *info);
> +#else
> +int perf_kprobe_event_query(struct perf_event *event, void __user *info)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif
> #ifdef CONFIG_UPROBE_EVENTS
> extern int perf_uprobe_init(struct perf_event *event,
> @@ -594,6 +600,12 @@ extern void perf_uprobe_destroy(struct perf_event *event);
> extern int bpf_get_uprobe_info(const struct perf_event *event,
> u32 *fd_type, const char **filename,
> u64 *probe_offset, bool perf_type_tracepoint);
> +extern int perf_uprobe_event_query(struct perf_event *event, void __user *info);
> +#else
> +int perf_uprobe_event_query(struct perf_event *event, void __user *info)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif
> extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> char *filter_str);
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 7198ddd0c6b1..65faa9b2a3b4 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -447,6 +447,24 @@ struct perf_event_query_bpf {
> __u32 ids[0];
> };
>
> +/*
> + * Structure used by below PERF_EVENT_IOC_QUERY_PROBE command
> + * to query information about the probe attached to the perf
> + * event. Currently only supports [uk]probes.
> + */
> +struct perf_event_query_probe {
> + /*
> + * Set by the kernel to indicate number of times this probe
> + * was temporarily disabled
> + */
> + __u64 nmissed;
> + /*
> + * Set by the kernel to indicate number of times this probe
> + * was hit
> + */
> + __u64 nhit;
> +};
> +
> /*
> * Ioctls that can be done on a perf event fd:
> */
> @@ -462,6 +480,7 @@ struct perf_event_query_bpf {
> #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
> #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *)
> #define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *)
> +#define PERF_EVENT_IOC_QUERY_PROBE _IOR('$', 12, struct perf_event_query_probe *)
>
> enum perf_event_ioc_flags {
> PERF_IOC_FLAG_GROUP = 1U << 0,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 026a14541a38..3e0fe6eaaad0 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5060,6 +5060,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg);
> static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
> static int perf_copy_attr(struct perf_event_attr __user *uattr,
> struct perf_event_attr *attr);
> +static int perf_probe_event_query(struct perf_event *event,
> + void __user *info);
>
> static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
> {
> @@ -5143,6 +5145,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
>
> return perf_event_modify_attr(event, &new_attr);
> }
> +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
> + case PERF_EVENT_IOC_QUERY_PROBE:
> + return perf_probe_event_query(event, (void __user *)arg);
> +#endif
> default:
> return -ENOTTY;
> }
> @@ -8833,6 +8839,20 @@ static inline void perf_tp_register(void)
> #endif
> }
>
> +static int perf_probe_event_query(struct perf_event *event,
> + void __user *info)
> +{
> +#ifdef CONFIG_KPROBE_EVENTS
> + if (event->attr.type == perf_kprobe.type)
> + return perf_kprobe_event_query(event, (void __user *)info);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> + if (event->attr.type == perf_uprobe.type)
> + return perf_uprobe_event_query(event, (void __user *)info);
> +#endif
> + return -EINVAL;
> +}
> +
> static void perf_event_free_filter(struct perf_event *event)
> {
> ftrace_profile_free_filter(event);
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 9d483ad9bb6c..a734c2d506be 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -196,6 +196,29 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call)
> return within_error_injection_list(trace_kprobe_address(tk));
> }
>
> +int perf_kprobe_event_query(struct perf_event *event, void __user *info)
> +{
> + struct perf_event_query_probe __user *uquery = info;
> + struct perf_event_query_probe query = {};
> + struct trace_event_call *call = event->tp_event;
> + struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
> + u64 nmissed, nhit;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + if (copy_from_user(&query, uquery, sizeof(query)))
What about forward/backward compatibility? Didn't you have a size
field for perf_event_query_probe?
> + return -EFAULT;
> +
> + nhit = trace_kprobe_nhit(tk);
> + nmissed = tk->rp.kp.nmissed;
> +
> + if (put_user(nmissed, &uquery->nmissed) ||
> + put_user(nhit, &uquery->nhit))
Wouldn't it be nicer to just do one user put for the entire struct (or at
least the relevant part of it, with backward/forward compatibility)?
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> static int register_kprobe_event(struct trace_kprobe *tk);
> static int unregister_kprobe_event(struct trace_kprobe *tk);
>
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 1ceedb9146b1..5f50386ada59 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1333,6 +1333,29 @@ static inline void init_trace_event_call(struct trace_uprobe *tu)
> call->data = tu;
> }
>
> +int perf_uprobe_event_query(struct perf_event *event, void __user *info)
> +{
> + struct perf_event_query_probe __user *uquery = info;
> + struct perf_event_query_probe query = {};
> + struct trace_event_call *call = event->tp_event;
> + struct trace_uprobe *tu = (struct trace_uprobe *)call->data;
> + u64 nmissed, nhit;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + if (copy_from_user(&query, uquery, sizeof(query)))
> + return -EFAULT;
> +
> + nhit = tu->nhit;
> + nmissed = 0;
> +
> + if (put_user(nmissed, &uquery->nmissed) ||
> + put_user(nhit, &uquery->nhit))
> + return -EFAULT;
Same questions as above.
> +
> + return 0;
> +}
> +
> static int register_uprobe_event(struct trace_uprobe *tu)
> {
> init_trace_event_call(tu);
> --
> 2.20.1
>