[PATCH bpf-next 1/5] perf/core: Add PERF_FORMAT_LOST read_format

From: Daniel Xu
Date: Tue Sep 17 2019 - 09:31:36 EST


It's useful to know kprobe's nmissed count. For example with tracing
tools, it's important to know when events may have been lost. debugfs
currently exposes a control file to get this information, but it is not
compatible with probes registered with the perf API.

While bpf programs may be able to manually count nhit, there is no way
to gather nmissed. In other words, it is currently not possible to this
retrieve information about FD-based probes.

This patch adds a new field to perf's read_format that lets users query
misses. Misses include both misses from the underlying kprobe
infrastructure and misses from ringbuffer infrastructure.

Signed-off-by: Daniel Xu <dxu@xxxxxxxxx>
---
include/linux/trace_events.h | 1 +
include/uapi/linux/perf_event.h | 5 ++++-
kernel/events/core.c | 39 ++++++++++++++++++++++++++++++---
kernel/trace/trace_kprobe.c | 8 +++++++
4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 30a8cdcfd4a4..952520c1240a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -587,6 +587,7 @@ extern int bpf_get_kprobe_info(const struct perf_event *event,
u32 *fd_type, const char **symbol,
u64 *probe_offset, u64 *probe_addr,
bool perf_type_tracepoint);
+extern u64 perf_kprobe_missed(const struct perf_event *event);
#endif
#ifdef CONFIG_UPROBE_EVENTS
extern int perf_uprobe_init(struct perf_event *event,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7198ddd0c6b1..bd874c7257f0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -273,6 +273,7 @@ enum {
* { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 id; } && PERF_FORMAT_ID
+ * { u64 missed; } && PERF_FORMAT_LOST
* } && !PERF_FORMAT_GROUP
*
* { u64 nr;
@@ -280,6 +281,7 @@ enum {
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 value;
* { u64 id; } && PERF_FORMAT_ID
+ * { u64 missed; } && PERF_FORMAT_LOST
* } cntr[nr];
* } && PERF_FORMAT_GROUP
* };
@@ -289,8 +291,9 @@ enum perf_event_read_format {
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
PERF_FORMAT_ID = 1U << 2,
PERF_FORMAT_GROUP = 1U << 3,
+ PERF_FORMAT_LOST = 1U << 4,

- PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
+ PERF_FORMAT_MAX = 1U << 5, /* non-ABI */
};

#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..ee08d3ed6299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1715,6 +1715,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);

+ if (event->attr.read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -4734,6 +4737,24 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

+static struct pmu perf_kprobe;
+static u64 perf_event_lost(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+ u64 lost = 0;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (likely(!!rb))
+ lost += local_read(&rb->lost);
+ rcu_read_unlock();
+
+ if (event->attr.type == perf_kprobe.type)
+ lost += perf_kprobe_missed(event);
+
+ return lost;
+}
+
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
@@ -4770,11 +4791,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(leader);

for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(sub);
}

raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -4831,7 +4856,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;

values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -4841,6 +4866,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(event);

if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6141,7 +6168,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;

values[n++] = perf_event_count(event);
@@ -6155,6 +6182,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(event);

__output_copy(handle, values, n * sizeof(u64));
}
@@ -6165,7 +6194,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ u64 values[6];
int n = 0;

values[n++] = 1 + leader->nr_siblings;
@@ -6183,6 +6212,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(leader);

__output_copy(handle, values, n * sizeof(u64));

@@ -6196,6 +6227,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = perf_event_lost(sub);

__output_copy(handle, values, n * sizeof(u64));
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9d483ad9bb6c..cff471c8750b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -196,6 +196,14 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call)
return within_error_injection_list(trace_kprobe_address(tk));
}

+u64 perf_kprobe_missed(const struct perf_event *event)
+{
+ struct trace_event_call *call = event->tp_event;
+ struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+
+ return tk->rp.kp.nmissed;
+}
+
static int register_kprobe_event(struct trace_kprobe *tk);
static int unregister_kprobe_event(struct trace_kprobe *tk);

--
2.21.0