[RFC] perf: Delayed userspace unwind (Was: [PATCH v3 00/10] x86: ORC unwinder)

From: Peter Zijlstra
Date: Tue Jul 25 2017 - 07:55:27 EST


On Thu, Jul 13, 2017 at 11:19:11AM +0200, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> > > One gloriously ugly hack would be to delay the userspace unwind to
> > > return-to-userspace, at which point we have a schedulable context and can take
> > > faults.
>
> I don't think it's ugly, and it has various advantages:
>
> > > Of course, then you have to somehow identify this later unwind sample with all
> > > relevant prior samples and stitch the whole thing back together, but that
> > > should be doable.
> > >
> > > In fact, it would not be at all hard to do, just queue a task_work from the
> > > NMI and have that do the EH based unwind.
>
> This would have a couple of advantages:
>
> - as you mention, being able to fault in debug info and generally do
> IO/scheduling,
>
> - profiling overhead would be accounted to the task context that generates it,
> not the NMI context,
>
> - there would be a natural batching/coalescing optimization if multiple events
> hit the same system call: the user-space backtrace would only have to be looked
> up once for all samples that got collected.
>
> This could be done by separating the user-space backtrace into a separate event,
> and perf tooling would then apply the same user-space backtrace to all prior
> kernel samples.
>
> I.e. the ring-buffer would have trace entries like:
>
> [ kernel sample #1, with kernel backtrace #1 ]
> [ kernel sample #2, with kernel backtrace #2 ]
> [ kernel sample #3, with kernel backtrace #3 ]
> [ user-space backtrace #1 at syscall return ]
> ...
>
> Note how the three kernel samples didn't have to do any user-space unwinding at
> all, so the user-space unwinding overhead got reduced by a factor of 3.
>
> Tooling would know that 'user-space backtrace #1' applies to the previous three
> kernel samples.
>
> Or so?

Find a compile-tested patch below; someone still needs to teach the userspace
tooling about the new record type though.. not sure I can still make sense of
that code.
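
Roughly, the tooling side would need to do something like the sketch below
(completely untested; tool_task_state() and tool_emit_sample() are made-up
hooks, not actual tools/perf interfaces): park every sample whose user
callchain is still missing per task, and when the PERF_RECORD_CALLCHAIN
record for that task comes by, append its user frames to all parked samples
and flush them through the normal sample path.

/*
 * Untested tool-side sketch; the tool_* helpers are hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct pending_sample {
	struct pending_sample	*next;
	uint64_t		nr;	/* number of kernel ips */
	uint64_t		ips[];	/* kernel part of the callchain */
};

struct task_state {
	struct pending_sample	*pending; /* samples still missing user frames */
};

extern struct task_state *tool_task_state(uint32_t tid);
extern void tool_emit_sample(uint32_t tid, const uint64_t *ips, uint64_t nr);

/* PERF_RECORD_SAMPLE with PERF_SAMPLE_CALLCHAIN: kernel frames only */
static void tool_queue_sample(uint32_t tid, const uint64_t *ips, uint64_t nr)
{
	struct task_state *ts = tool_task_state(tid);
	struct pending_sample *ps;

	ps = malloc(sizeof(*ps) + nr * sizeof(uint64_t));
	if (!ps)
		return;
	ps->nr = nr;
	memcpy(ps->ips, ips, nr * sizeof(uint64_t));

	/* park it until the user callchain for this task shows up */
	ps->next = ts->pending;
	ts->pending = ps;
}

/* PERF_RECORD_CALLCHAIN: user frames, unwound at return-to-userspace */
static void tool_apply_user_callchain(uint32_t tid, const uint64_t *uips,
				      uint64_t unr)
{
	struct task_state *ts = tool_task_state(tid);
	struct pending_sample *ps, *next;

	for (ps = ts->pending; ps; ps = next) {
		uint64_t nr = ps->nr + unr;
		uint64_t *ips = malloc(nr * sizeof(uint64_t));

		if (ips) {
			/* kernel frames first, then the shared user frames */
			memcpy(ips, ps->ips, ps->nr * sizeof(uint64_t));
			memcpy(ips + ps->nr, uips, unr * sizeof(uint64_t));
			tool_emit_sample(tid, ips, nr);
			free(ips);
		}
		next = ps->next;
		free(ps);
	}
	ts->pending = NULL;
}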

---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h | 14 ++++++-
 kernel/events/callchain.c       | 86 ++++++++++++++++++++++++++++++++++++++---
 kernel/events/core.c            | 18 +++------
 4 files changed, 100 insertions(+), 19 deletions(-)
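
For completeness, a consumer would ask for the new behaviour roughly like
this (hypothetical snippet, not part of the patch; it assumes a build
against the patched uapi header):

/*
 * Hypothetical consumer snippet: a sampling event that requests
 * callchains but defers the user half to the PERF_RECORD_CALLCHAIN
 * record emitted at return-to-userspace.
 */
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_deferred_callchain_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN;
	attr.sample_id_all = 1;			/* PERF_RECORD_CALLCHAIN needs sample_id to stitch */
	attr.delayed_user_callchain = 1;	/* the new attribute bit */

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}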

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a3b873fc59e4..241251533e39 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -682,6 +682,7 @@ struct perf_event {
 	int				pending_disable;
 	struct irq_work			pending;
 
+	struct callback_head		pending_callchain;
 	atomic_t			event_limit;
 
 	/* address range filters */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5fa3286..342def57ef34 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -368,7 +368,8 @@ struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				delayed_user_callchain : 1, /* ... */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -915,6 +916,17 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	{ u64			nr,
+	 *	  u64			ips[nr]; } && PERF_SAMPLE_CALLCHAIN
+	 *	struct sample_id		sample_id;
+	 * };
+	 *
+	 */
+	PERF_RECORD_CALLCHAIN			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1b2be63c8528..c98a12f3592c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -12,6 +12,7 @@
 #include <linux/perf_event.h>
 #include <linux/slab.h>
 #include <linux/sched/task_stack.h>
+#include <linux/task_work.h>
 
 #include "internal.h"

@@ -178,19 +179,94 @@ put_callchain_entry(int rctx)
 	put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
 }
 
+static struct perf_callchain_entry __empty = { .nr = 0, };
+
+static void perf_callchain_work(struct callback_head *work)
+{
+	struct perf_event *event = container_of(work, struct perf_event, pending_callchain);
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	size_t size;
+	int ret;
+
+	struct {
+		struct perf_event_header header;
+	} callchain_event = {
+		.header = {
+			.type = PERF_RECORD_CALLCHAIN,
+			.misc = 0,
+			.size = sizeof(callchain_event),
+		},
+	};
+
+	perf_event_header__init_id(&callchain_event.header, &sample, event);
+
+	sample.callchain = get_perf_callchain(task_pt_regs(current),
+					      /* init_nr   */ 0,
+					      /* kernel    */ false,
+					      /* user      */ true,
+					      event->attr.sample_max_stack,
+					      /* crosstask */ false,
+					      /* add_mark  */ true);
+
+	if (!sample.callchain)
+		sample.callchain = &__empty;
+
+	size = sizeof(u64) * (1 + sample.callchain->nr);
+	callchain_event.header.size += size;
+
+	ret = perf_output_begin(&handle, event, callchain_event.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, callchain_event);
+	__output_copy(&handle, sample.callchain, size);
+	perf_event__output_id_sample(event, &handle, &sample);
+	perf_output_end(&handle);
+
+	barrier();
+	work->func = NULL; /* done */
+}
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
-	bool kernel = !event->attr.exclude_callchain_kernel;
-	bool user   = !event->attr.exclude_callchain_user;
+	bool kernel  = !event->attr.exclude_callchain_kernel;
+	bool user    = !event->attr.exclude_callchain_user;
+	bool delayed = event->attr.delayed_user_callchain;
+
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	const u32 max_stack = event->attr.sample_max_stack;
 
-	if (!kernel && !user)
-		return NULL;
+	struct perf_callchain_entry *callchain = NULL;
+
+	if (user && delayed && !crosstask) {
+		struct callback_head *work = &event->pending_callchain;
+
+		if (!work->func) {
+			work->func = perf_callchain_work;
+			/*
+			 * We cannot do set_notify_resume() from NMI context;
+			 * but since we are already in an interrupted context
+			 * and will pass through return-to-userspace, we can
+			 * simply set TIF_NOTIFY_RESUME.
+			 */
+			task_work_add(current, work, false);
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+		}
+		user = false;
+	}
+
+	if (kernel || user) {
+		callchain = get_perf_callchain(regs, 0, kernel, user,
+					       max_stack, crosstask, true);
+	}
+
+	if (!callchain)
+		callchain = &__empty;
 
-	return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
+	return callchain;
 }

struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..26aed7bfbb6a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5857,19 +5857,12 @@ void perf_output_sample(struct perf_output_handle *handle,
 		perf_output_read(handle, event);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (data->callchain) {
-			int size = 1;
-
-			if (data->callchain)
-				size += data->callchain->nr;
+		int size = 1;
 
-			size *= sizeof(u64);
+		size += data->callchain->nr;
+		size *= sizeof(u64);
 
-			__output_copy(handle, data->callchain, size);
-		} else {
-			u64 nr = 0;
-			perf_output_put(handle, nr);
-		}
+		__output_copy(handle, data->callchain, size);
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
@@ -6010,8 +6003,7 @@ void perf_prepare_sample(struct perf_event_header *header,

 		data->callchain = perf_callchain(event, regs);
 
-		if (data->callchain)
-			size += data->callchain->nr;
+		size += data->callchain->nr;
 
 		header->size += size * sizeof(u64);
}