[RFC PATCH 4/6] perf: Add infrastructure for using AUX data in perf samples

From: Alexander Shishkin
Date: Fri Sep 23 2016 - 07:30:10 EST


AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples are instrumental in debugging
and profiling by providing, for example, a history of instruction flow
leading up to the event's overflow.

To facilitate this, this patch adds code that creates a kernel counter with
a ring buffer to track and collect AUX data; the collected data is then
copied out into the sampled events' perf data stream as samples.

The user interface is extended to allow for this; the following new
attribute fields are added:

* aux_sample_type: specifies the PMU on which the AUX-data-generating
event is created;
* aux_sample_config: event config for that event (maps to the attribute's
config field);
* aux_sample_size: size of the sample to be written.

This kernel counter is configured similarly to the event being annotated
with regard to filtering (exclude_{hv,idle,user,kernel}) and enabled state
(disabled, enable_on_exec), to make sure that the sampler does not track
any out-of-context activity. One sampler can be used for multiple events.
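
For illustration, a minimal userspace sketch of requesting AUX-annotated
samples; the PMU type value, period and sizes below are hypothetical (in
practice the AUX PMU's type is read from
/sys/bus/event_source/devices/<pmu>/type), and the aux_sample_* fields and
PERF_SAMPLE_AUX come from the uapi header as modified by this patch:

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>

  /* hypothetical pmu::type of the AUX-capable PMU, read from sysfs */
  static int aux_pmu_type = 8;

  static int open_aux_sampled_event(void)
  {
          struct perf_event_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size              = sizeof(attr); /* PERF_ATTR_SIZE_VER6+ */
          attr.type              = PERF_TYPE_HARDWARE;
          attr.config            = PERF_COUNT_HW_CPU_CYCLES;
          attr.sample_period     = 100000;
          attr.sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                                   PERF_SAMPLE_AUX;
          attr.exclude_kernel    = 1;            /* inherited by the sampler */
          attr.aux_sample_type   = aux_pmu_type; /* pmu::type of the AUX PMU */
          attr.aux_sample_config = 0;            /* AUX event's attr.config */
          attr.aux_sample_size   = 4096;         /* AUX bytes per sample */

          /* measure the current task on any CPU */
          return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  }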

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 12 ++
include/uapi/linux/perf_event.h | 16 +-
kernel/events/core.c | 315 +++++++++++++++++++++++++++++++++++++++-
3 files changed, 341 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c5362584a..7121cf7b5c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -101,6 +101,12 @@ struct perf_branch_stack {
struct perf_branch_entry entries[0];
};

+struct perf_aux_record {
+ u64 size;
+ unsigned long from;
+ unsigned long to;
+};
+
struct task_struct;

/*
@@ -532,6 +538,7 @@ struct swevent_hlist {
#define PERF_ATTACH_GROUP 0x02
#define PERF_ATTACH_TASK 0x04
#define PERF_ATTACH_TASK_DATA 0x08
+#define PERF_ATTACH_SAMPLING 0x10

struct perf_cgroup;
struct ring_buffer;
@@ -691,6 +698,9 @@ struct perf_event {
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;

+ struct perf_event *aux_sampler;
+ atomic_long_t aux_samplees_count;
+
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
struct event_filter *filter;
@@ -888,6 +898,7 @@ struct perf_sample_data {
*/
u64 addr;
struct perf_raw_record *raw;
+ struct perf_aux_record aux;
struct perf_branch_stack *br_stack;
u64 period;
u64 weight;
@@ -937,6 +948,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
/* remaining struct members initialized in perf_prepare_sample() */
data->addr = addr;
data->raw = NULL;
+ data->aux.from = data->aux.to = data->aux.size = 0;
data->br_stack = NULL;
data->period = period;
data->weight = 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24..1bf3f2c358 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_AUX = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
@@ -273,6 +274,9 @@ enum perf_event_read_format {
/* add: sample_stack_user */
#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6 136 /* add: aux_sample_type */
+ /* add: aux_sample_config */
+ /* add: aux_sample_size */

/*
* Hardware event_id to monitor via a performance monitoring event:
@@ -390,6 +394,14 @@ struct perf_event_attr {
__u32 aux_watermark;
__u16 sample_max_stack;
__u16 __reserved_2; /* align to __u64 */
+
+ /*
+ * AUX area sampling configuration
+ */
+ __u64 aux_sample_config; /* event config for AUX sampling */
+ __u64 aux_sample_size; /* desired sample size */
+ __u32 aux_sample_type; /* pmu::type of an AUX PMU */
+ __u32 __reserved_3; /* align to __u64 */
};

#define perf_flags(attr) (*(&(attr)->read_format + 1))
@@ -773,6 +785,8 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 size;
+ * char data[size]; } && PERF_SAMPLE_AUX
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b64a5c611f..fdb20fdeb1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2422,6 +2422,25 @@ static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;

+ if (event->aux_sampler) {
+ struct perf_event_context *sctx = event->aux_sampler->ctx;
+
+ lockdep_assert_held(&ctx->mutex);
+
+ if (sctx != ctx) {
+ sctx = perf_event_ctx_lock_nested(event->aux_sampler,
+ SINGLE_DEPTH_NESTING);
+ if (WARN_ON_ONCE(!sctx))
+ goto done;
+ }
+
+ _perf_event_enable(event->aux_sampler);
+
+ if (sctx != ctx)
+ perf_event_ctx_unlock(event->aux_sampler, sctx);
+ }
+
+done:
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
@@ -3855,6 +3874,8 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}

+static void perf_aux_sampler_fini(struct perf_event *event);
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -3886,6 +3907,9 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}

+ if ((event->attr.sample_type & PERF_SAMPLE_AUX))
+ perf_aux_sampler_fini(event);
+
unaccount_event_cpu(event, event->cpu);

unaccount_pmu_sb_event(event);
@@ -3993,6 +4017,23 @@ static void _free_event(struct perf_event *event)

unaccount_event(event);

+ if (kernel_rb_event(event)) {
+ struct perf_event_context *ctx = event->ctx;
+ unsigned long flags;
+
+ /*
+ * This event may not be explicitly freed by
+ * perf_event_release_kernel(), we still need to remove it
+ * from its context.
+ */
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ list_del_event(event, ctx);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ ring_buffer_unaccount(event->rb, false);
+ rb_free_kernel(event->rb, event);
+ }
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -5455,6 +5496,232 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}

+struct perf_event *__find_sampling_counter(struct perf_event_context *ctx,
+ struct perf_event *event,
+ struct task_struct *task)
+{
+ struct perf_event *sampler = NULL;
+
+ list_for_each_entry(sampler, &ctx->event_list, event_entry) {
+ if (kernel_rb_event(sampler) &&
+ sampler->cpu == event->cpu &&
+ sampler->attr.type == event->attr.aux_sample_type &&
+ sampler->attr.config == event->attr.aux_sample_config &&
+ sampler->attr.exclude_hv == event->attr.exclude_hv &&
+ sampler->attr.exclude_idle == event->attr.exclude_idle &&
+ sampler->attr.exclude_user == event->attr.exclude_user &&
+ sampler->attr.exclude_kernel == event->attr.exclude_kernel &&
+ sampler->attr.aux_sample_size >= event->attr.aux_sample_size &&
+ atomic_long_inc_not_zero(&sampler->refcount))
+ return sampler;
+ }
+
+ return NULL;
+}
+
+struct perf_event *find_sampling_counter(struct pmu *pmu,
+ struct perf_event *event,
+ struct task_struct *task)
+{
+ struct perf_event *sampler = NULL;
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ unsigned long flags;
+
+ if (!task) {
+ if (!cpu_online(event->cpu))
+ return NULL;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, event->cpu);
+ ctx = &cpuctx->ctx;
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ } else {
+ ctx = perf_lock_task_context(task, pmu->task_ctx_nr, &flags);
+
+ if (!ctx)
+ return NULL;
+ }
+
+ sampler = __find_sampling_counter(ctx, event, task);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ return sampler;
+}
+
+/*
+ * Sampling AUX data in perf events is done by means of a kernel event that
+ * collects data to its own ring_buffer. This data gets copied out into sampled
+ * event's SAMPLE_AUX records every time the sampled event overflows. One such
+ * kernel event (sampler) can be used to provide samples for multiple events
+ * (samplees) on the same context if their attributes match. Each samplee
+ * holds a reference to the sampler event; the last one out frees the sampler;
+ * perf_event_exit_task() is instructed not to free samplers directly.
+ */
+static int perf_aux_sampler_init(struct perf_event *event,
+ struct task_struct *task,
+ struct pmu *pmu)
+{
+ struct perf_event_attr attr;
+ struct perf_event *sampler;
+ unsigned long nr_pages;
+ int ret;
+
+ if (!pmu || !pmu->setup_aux)
+ return -ENOTSUPP;
+
+ sampler = find_sampling_counter(pmu, event, task);
+ if (!sampler) {
+ memset(&attr, 0, sizeof(attr));
+ attr.type = pmu->type;
+ attr.config = event->attr.aux_sample_config;
+ attr.disabled = 1; /* see below */
+ attr.enable_on_exec = event->attr.enable_on_exec;
+ attr.exclude_hv = event->attr.exclude_hv;
+ attr.exclude_idle = event->attr.exclude_idle;
+ attr.exclude_user = event->attr.exclude_user;
+ attr.exclude_kernel = event->attr.exclude_kernel;
+ attr.aux_sample_size = event->attr.aux_sample_size;
+
+ sampler = perf_event_create_kernel_counter(&attr, event->cpu,
+ task, NULL, NULL);
+ if (IS_ERR(sampler))
+ return PTR_ERR(sampler);
+
+ nr_pages = 1ul << __get_order(event->attr.aux_sample_size);
+
+ ret = rb_alloc_kernel(sampler, 0, nr_pages);
+ if (ret) {
+ perf_event_release_kernel(sampler);
+ return ret;
+ }
+
+ /*
+ * This event will be freed by the last exiting samplee;
+ * perf_event_exit_task() should skip it over.
+ */
+ sampler->attach_state |= PERF_ATTACH_SAMPLING;
+ }
+
+ event->aux_sampler = sampler;
+
+ if (!atomic_long_inc_return(&sampler->aux_samplees_count)) {
+ /*
+ * enable the sampler here unless the original event wants
+ * to stay disabled
+ */
+ if (!event->attr.disabled)
+ perf_event_enable(sampler);
+ }
+
+ return 0;
+}
+
+static void perf_aux_sampler_fini(struct perf_event *event)
+{
+ struct perf_event *sampler = event->aux_sampler;
+
+ if (!sampler)
+ return;
+
+ /*
+ * We're holding a reference to the sampler, so it's always
+ * valid here.
+ */
+ if (atomic_long_dec_and_test(&sampler->aux_samplees_count))
+ perf_event_disable(sampler);
+
+ /* can be last */
+ put_event(sampler);
+
+ event->aux_sampler = NULL;
+}
+
+static unsigned long perf_aux_sampler_trace(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_sampler;
+ struct ring_buffer *rb;
+ int *disable_count;
+
+ data->aux.size = 0;
+
+ if (!sampler || READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)
+ goto out;
+
+ if (READ_ONCE(sampler->oncpu) != smp_processor_id())
+ goto out;
+
+ /*
+ * Non-zero disable count here means that we, being the NMI
+ * context, are racing with pmu::add or pmu::del, both of which
+ * may lead to a dangling hardware event and all manner of mayhem.
+ */
+ disable_count = this_cpu_ptr(sampler->pmu->pmu_disable_count);
+ if (*disable_count)
+ goto out;
+
+ perf_pmu_disable(sampler->pmu);
+
+ rb = ring_buffer_get(sampler);
+ if (!rb) {
+ perf_pmu_enable(sampler->pmu);
+ goto out;
+ }
+
+ sampler->pmu->stop(sampler, PERF_EF_UPDATE);
+
+ data->aux.to = local_read(&rb->aux_head);
+
+ if (data->aux.to < sampler->attr.aux_sample_size)
+ data->aux.from = rb->aux_nr_pages * PAGE_SIZE +
+ data->aux.to - sampler->attr.aux_sample_size;
+ else
+ data->aux.from = data->aux.to -
+ sampler->attr.aux_sample_size;
+ data->aux.size = ALIGN(sampler->attr.aux_sample_size, sizeof(u64));
+ ring_buffer_put(rb);
+
+out:
+ return data->aux.size;
+}
+
+static void perf_aux_sampler_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_sampler;
+ struct ring_buffer *rb;
+ unsigned long pad;
+ int ret;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux.size))
+ goto out_enable;
+
+ rb = ring_buffer_get(sampler);
+ if (WARN_ON_ONCE(!rb))
+ goto out_enable;
+
+ ret = rb_output_aux(rb, data->aux.from, data->aux.to,
+ (aux_copyfn)perf_output_copy, handle);
+ if (ret < 0) {
+ pr_warn_ratelimited("failed to copy trace data\n");
+ goto out;
+ }
+
+ pad = data->aux.size - ret;
+ if (pad) {
+ u64 p = 0;
+
+ perf_output_copy(handle, &p, pad);
+ }
+out:
+ ring_buffer_put(rb);
+ sampler->pmu->start(sampler, 0);
+
+out_enable:
+ perf_pmu_enable(sampler->pmu);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -5774,6 +6041,13 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux.size);
+
+ if (data->aux.size)
+ perf_aux_sampler_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -5907,6 +6181,14 @@ void perf_prepare_sample(struct perf_event_header *header,

header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size = sizeof(u64);
+
+ size += perf_aux_sampler_trace(event, data);
+
+ header->size += size;
+ }
}

static void __always_inline
@@ -6109,6 +6391,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);

+ perf_pmu_enable(event->pmu);
+
if (restart)
perf_event_stop(event, 1);
}
@@ -6673,6 +6957,8 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);

+ perf_pmu_enable(event->pmu);
+
if (restart)
perf_event_stop(event, 1);
}
@@ -9076,10 +9362,27 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}

if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_AUX) {
+ struct pmu *aux_pmu;
+ int idx;
+
+ err = -EINVAL;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ aux_pmu = __perf_find_pmu(event->attr.aux_sample_type);
+ if (aux_pmu)
+ err = perf_aux_sampler_init(event, task,
+ aux_pmu);
+ srcu_read_unlock(&pmus_srcu, idx);
+
+ if (err)
+ goto err_addr_filters;
+ }
+
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ goto err_aux_sampler;
}
}

@@ -9088,6 +9391,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,

return event;

+err_aux_sampler:
+ perf_aux_sampler_fini(event);
+
err_addr_filters:
kfree(event->addr_filters_offs);

@@ -9917,6 +10223,13 @@ perf_event_exit_event(struct perf_event *child_event,
struct perf_event *parent_event = child_event->parent;

/*
+ * Skip over samplers, they are released by the last holder
+ * of their reference.
+ */
+ if (child_event->attach_state & PERF_ATTACH_SAMPLING)
+ return;
+
+ /*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
* random child task.
--
2.9.3