[PATCH v1 4/6] perf: Allow using AUX data in perf samples
From: Alexander Shishkin
Date: Tue Jun 12 2018 - 03:52:29 EST
AUX data can be used to annotate perf events, such as performance counters
or tracepoints/breakpoints, by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
and profiling by providing, for example, a history of the instruction flow
leading up to the event's overflow.
To do this, the AUX event's file descriptor is passed as group_fd to the
perf_event_open() syscall with the PERF_FLAG_FD_SAMPLE flag set and the
PERF_SAMPLE_AUX bit set in the sample type. Also, a new attribute field,
attr.aux_sample_size, is added to allow the user to specify the desired
size of the AUX sample; the kernel clamps it so that the whole sample
record still fits in the 16-bit header size.
Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 10 ++
include/uapi/linux/perf_event.h | 8 +-
kernel/events/core.c | 158 +++++++++++++++++++++++++++++++-
3 files changed, 174 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7546822a1d74..9f9e341d45cf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -102,6 +102,12 @@ struct perf_branch_stack {
struct perf_branch_entry entries[0];
};
+/*
+ * Describes the chunk of AUX data attached to a sample.
+ *
+ * @size: number of bytes of AUX data accounted for in the sample record,
+ *        rounded up to a multiple of sizeof(u64); zero means no AUX data
+ *        is attached
+ * @from: offset in the AUX buffer where the copied region starts
+ * @to:   offset in the AUX buffer where the copied region ends (the AUX
+ *        head at the time the sample was prepared)
+ */
+struct perf_aux_record {
+ u64 size;
+ unsigned long from;
+ unsigned long to;
+};
+
struct task_struct;
/*
@@ -674,6 +680,8 @@ struct perf_event {
struct bpf_prog *prog;
#endif
+ struct perf_event *sample_event;
+
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
struct event_filter *filter;
@@ -882,6 +890,7 @@ struct perf_sample_data {
*/
u64 addr;
struct perf_raw_record *raw;
+ struct perf_aux_record aux;
struct perf_branch_stack *br_stack;
u64 period;
u64 weight;
@@ -933,6 +942,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
/* remaining struct members initialized in perf_prepare_sample() */
data->addr = addr;
data->raw = NULL;
+ data->aux.from = data->aux.to = data->aux.size = 0;
data->br_stack = NULL;
data->period = period;
data->weight = 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c77c9a2ebbbb..19a22b161e39 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
+ PERF_SAMPLE_AUX = 1U << 20,
- PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
};
/*
@@ -298,6 +299,7 @@ enum perf_event_read_format {
/* add: sample_stack_user */
#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */
/*
* Hardware event_id to monitor via a performance monitoring event:
@@ -416,6 +418,7 @@ struct perf_event_attr {
__u32 aux_watermark;
__u16 sample_max_stack;
__u16 __reserved_2; /* align to __u64 */
+ __u64 aux_sample_size;
};
#define perf_flags(attr) (*(&(attr)->read_format + 1))
@@ -820,6 +823,8 @@ enum perf_event_type {
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ * { u64 size;
+ * char data[size]; } && PERF_SAMPLE_AUX
* };
*/
PERF_RECORD_SAMPLE = 9,
@@ -952,6 +957,7 @@ enum perf_callchain_context {
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#define PERF_FLAG_FD_SAMPLE (1UL << 4) /* use fd event to sample AUX data */
#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e1fce335a42a..70918ed33143 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -346,7 +346,8 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
- PERF_FLAG_FD_CLOEXEC)
+ PERF_FLAG_FD_CLOEXEC |\
+ PERF_FLAG_FD_SAMPLE)
/*
* branch priv levels that need permission checks
@@ -3937,6 +3938,8 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}
+static void put_event(struct perf_event *event);
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -3970,6 +3973,9 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}
+ if (event->sample_event)
+ put_event(event->sample_event);
+
unaccount_event_cpu(event, event->cpu);
unaccount_pmu_sb_event(event);
@@ -5608,6 +5614,100 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+/*
+ * See if we can take an AUX sample. If we can, prepare for writing
+ * the sample and return its size. In this case, perf_aux_sample_output()
+ * will undo the preparations.
+ *
+ * @event: the event being sampled; its ->sample_event provides the AUX data
+ * @data:  sample data; ->aux.{from,to,size} are filled in on success and
+ *         ->aux.size is left at zero otherwise
+ * @size:  upper bound on the number of AUX bytes to include
+ *
+ * "Preparations" are: the sampler's PMU is disabled and the sampler is
+ * stopped. They are only left in place when a non-zero size is returned,
+ * because that is the only case in which perf_output_sample() calls
+ * perf_aux_sample_output() to revert them.
+ *
+ * Returns the number of bytes the AUX sample will occupy in the record
+ * (a multiple of sizeof(u64)), or 0 if no AUX sample can be taken.
+ */
+static unsigned long perf_aux_sample_size(struct perf_event *event,
+					  struct perf_sample_data *data,
+					  size_t size)
+{
+	struct perf_event *sampler = event->sample_event;
+	struct ring_buffer *rb;
+	int *disable_count;
+
+	data->aux.size = 0;
+
+	/*
+	 * Nothing was asked for (or no room is left in the record): bail
+	 * before touching the PMU, as nobody would undo it for a zero-sized
+	 * sample.
+	 */
+	if (!size)
+		goto out;
+
+	if (!sampler || READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)
+		goto out;
+
+	if (READ_ONCE(sampler->oncpu) != smp_processor_id())
+		goto out;
+
+	/*
+	 * Non-zero disable count here means that we, being the NMI
+	 * context, are racing with pmu::add, pmu::del or address filter
+	 * adjustment, which we want to avoid.
+	 */
+	disable_count = this_cpu_ptr(sampler->pmu->pmu_disable_count);
+	if (*disable_count)
+		goto out;
+
+	/* Re-enabled in perf_aux_sample_output() */
+	perf_pmu_disable(sampler->pmu);
+
+	rb = ring_buffer_get(sampler);
+	if (!rb)
+		goto out_enable;
+
+	/* Cannot sample more than the AUX buffer holds */
+	size = min(size, perf_aux_size(rb));
+	if (!size) {
+		/* Empty AUX buffer: don't stop the sampler for nothing */
+		ring_buffer_put(rb);
+		goto out_enable;
+	}
+
+	/* Restarted in perf_aux_sample_output() */
+	sampler->pmu->stop(sampler, PERF_EF_UPDATE);
+	data->aux.to = rb->aux_head;
+
+	/* The region [from, to) may wrap around the end of the AUX buffer */
+	if (data->aux.to < size)
+		data->aux.from = rb->aux_nr_pages * PAGE_SIZE + data->aux.to -
+				 size;
+	else
+		data->aux.from = data->aux.to - size;
+	data->aux.size = ALIGN(size, sizeof(u64));
+	ring_buffer_put(rb);
+
+	return data->aux.size;
+
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+out:
+	return data->aux.size;
+}
+
+/*
+ * Copy the AUX data region prepared by perf_aux_sample_size() into the
+ * sample being written through @handle, zero-pad it up to data->aux.size,
+ * then restart the sampler and re-enable its PMU.
+ *
+ * @event:  the event being sampled; its ->sample_event provides the AUX data
+ * @handle: output handle of the sample record being written
+ * @data:   sample data with ->aux.{from,to,size} set up by
+ *          perf_aux_sample_size()
+ */
+static void perf_aux_sample_output(struct perf_event *event,
+				   struct perf_output_handle *handle,
+				   struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->sample_event;
+	struct ring_buffer *rb;
+	unsigned long pad;
+	int ret;
+
+	/*
+	 * Without a sampler there is no PMU to re-enable; returning here
+	 * avoids dereferencing a NULL sampler in the unwind path below.
+	 */
+	if (WARN_ON_ONCE(!sampler))
+		return;
+
+	if (WARN_ON_ONCE(!data->aux.size))
+		goto out_enable;
+
+	/*
+	 * A non-zero aux.size means perf_aux_sample_size() stopped the
+	 * sampler, so it must be restarted even if the buffer is gone.
+	 */
+	rb = ring_buffer_get(sampler);
+	if (WARN_ON_ONCE(!rb))
+		goto out_start;
+
+	ret = rb_output_aux(rb, data->aux.from, data->aux.to,
+			    (aux_copyfn)perf_output_copy, handle);
+	if (ret < 0) {
+		/*
+		 * NOTE(review): header::size already accounts for
+		 * data->aux.size bytes at this point; confirm that consumers
+		 * cope with the record when the copy fails.
+		 */
+		pr_warn_ratelimited("failed to copy trace data\n");
+		goto out_put;
+	}
+
+	/*
+	 * Zero-fill the gap between what was copied and the u64-aligned
+	 * size. Emit it in chunks so that we never read past the zero
+	 * source word, whatever rb_output_aux() returned.
+	 */
+	pad = data->aux.size - ret;
+	while (pad) {
+		u64 zero = 0;
+		unsigned long len = min_t(unsigned long, pad, sizeof(zero));
+
+		perf_output_copy(handle, &zero, len);
+		pad -= len;
+	}
+
+out_put:
+	ring_buffer_put(rb);
+out_start:
+	sampler->pmu->start(sampler, 0);
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -5926,6 +6026,13 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux.size);
+
+ if (data->aux.size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6112,6 +6219,32 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_aux_sample_size(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static void __always_inline
@@ -9841,6 +9974,17 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,
return gctx;
}
+/*
+ * Decide whether @sample_event can provide AUX samples for @event.
+ *
+ * Both checks must pass before a reference is attempted: @sample_event
+ * has to satisfy has_aux() and be bound to the same cpu as @event.
+ * On success a reference on @sample_event has been taken and true is
+ * returned; otherwise no reference is held and false is returned.
+ */
+static bool
+can_sample_for(struct perf_event *sample_event, struct perf_event *event)
+{
+	if (!has_aux(sample_event))
+		return false;
+
+	if (sample_event->cpu != event->cpu)
+		return false;
+
+	/* Must come last: this is the step that takes the reference */
+	if (!atomic_long_inc_not_zero(&sample_event->refcount))
+		return false;
+
+	return true;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9854,6 +9998,7 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event *sample_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
struct perf_event_context *ctx, *uninitialized_var(gctx);
@@ -9924,6 +10069,8 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = group.file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
+ if (flags & PERF_FLAG_FD_SAMPLE)
+ sample_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
group_leader = NULL;
}
@@ -10146,6 +10293,15 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (sample_event) {
+ /* Grabs sample_event's reference on success */
+ if (!can_sample_for(sample_event, event)) {
+ err = -EINVAL;
+ goto err_locked;
+ }
+
+ event->sample_event = sample_event;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
--
2.17.1