[RFC PATCH 07/17] perf: Add pmu_info to user page

From: Alexander Shishkin
Date: Tue Sep 05 2017 - 09:42:16 EST


Allow PMUs to supply additional static information that may be required
by their decoders. Most of what the Intel PT driver exports as capability
attributes (timing packet frequencies, frequency ratios, etc.) is
information that its decoder needs in order to correctly decode the
binary stream. However, when decoding an Intel PT stream from a core
dump, we can't rely on the sysfs attributes, so we need to pack this
information into the perf buffer to make the resulting core dump
self-contained.
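
For illustration, a decoder that has nothing but the dumped buffer can
locate this data through the new pmu_offset/pmu_size fields in the user
page. A minimal userspace sketch, assuming the buffer starts with the
user page (read_pmu_info() is hypothetical, not part of this patch):

  #include <linux/perf_event.h>
  #include <stdio.h>
  #include <string.h>

  /* base points at the mmap'd (or dumped) struct perf_event_mmap_page */
  static int read_pmu_info(const void *base)
  {
          const struct perf_event_mmap_page *pc = base;
          struct perf_event_attr attr;

          if (!pc->pmu_size)
                  return -1;      /* no PMU data appended */

          /* the event's attr comes first, the PMU descriptor follows */
          memcpy(&attr, (const char *)base + pc->pmu_offset, sizeof(attr));
          printf("attr.type %u, descriptor of %llu bytes\n", attr.type,
                 (unsigned long long)(pc->pmu_size - sizeof(attr)));
          return 0;
  }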

In order to do this, we append a PMU-specific structure to the user
page. Each such structure includes its own size, for versioning.
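
For example, a PMU driver could embed struct pmu_info at the head of its
descriptor and point its struct pmu::pmu_info at that header. A sketch
under that assumption (pt_pmu_desc and its payload fields are made up
here for illustration):

  /* hypothetical descriptor, with struct pmu_info at its head */
  struct pt_pmu_desc {
          struct pmu_info header;
          u32             tsc_ratio_num;  /* example payload */
          u32             tsc_ratio_den;
  };

  static struct pt_pmu_desc pt_desc = {
          .header = {
                  .note_size  = sizeof(struct pmu_info),
                  .pmu_descsz = sizeof(struct pt_pmu_desc) -
                                sizeof(struct pmu_info),
          },
  };

  /* at init time: pt_pmu.pmu.pmu_info = &pt_desc.header; */

perf_event_init_pmu_info() then copies the event's attr, followed by the
pmu_descsz bytes that follow the header, into the user page.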

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h      | 17 ++++++++++
include/uapi/linux/perf_event.h | 10 ++++++
kernel/events/core.c            | 27 +--------------
kernel/events/internal.h        |  2 +-
kernel/events/ring_buffer.c     | 75 ++++++++++++++++++++++++++++++++++-------
5 files changed, 92 insertions(+), 39 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a07982f48d..b7939e8811 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -235,6 +235,8 @@ struct hw_perf_event {

struct perf_event;

+struct pmu_info;
+
/*
* Common implementation detail of pmu::{start,commit,cancel}_txn
*/
@@ -285,6 +287,9 @@ struct pmu {
/* number of address filters this PMU can do */
unsigned int nr_addr_filters;

+ /* PMU-specific data to append to the user page */
+ const struct pmu_info *pmu_info;
+
/*
* Fully disable/enable this PMU, can be used to protect from the PMI
* as well as for lazy/batch writing of the MSRs.
@@ -508,6 +513,18 @@ struct perf_addr_filters_head {
unsigned int nr_file_filters;
};

+struct pmu_info {
+ /*
+ * Size of this structure, for versioning.
+ */
+ u32 note_size;
+
+ /*
+ * Size of the container structure, not including this one
+ */
+ u32 pmu_descsz;
+};
+
/**
* enum perf_event_active_state - the states of a event
*/
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3d64d9ea80..4cdd4fab9d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -599,6 +599,16 @@ struct perf_event_mmap_page {
__u64 aux_tail;
__u64 aux_offset;
__u64 aux_size;
+
+ /*
+ * PMU data: static info that the (AUX) decoder needs in order to
+ * decode correctly:
+ *
+ * pmu_offset >= sizeof(struct perf_event_mmap_page)
+ * pmu_offset + pmu_size <= PAGE_SIZE
+ */
+ __u64 pmu_offset;
+ __u64 pmu_size;
};

#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fef1f97974..d62ab2d1de 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4926,28 +4926,6 @@ static void calc_timer_values(struct perf_event *event,
*running = ctx_time - event->tstamp_running;
}

-static void perf_event_init_userpage(struct perf_event *event)
-{
- struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- userpg = rb->user_page;
-
- /* Allow new userspace to detect that bit 0 is deprecated */
- userpg->cap_bit0_is_deprecated = 1;
- userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
- userpg->data_offset = PAGE_SIZE;
- userpg->data_size = perf_data_size(rb);
-
-unlock:
- rcu_read_unlock();
-}
-
void __weak arch_perf_update_userpage(
struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
@@ -5385,9 +5363,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
flags |= RING_BUFFER_WRITABLE;

if (!rb) {
- rb = rb_alloc(vma->vm_mm, nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ rb = rb_alloc(event, vma->vm_mm, nr_pages, flags);

if (IS_ERR_OR_NULL(rb)) {
ret = PTR_ERR(rb);
@@ -5399,7 +5375,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)

ring_buffer_attach(event, rb);

- perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 8e267d8faa..4b345ee0d4 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -76,7 +76,7 @@ static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
}

extern struct ring_buffer *
-rb_alloc(struct mm_struct *mm, int nr_pages, long watermark, int cpu,
+rb_alloc(struct perf_event *event, struct mm_struct *mm, int nr_pages,
int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b4d7841025..d7051868d0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -268,10 +268,59 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}

+static void perf_event_init_pmu_info(struct perf_event *event,
+ struct perf_event_mmap_page *userpg)
+{
+ const struct pmu_info *pi = NULL;
+ void *ptr = (void *)userpg + sizeof(*userpg);
+ size_t size = sizeof(event->attr);
+
+ if (event->pmu && event->pmu->pmu_info) {
+ pi = event->pmu->pmu_info;
+ size += pi->pmu_descsz;
+ }
+
+ if (size + sizeof(*userpg) > PAGE_SIZE)
+ return;
+
+ userpg->pmu_offset = offset_in_page(ptr);
+ userpg->pmu_size = size;
+
+ memcpy(ptr, &event->attr, sizeof(event->attr));
+ if (pi) {
+ ptr += sizeof(event->attr);
+ memcpy(ptr, (void *)pi + pi->note_size, pi->pmu_descsz);
+ }
+}
+
+static void perf_event_init_userpage(struct perf_event *event,
+ struct ring_buffer *rb)
+{
+ struct perf_event_mmap_page *userpg;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+ userpg->data_offset = PAGE_SIZE;
+ userpg->data_size = perf_data_size(rb);
+ if (event->attach_state & PERF_ATTACH_DETACHED) {
+ userpg->aux_offset =
+ (event->attr.detached_nr_pages + 1) << PAGE_SHIFT;
+ userpg->aux_size =
+ event->attr.detached_aux_nr_pages << PAGE_SHIFT;
+ }
+
+ perf_event_init_pmu_info(event, userpg);
+}
+
static void
-ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+ring_buffer_init(struct ring_buffer *rb, struct perf_event *event, int flags)
{
long max_size = perf_data_size(rb);
+ long watermark =
+ event->attr.watermark ? event->attr.wakeup_watermark : 0;

if (watermark)
rb->watermark = min(max_size, watermark);
@@ -295,6 +344,8 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
*/
if (!rb->nr_pages)
rb->paused = 1;
+
+ perf_event_init_userpage(event, rb);
}

void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
@@ -776,7 +827,7 @@ int rb_alloc_detached(struct perf_event *event)
* Use overwrite mode (!RING_BUFFER_WRITABLE) for both data and aux
* areas as we don't want wakeups or interrupts.
*/
- rb = rb_alloc(NULL, nr_pages, 0, event->cpu, 0);
+ rb = rb_alloc(event, NULL, nr_pages, 0);
if (IS_ERR(rb))
return PTR_ERR(rb);

@@ -841,8 +892,8 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}

-struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
- int cpu, int flags)
+struct ring_buffer *rb_alloc(struct perf_event *event, struct mm_struct *mm,
+ int nr_pages, int flags)
{
unsigned long size = offsetof(struct ring_buffer, data_pages[nr_pages]);
struct ring_buffer *rb;
@@ -850,26 +901,27 @@ struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,

rb = kzalloc(size, GFP_KERNEL);
if (!rb)
- goto fail;
+ return ERR_PTR(-ENOMEM);

ret = ring_buffer_account(rb, mm, nr_pages, false);
if (ret)
goto fail_free_rb;

ret = -ENOMEM;
- rb->user_page = perf_mmap_alloc_page(cpu);
+ rb->user_page = perf_mmap_alloc_page(event->cpu);
if (!rb->user_page)
goto fail_unaccount;

for (i = 0; i < nr_pages; i++) {
- rb->data_pages[i] = perf_mmap_alloc_page(cpu);
+ rb->data_pages[i] = perf_mmap_alloc_page(event->cpu);
+
if (!rb->data_pages[i])
goto fail_data_pages;
}

rb->nr_pages = nr_pages;

- ring_buffer_init(rb, watermark, flags);
+ ring_buffer_init(rb, event, flags);

return rb;

@@ -885,7 +937,6 @@ struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
fail_free_rb:
kfree(rb);

-fail:
return ERR_PTR(ret);
}

@@ -953,8 +1004,8 @@ void rb_free(struct ring_buffer *rb)
schedule_work(&rb->work);
}

-struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
- int cpu, int flags)
+struct ring_buffer *rb_alloc(struct perf_event *event, struct mm_struct *mm,
+ int nr_pages, int flags)
{
unsigned long size = offsetof(struct ring_buffer, data_pages[1]);
struct ring_buffer *rb;
@@ -983,7 +1034,7 @@ struct ring_buffer *rb_alloc(struct mm_struct *mm, int nr_pages, long watermark,
rb->page_order = ilog2(nr_pages);
}

- ring_buffer_init(rb, watermark, flags);
+ ring_buffer_init(rb, event, flags);

return rb;

--
2.14.1