[PATCH 2/2] perf: Userspace software event and ioctl

From: Pawel Moll
Date: Thu Sep 18 2014 - 10:35:49 EST


This patch adds a PERF_COUNT_SW_USERSPACE_EVENT type,
which can be generated by the user with the
PERF_EVENT_IOC_USERSPACE ioctl command; the ioctl injects
an event of said type into the perf buffer.

The ioctl takes a pointer to struct perf_event_userspace
as an argument. The structure begins with a 32-bit
integer type value, which determines the meaning of the
following content (size/data pair). Type 0 is defined
as a zero-terminated string; other types are defined by
userspace (the perf tool will contain a list of
known values with reference implementations of data
content parsers).

Possible use cases for this feature:

- "perf_printf" like mechanism to add logging messages
to one's perf session; an example implementation:

/*
 * perf_printf - format a message and inject it into the perf buffer as a
 * type 0 (zero-terminated string) userspace event.
 *
 * perf_fd: file descriptor of the perf event to issue the ioctl on
 * fmt:     printf-style format string, followed by its arguments
 *
 * Returns the length of the formatted message (excluding the terminator)
 * on success, -1 if formatting or allocation fails, or the negative value
 * returned by ioctl().
 */
int perf_printf(int perf_fd, const char *fmt, ...)
{
	struct perf_event_userspace *event;
	va_list ap;
	int len;
	int err;

	/* First pass only measures the formatted length. */
	va_start(ap, fmt);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len < 0)
		return -1;

	event = malloc(sizeof(*event) + len + 1);
	if (!event)
		return -1;

	event->type = 0;
	event->size = len + 1;

	/*
	 * A va_list is indeterminate once vsnprintf() has consumed it, so
	 * the argument list is restarted for the second pass.
	 */
	va_start(ap, fmt);
	vsnprintf((char *)event->data, len + 1, fmt, ap);
	va_end(ap);

	err = ioctl(perf_fd, PERF_EVENT_IOC_USERSPACE, event);

	free(event);

	return err < 0 ? err : len;
}

- "perf_printf" used by the perf trace tool,
where certain calls of the traced process are intercepted
(e.g. using LD_PRELOAD) and treated as logging
requests, with their output redirected into the
perf buffer

- synchronisation of performance data generated in
user space with the perf stream coming from the kernel.
For example, the marker can be inserted by a JIT engine
after it has generated a portion of the code, but before
the code is executed for the first time, allowing the
post-processor to pick the correct debugging
information.

- another example is a system profiling tool taking data
from sources other than just perf, which generates a marker
at the beginning and at the end of the session
(also possibly periodically during the session) to
synchronise kernel timestamps with clock values
obtained in userspace (gtod or raw_monotonic).

Signed-off-by: Pawel Moll <pawel.moll@xxxxxxx>
---
include/linux/perf_event.h | 8 +++++
include/uapi/linux/perf_event.h | 34 ++++++++++++++++++++-
kernel/events/core.c | 68 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 28b73b2..d904d31 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -64,6 +64,12 @@ struct perf_raw_record {
void *data;
};

+struct perf_userspace_entry { /* kernel-side copy of a userspace event */
+ u32 type; /* 0: data is a zero-terminated string; else userspace-defined */
+ u32 size; /* number of bytes in data[] */
+ u8 data[0]; /* payload, stored directly after the two header fields */
+};
+
/*
* branch stack layout:
* nr: number of taken branches stored in entries[]
@@ -604,6 +610,8 @@ struct perf_sample_data {
u64 txn;
/* Raw monotonic timestamp, for userspace time correlation */
u64 clock_raw_monotonic;
+ /* Userspace-originating event */
+ struct perf_userspace_entry *user_entry;
};

static inline void perf_sample_data_init(struct perf_sample_data *data,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5a75c5..37604ae 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -110,6 +110,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9,
+ PERF_COUNT_SW_USERSPACE_EVENT = 10,

PERF_COUNT_SW_MAX, /* non-ABI */
};
@@ -138,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_CLOCK_RAW_MONOTONIC = 1U << 18,
+ PERF_SAMPLE_USERSPACE_EVENT = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
@@ -337,6 +339,15 @@ struct perf_event_attr {
__u32 __reserved_2;
};

+/*
+ * Userspace-originating event to be generated with PERF_EVENT_IOC_USERSPACE
+ */
+struct perf_event_userspace {
+ __u32 type; /* 0: zero-terminated string; other values defined by userspace */
+ __u32 size; /* length of data[] in bytes */
+ __u8 data[0]; /* payload, size bytes */
+};
+
#define perf_flags(attr) (*(&(attr)->read_format + 1))

/*
@@ -350,6 +361,8 @@ struct perf_event_attr {
#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_USERSPACE _IOW('$', 8, \
+ struct perf_event_userspace *) /* userspace writes the payload, hence _IOW */

enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -688,6 +701,25 @@ enum perf_event_type {
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 clock_raw_monotonic; } && PERF_SAMPLE_CLOCK_RAW_MONOTONIC
+ *
+ * #
+ * # Contents of USERSPACE_EVENT sample data depend on its type.
+ * #
+ * # Type 0 means that the data is a zero-terminated string that
+ * # can be printf-ed in the normal way.
+ * #
+ * # Meaning of other type values depends on the userspace
+ * # and the perf tool code contains a list of those with
+ * # reference implementations of parsers.
+ * #
+ * # Overall size of the sample (including type and size fields)
+ * # is always aligned to 8 bytes by adding padding after
+ * # the data.
+ * #
+ * { u32 type;
+ * u32 size;
+ * char data[size];
+ * char __padding[] } && PERF_SAMPLE_USERSPACE_EVENT
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f6df547..11bf1be 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3655,6 +3655,8 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_sw_userspace_entry(struct perf_event *event,
+ struct perf_event_userspace __user *arg);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -3709,6 +3711,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);

+ case PERF_EVENT_IOC_USERSPACE:
+ return perf_sw_userspace_entry(event,
+ (struct perf_event_userspace __user *)arg);
+
default:
return -ENOTTY;
}
@@ -3728,6 +3734,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_USERSPACE):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -4727,6 +4734,16 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
perf_output_put(handle, data->clock_raw_monotonic);

+ if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+ int size = data->user_entry->size;
+ int padding = ALIGN(size, sizeof(u64)) - size;
+
+ perf_output_put(handle, data->user_entry->type);
+ perf_output_put(handle, size);
+ __output_copy(handle, data->user_entry->data, size);
+ perf_output_skip(handle, padding);
+ };
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -4834,6 +4851,24 @@ void perf_prepare_sample(struct perf_event_header *header,
data->stack_user_size = stack_size;
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+ int size = data->user_entry->size;
+
+ /*
+ * Type 0 means zero-terminated string;
+ * make sure it is terminated
+ */
+ if (!data->user_entry->type)
+ data->user_entry->data[size - 1] = '\0';
+
+ /*
+ * The sample consists of 'type' and 'size' u32 fields
+ * followed by data and padding aligning it to 8 bytes.
+ */
+ header->size += sizeof(u32) + sizeof(u32) +
+ ALIGN(size, sizeof(u64));
+ }
}

static void perf_event_output(struct perf_event *event,
@@ -5961,6 +5996,76 @@ static struct pmu perf_swevent = {
.event_idx = perf_swevent_event_idx,
};

+/*
+ * perf_sw_userspace_entry - inject a userspace-supplied record into @event
+ * @event: perf event the PERF_EVENT_IOC_USERSPACE ioctl was issued on
+ * @arg: user pointer to a struct perf_event_userspace header plus payload
+ *
+ * Copies the header and arg->size payload bytes into a kernel buffer and
+ * emits them as a sample via perf_event_output().
+ *
+ * Returns 0 on success or when the userspace software event is not enabled,
+ * -EINVAL for a NULL @arg or an empty type-0 (string) payload, -E2BIG when
+ * the payload cannot fit in a perf record, -EFAULT when the size field cannot
+ * be read, or the error returned by memdup_user().
+ */
+static int perf_sw_userspace_entry(struct perf_event *event,
+		struct perf_event_userspace __user *arg)
+{
+	u32 size;
+	struct perf_sample_data data;
+	struct pt_regs *regs = current_pt_regs();
+	struct perf_userspace_entry *entry;
+	int err = 0;
+
+	if (!arg)
+		return -EINVAL;
+
+	if (!static_key_false(&perf_swevent_enabled[
+			PERF_COUNT_SW_USERSPACE_EVENT]))
+		return 0;
+
+	BUILD_BUG_ON(sizeof(size) != sizeof(arg->size));
+	if (copy_from_user(&size, &arg->size, sizeof(size)) != 0)
+		return -EFAULT;
+
+	/*
+	 * perf_event_header.size is 16 bits wide, so a larger payload can
+	 * never be represented; the cap also keeps the allocation size below
+	 * from wrapping on 32-bit.
+	 * NOTE(review): other sample fields share the record, so the
+	 * effective limit is lower - confirm in perf_prepare_sample().
+	 */
+	if (size > USHRT_MAX - sizeof(*arg))
+		return -E2BIG;
+
+	BUILD_BUG_ON(sizeof(*arg) != sizeof(*entry));
+	entry = memdup_user(arg, sizeof(*arg) + size);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	/*
+	 * The header was just fetched from userspace a second time and may
+	 * have changed; pin the size the buffer was actually allocated for.
+	 */
+	entry->size = size;
+
+	/* An empty type-0 payload has no room for its terminator. */
+	if (!entry->type && !size) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	perf_sample_data_init(&data, 0, 0);
+	data.user_entry = entry;
+	perf_event_output(event, &data, regs);
+
+out:
+	kfree(entry);
+
+	return err;
+}
+
#ifdef CONFIG_EVENT_TRACING

static int perf_tp_filter_match(struct perf_event *event,
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/