[RFC PATCH 05/17] perf: Introduce detached events

From: Alexander Shishkin
Date: Tue Sep 05 2017 - 09:35:31 EST


There are use cases where it is desirable to have perf events that do not
need a userspace tool running in the background to keep them alive, and
instead to collect the data only when it is needed, for example when an
MCE event is triggered.

This patch adds a new flag to the perf_event_open() syscall that allows
creating such events. Once such an event is created, its file descriptor
can be closed and the event continues to exist on its own. To allow access
to this event, a file is created in tracefs, which the user can open.

Finally, when it is no longer needed, it can be destroyed by unlinking
the file.

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 4 ++
include/uapi/linux/perf_event.h | 1 +
kernel/events/core.c | 138 ++++++++++++++++++++++++++++++++++++++--
kernel/events/internal.h | 6 ++
4 files changed, 142 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 82b2e3fef9..a07982f48d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -537,6 +537,7 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)

+#define PERF_TRACEFS_HASH_BITS 32
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)

@@ -550,6 +551,7 @@ struct swevent_hlist {
#define PERF_ATTACH_TASK 0x04
#define PERF_ATTACH_TASK_DATA 0x08
#define PERF_ATTACH_ITRACE 0x10
+#define PERF_ATTACH_DETACHED 0x20

struct perf_cgroup;
struct ring_buffer;
@@ -672,6 +674,8 @@ struct perf_event {
struct list_head owner_entry;
struct task_struct *owner;

+ struct dentry *dent;
+
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 140ae638cf..89355584fa 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -946,6 +946,7 @@ enum perf_callchain_context {
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#define PERF_FLAG_DETACHED (1UL << 4) /* event w/o owner */

#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 24099ed9e5..320070410d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -50,11 +50,14 @@
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
+#include <linux/tracefs.h>

#include "internal.h"

#include <asm/irq_regs.h>

+static struct dentry *perf_tracefs_dir;
+
typedef int (*remote_function_f)(void *);

struct remote_function_call {
@@ -346,7 +349,8 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
- PERF_FLAG_FD_CLOEXEC)
+ PERF_FLAG_FD_CLOEXEC |\
+ PERF_FLAG_DETACHED)

/*
* branch priv levels that need permission checks
@@ -4177,6 +4181,12 @@ static void _free_event(struct perf_event *event)

unaccount_event(event);

+ if (event->dent) {
+ tracefs_remove(event->dent);
+
+ event->attach_state &= ~PERF_ATTACH_DETACHED;
+ }
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -5427,8 +5437,27 @@ static int perf_fasync(int fd, struct file *filp, int on)
return 0;
}

+/*
+ * ->open() handler in perf_fops, used when the tracefs file created for a
+ * detached event is opened: pin the event found in the inode's private
+ * data for as long as the file stays open.
+ *
+ * Returns -EINVAL if the inode carries no event, -ENOENT if the event's
+ * refcount has already reached zero, otherwise the result of simple_open().
+ */
+static int perf_open(struct inode *inode, struct file *file)
+{
+	struct perf_event *event = inode->i_private;
+	int err;
+
+	if (WARN_ON_ONCE(!event))
+		return -EINVAL;
+
+	/* an event whose last reference is already gone must not be revived */
+	if (!atomic_long_inc_not_zero(&event->refcount))
+		return -ENOENT;
+
+	err = simple_open(inode, file);
+	if (!err)
+		return 0;
+
+	/* the open failed: give back the reference taken above */
+	put_event(event);
+
+	return err;
+}
+
static const struct file_operations perf_fops = {
.llseek = no_llseek,
+ .open = perf_open,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
@@ -9387,6 +9416,27 @@ static void account_event(struct perf_event *event)
account_pmu_sb_event(event);
}

+/*
+ * Give @event a tracefs file so that it can be reached without the file
+ * descriptor it was created with, and mark it as detached.
+ *
+ * The file is created under perf_tracefs_dir and is named
+ * "task:<hash>.event" when @task is set or "cpu:<hash>.event" otherwise,
+ * where <hash> is a PERF_TRACEFS_HASH_BITS-bit hash of the event's address.
+ * @mm is currently unused.
+ *
+ * Returns 0 on success, or -ENOMEM if either the file name or the tracefs
+ * file could not be created.
+ */
+static int perf_event_detach(struct perf_event *event, struct task_struct *task,
+			     struct mm_struct *mm)
+{
+	char *filename;
+
+	/* go via unsigned long: a direct pointer-to-u64 cast warns on 32-bit */
+	filename = kasprintf(GFP_KERNEL, "%s:%x.event",
+			     task ? "task" : "cpu",
+			     hash_64((u64)(unsigned long)event,
+				     PERF_TRACEFS_HASH_BITS));
+	if (!filename)
+		return -ENOMEM;
+
+	event->dent = tracefs_create_file(filename, 0600,
+					  perf_tracefs_dir,
+					  event, &perf_fops);
+	kfree(filename);
+
+	if (!event->dent)
+		return -ENOMEM;
+
+	/*
+	 * Mark the event as detached: is_detached_event() and
+	 * perf_event_set_output() test this bit and _free_event() clears it,
+	 * so it has to be set once the tracefs file exists.
+	 */
+	event->attach_state |= PERF_ATTACH_DETACHED;
+
+	return 0;
+}
+
/*
* Allocate and initialize a event structure
*/
@@ -9716,6 +9766,10 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
struct ring_buffer *rb = NULL;
int ret = -EINVAL;

+	/*
+	 * Detached events take no part in output redirection. output_event
+	 * may still be NULL at this point (it is only checked further down),
+	 * so it must not be dereferenced unconditionally.
+	 */
+	if ((event->attach_state & PERF_ATTACH_DETACHED) ||
+	    (output_event &&
+	     (output_event->attach_state & PERF_ATTACH_DETACHED)))
+		goto out;
+
if (!output_event)
goto set;

@@ -9876,7 +9930,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
- int move_group = 0;
+ int move_group = 0, detached = 0;
int err;
int f_flags = O_RDWR;
int cgroup_fd = -1;
@@ -9956,6 +10010,16 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}

+ if (flags & PERF_FLAG_DETACHED) {
+ err = -EINVAL;
+
+ /* output redirection and grouping are not allowed */
+ if (output_event || (group_fd != -1))
+ goto err_task;
+
+ detached = 1;
+ }
+
if (task) {
err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
if (err)
@@ -10104,6 +10168,16 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}

+ if (detached) {
+ err = perf_event_detach(event, task, NULL);
+ if (err)
+ goto err_context;
+
+ atomic_long_inc(&event->refcount);
+
+ event_file->private_data = event;
+ }
+
if (move_group) {
gctx = __perf_event_ctx_lock_double(group_leader, ctx);

@@ -10236,7 +10310,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__header_size(event);
perf_event__id_header_size(event);

- event->owner = current;
+ event->owner = detached ? TASK_TOMBSTONE : current;

perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -10250,9 +10324,11 @@ SYSCALL_DEFINE5(perf_event_open,
put_task_struct(task);
}

- mutex_lock(&current->perf_event_mutex);
- list_add_tail(&event->owner_entry, &current->perf_event_list);
- mutex_unlock(&current->perf_event_mutex);
+ if (!detached) {
+ mutex_lock(&current->perf_event_mutex);
+ list_add_tail(&event->owner_entry, &current->perf_event_list);
+ mutex_unlock(&current->perf_event_mutex);
+ }

/*
* Drop the reference on the group_event after placing the
@@ -10492,7 +10568,16 @@ perf_event_exit_event(struct perf_event *child_event,
* Parent events are governed by their filedesc, retain them.
*/
if (!parent_event) {
- perf_event_wakeup(child_event);
+ /*
+ * unless they are DETACHED, in which case we still have
+ * to dispose of them; they have an extra reference with
+ * the DETACHED state and a tracefs file
+ */
+ if (is_detached_event(child_event))
+ put_event(child_event); /* can be last */
+ else
+ perf_event_wakeup(child_event);
+
return;
}
/*
@@ -11205,6 +11290,45 @@ static int __init perf_event_sysfs_init(void)
}
device_initcall(perf_event_sysfs_init);

+/*
+ * Instance callback that refuses the request: @name is ignored and -EACCES
+ * is always returned. It is passed twice to tracefs_create_instance_dir()
+ * for the "perf" directory (presumably as the mkdir and rmdir handlers --
+ * confirm against that function's prototype).
+ */
+static int perf_instance_nop(const char *name)
+{
+ return -EACCES;
+}
+
+/*
+ * Unlink callback for the "perf" tracefs directory: destroy the detached
+ * event behind the file called @name.
+ *
+ * Returns 0 on success, the lookup error if @name cannot be looked up,
+ * -ENOENT if no such file exists, -EINVAL if the file carries no event and
+ * -EBUSY if the event does not have PERF_ATTACH_CONTEXT set.
+ */
+static int perf_instance_unlink(const char *name)
+{
+	struct perf_event *event;
+	struct dentry *dent;
+	int ret = 0;
+
+	/* failure is reported as an ERR_PTR(), never as NULL */
+	dent = lookup_one_len_unlocked(name, perf_tracefs_dir, strlen(name));
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+
+	/* a name that does not exist yields a negative dentry (no inode) */
+	if (!dent->d_inode) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	event = dent->d_inode->i_private;
+	if (!event) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!(event->attach_state & PERF_ATTACH_CONTEXT)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	perf_event_release_kernel(event);
+
+out:
+	/* drop the reference obtained by the lookup on every path */
+	dput(dent);
+
+	return ret;
+}
+
+/*
+ * Create the "perf" tracefs instance directory and remember it in
+ * perf_tracefs_dir. perf_instance_nop() is installed for the first two
+ * callbacks and perf_instance_unlink() for the third.
+ *
+ * Returns 0 on success or -ENOMEM if the directory could not be created.
+ */
+static int __init perf_event_tracefs_init(void)
+{
+	struct dentry *dir;
+
+	dir = tracefs_create_instance_dir("perf", NULL,
+					  perf_instance_nop,
+					  perf_instance_nop,
+					  perf_instance_unlink);
+	perf_tracefs_dir = dir;
+
+	return dir ? 0 : -ENOMEM;
+}
+device_initcall(perf_event_tracefs_init);
+
#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 3e603c45eb..59136a0e98 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -126,6 +126,12 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}

+/*
+ * Report whether @event has PERF_ATTACH_DETACHED set in its attach_state.
+ * Lockdep asserts that the caller holds event->ctx->mutex.
+ */
+static inline bool is_detached_event(struct perf_event *event)
+{
+	lockdep_assert_held(&event->ctx->mutex);
+
+	return (event->attach_state & PERF_ATTACH_DETACHED) != 0;
+}
+
#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
--
2.14.1