[PATCH 1/2] perf: Allow closing sibling events' file descriptors

From: Alexander Shishkin
Date: Wed Jul 08 2020 - 11:19:20 EST


Currently, perf requires one file descriptor per event. In large groups,
this may mean running into the limit on open file descriptors. However,
the sibling events in a group only need their file descriptors for the
initial configuration stage, after which they may no longer be needed.

Add an opt-in flag to the perf_event_open() syscall that retains sibling
events after their file descriptors are closed. In that case, the actual
events are destroyed when their group leader is closed.
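
For illustration only (not part of this patch), a minimal userspace
sketch of the intended usage might look like this, assuming a kernel
with this flag applied; the perf_event_open() wrapper and the
open_counting_group() helper are made up for the example:

  #include <linux/perf_event.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>

  /* perf_event_open() has no glibc wrapper */
  static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                             int cpu, int group_fd, unsigned long flags)
  {
          return syscall(__NR_perf_event_open, attr, pid, cpu,
                         group_fd, flags);
  }

  static int open_counting_group(const __u64 *configs, int nr_siblings)
  {
          struct perf_event_attr attr;
          int leader, sibling, i;

          memset(&attr, 0, sizeof(attr));
          attr.size        = sizeof(attr);
          attr.type        = PERF_TYPE_HARDWARE;
          attr.config      = PERF_COUNT_HW_CPU_CYCLES;
          attr.read_format = PERF_FORMAT_GROUP;
          attr.disabled    = 1;

          /* the group leader keeps its file descriptor */
          leader = perf_event_open(&attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
          if (leader < 0)
                  return -1;

          for (i = 0; i < nr_siblings; i++) {
                  attr.config   = configs[i];
                  attr.disabled = 0;
                  sibling = perf_event_open(&attr, 0, -1, leader,
                                            PERF_FLAG_FD_CLOEXEC |
                                            PERF_FLAG_ALLOW_CLOSE);
                  if (sibling < 0)
                          goto err;
                  /* once configured, the fd can go; the event survives */
                  close(sibling);
          }

          ioctl(leader, PERF_EVENT_IOC_ENABLE, 0);

          /* a PERF_FORMAT_GROUP read on the leader returns all counts;
           * close(leader) eventually tears down the closed siblings too */
          return leader;
  err:
          close(leader);  /* this also reaps already-closed siblings */
          return -1;
  }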

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Suggested-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 7 ++
include/uapi/linux/perf_event.h | 1 +
kernel/events/core.c | 149 +++++++++++++++++++++++---------
3 files changed, 115 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3b22db08b6fb..46666ce2c303 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -623,6 +623,11 @@ struct perf_event {
* either suffices for read.
*/
struct list_head sibling_list;
+ /*
+ * ALLOW_CLOSE siblings that were actually closed; when the group
+ * leader goes, so should they.
+ */
+ struct list_head closed_list;
struct list_head active_list;
/*
* Node on the pinned or flexible tree located at the event context;
@@ -644,6 +649,8 @@ struct perf_event {
int event_caps;
/* The cumulative AND of all event_caps for events in this group. */
int group_caps;
+ unsigned allow_close : 1,
+ closed : 1;

struct perf_event *group_leader;
struct pmu *pmu;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 52ca2093831c..69823c0e3cbd 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1093,6 +1093,7 @@ enum perf_callchain_context {
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#define PERF_FLAG_ALLOW_CLOSE (1UL << 4) /* retain the event past fd close */

#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..e61be9cfce98 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -352,7 +352,8 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
- PERF_FLAG_FD_CLOEXEC)
+ PERF_FLAG_FD_CLOEXEC |\
+ PERF_FLAG_ALLOW_CLOSE)

/*
* branch priv levels that need permission checks
@@ -2165,6 +2166,15 @@ static void perf_group_detach(struct perf_event *event)
* to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->closed) {
+ list_move(&sibling->sibling_list, &event->closed_list);
+ event->nr_siblings--;
+ continue;
+ }
+
+ /* Proceed as if it were an ordinary sibling */
+ if (sibling->allow_close)
+ sibling->allow_close = 0;

sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2313,6 +2323,7 @@ __perf_remove_from_context(struct perf_event *event,
void *info)
{
unsigned long flags = (unsigned long)info;
+ struct perf_event *sibling;

if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
@@ -2332,6 +2343,10 @@ __perf_remove_from_context(struct perf_event *event,
cpuctx->task_ctx = NULL;
}
}
+
+ flags &= ~DETACH_GROUP;
+ list_for_each_entry(sibling, &event->closed_list, sibling_list)
+ __perf_remove_from_context(sibling, cpuctx, ctx, (void *)flags);
}

/*
@@ -4906,51 +4921,12 @@ static void put_event(struct perf_event *event)
_free_event(event);
}

-/*
- * Kill an event dead; while event:refcount will preserve the event
- * object, it will not preserve its functionality. Once the last 'user'
- * gives up the object, we'll destroy the thing.
- */
-int perf_event_release_kernel(struct perf_event *event)
+static void perf_event_free_children(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ struct perf_event_context *ctx;
LIST_HEAD(free_list);

- /*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
- */
- if (!ctx) {
- WARN_ON_ONCE(event->attach_state &
- (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
- goto no_ctx;
- }
-
- if (!is_kernel_event(event))
- perf_remove_from_owner(event);
-
- ctx = perf_event_ctx_lock(event);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
-
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Mark this event as STATE_DEAD, there is no external reference to it
- * anymore.
- *
- * Anybody acquiring event->child_mutex after the below loop _must_
- * also see this, most importantly inherit_event() which will avoid
- * placing more children on the list.
- *
- * Thus this guarantees that we will in fact observe and kill _ALL_
- * child events.
- */
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
-
- perf_event_ctx_unlock(event, ctx);
-
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
@@ -5016,6 +4992,82 @@ int perf_event_release_kernel(struct perf_event *event)
smp_mb(); /* pairs with wait_var_event() */
wake_up_var(var);
}
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *sibling, *tmp;
+
+ /*
+ * If we got here through err_file: fput(event_file); we will not have
+ * attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+
+ if (event->allow_close && !event->closed) {
+ event->closed = 1;
+ perf_event_ctx_unlock(event, ctx);
+ return 0;
+ }
+
+ /*
+ * The below will also move all closed siblings to the closed_list,
+ * so that we can reap their children and remove them from the owner.
+ */
+ perf_remove_from_context(event, DETACH_GROUP);
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Mark this event as STATE_DEAD, there is no external reference to it
+ * anymore.
+ *
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list. It will also skip over closed
+ * siblings, as they are also going away together with their leader.
+ *
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
+ */
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
+ perf_event_ctx_unlock(event, ctx);
+
+ perf_event_free_children(event);
+
+ /*
+ * The events on the closed_list are closed former siblings; they
+ * no longer have file descriptors, so this is their teardown.
+ */
+ list_for_each_entry_safe(sibling, tmp, &event->closed_list, sibling_list) {
+ if (!is_kernel_event(sibling))
+ perf_remove_from_owner(sibling);
+ perf_event_free_children(sibling);
+ /*
+ * The put below may drop the last reference, or the
+ * perf_event_exit_event() path may race us for it; we could
+ * ensure one way or the other, but it doesn't matter:
+ * that path is fully equipped to free events.
+ */
+ put_event(sibling);
+ }

no_ctx:
put_event(event); /* Must be the 'last' reference */
@@ -11118,6 +11170,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,

INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->closed_list);
INIT_LIST_HEAD(&event->active_list);
init_event_group(event);
INIT_LIST_HEAD(&event->rb_entry);
@@ -11718,6 +11771,14 @@ SYSCALL_DEFINE5(perf_event_open,
}
}

+ if (flags & PERF_FLAG_ALLOW_CLOSE) {
+ if (!group_leader || group_leader->group_leader != group_leader) {
+ err = -EINVAL;
+ goto err_task;
+ }
+ event->allow_close = 1;
+ }
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -12498,6 +12559,10 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;

+ /* If the group leader is being closed, ignore its closed siblings */
+ if (!group_leader && parent_event->closed)
+ return NULL;
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
--
2.27.0