Re: [PATCH v2] perf: Synchronously cleanup child events

From: Peter Zijlstra
Date: Mon Jan 25 2016 - 16:04:22 EST


On Mon, Jan 25, 2016 at 03:54:14PM +0100, Peter Zijlstra wrote:
> Alexander, Alexei,
>
> How about the below? That uses event->state == PERF_EVENT_STATE_EXIT to
> indicate the event has been given up by its 'owner' and decouples us
> from the actual event->owner logic.
>
> This retains the event->owner and event->owner_list thing purely for the
> prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE) calls, but does give
> us strict 'owner' semantics in that:
>
> struct perf_event *my_event = perf_event_create_kernel_counter();
>
> /* ... */
>
> perf_event_release_kernel(my_event);
>
> Or
>
> int fd = sys_perf_event_open(...);
>
> close(fd); /* last, calls fops::release */
>
> Will destroy the event dead. event::refcount will 'retain' the object
> but it will become non-functional and is strictly meant as a temporal
> existence guarantee (for when RCU isn't good enough).
>
> So this should restore the scm_rights case, which preserves the fd but
> can leave the event without an event->owner (and therefore removed from
> the owner's owner_list), which is fine.
>
> BPF still needs to get fixed to use filedesc references instead.

Still no BPF, but this one actually 'works', as in it doesn't have the
blatant exit races and has survived a few hours of runtime.
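
For reference, the scm_rights case from the above is the one where the task
that created the counter hands the fd to someone else and then goes away:
the fd (and thus the event) stays around, it merely loses its event->owner.
Roughly, as a user-space sketch (error handling and the perf_event_attr
details omitted):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Pass an fd over an AF_UNIX socket via SCM_RIGHTS. */
static void send_fd(int sock, int fd)
{
        union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
        char c = 0;
        struct iovec iov = { .iov_base = &c, .iov_len = 1 };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
        sendmsg(sock, &msg, 0);
}

static int recv_fd(int sock)
{
        union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
        char c;
        struct iovec iov = { .iov_base = &c, .iov_len = 1 };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
        };
        int fd;

        recvmsg(sock, &msg, 0);
        memcpy(&fd, CMSG_DATA(CMSG_FIRSTHDR(&msg)), sizeof(int));
        return fd;
}

int main(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .size   = sizeof(attr),
                .config = PERF_COUNT_HW_CPU_CYCLES,
        };
        uint64_t count;
        int sv[2], fd;

        socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);

        if (!fork()) {
                /* creator/owner: count own cycles, hand the fd over, exit */
                fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
                send_fd(sv[0], fd);
                close(fd);      /* not the last reference; the in-flight
                                 * SCM_RIGHTS message pins the file */
                _exit(0);       /* owner exits, event->owner goes away */
        }

        fd = recv_fd(sv[1]);
        wait(NULL);                             /* creator is gone by now */
        read(fd, &count, sizeof(count));        /* the fd is still usable */
        printf("cycles: %llu\n", (unsigned long long)count);
        close(fd);              /* _this_ is the last close -> fops::release */
        return 0;
}

The last close() of that fd, wherever it ends up, is then what actually
destroys the event.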

---
 include/linux/perf_event.h |    3
 kernel/events/core.c       |  304 ++++++++++++++++++++++-----------------------
2 files changed, 150 insertions(+), 157 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context {
int nr_cgroups; /* cgroup evts */
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
-
- struct delayed_work orphans_remove;
- bool orphans_remove_sched;
};

/*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@

#include <asm/irq_regs.h>

-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);

struct remote_function_call {
@@ -1086,8 +1084,8 @@ static void put_ctx(struct perf_event_co
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1646,45 +1644,11 @@ static void perf_group_detach(struct per
perf_event__header_size(tmp);
}

-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
+ return event->state == PERF_EVENT_STATE_EXIT;
}

-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1745,9 +1709,6 @@ event_sched_out(struct perf_event *event
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;

- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}

@@ -1771,6 +1732,9 @@ group_sched_out(struct perf_event *group
cpuctx->exclusive = 0;
}

+#define DETACH_GROUP 0x01
+#define DETACH_STATE 0x02
+
/*
* Cross CPU call to remove a performance event
*
@@ -1783,13 +1747,19 @@ __perf_remove_from_context(struct perf_e
struct perf_event_context *ctx,
void *info)
{
- bool detach_group = (unsigned long)info;
+ unsigned long flags = (unsigned long)info;

event_sched_out(event, cpuctx, ctx);
- if (detach_group)
+
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);

+ if (flags & DETACH_STATE) {
+ event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(event);
+ }
+
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
if (ctx->task) {
@@ -1809,12 +1779,11 @@ __perf_remove_from_context(struct perf_e
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
lockdep_assert_held(&event->ctx->mutex);

- event_function_call(event, __perf_remove_from_context,
- (void *)(unsigned long)detach_group);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}

/*
@@ -1980,9 +1949,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;

- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);

@@ -2253,7 +2219,8 @@ static void __perf_event_enable(struct p
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;

- if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
return;

update_context_time(ctx);
@@ -2298,7 +2265,8 @@ static void _perf_event_enable(struct pe
struct perf_event_context *ctx = event->ctx;

raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -3363,7 +3331,6 @@ static void __perf_event_init_context(st
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}

static struct perf_event_context *
@@ -3665,29 +3632,6 @@ static bool exclusive_event_installable(
return true;
}

-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3709,7 +3653,25 @@ static void _free_event(struct perf_even
if (is_cgroup_event(event))
perf_detach_cgroup(event);

- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}

/*
@@ -3771,8 +3733,10 @@ static void perf_remove_from_owner(struc
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ event->owner = NULL;
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3780,11 +3744,23 @@ static void perf_remove_from_owner(struc

static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;

+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event::refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *child, *tmp;
+ LIST_HEAD(child_list);
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);

@@ -3802,14 +3778,61 @@ static void put_event(struct perf_event
*/
ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
perf_event_ctx_unlock(event, ctx);

- _free_event(event);
-}
+ /*
+ * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+ * either from the above perf_remove_from_context() or through
+ * __perf_event_exit_task().
+ *
+ * Therefore, anybody acquiring event->child_mutex after the below
+ * loop _must_ also see this, most importantly
+ * inherit_event() which will avoid placing more children on the
+ * list.
+ *
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
+ */
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);

-int perf_event_release_kernel(struct perf_event *event)
-{
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
+
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * must stay valid, see __perf_event_exit_task() holding
+ * event->child_mutex.
+ */
+ get_ctx(ctx);
+
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+ /* Must be the last reference */
put_event(event);
return 0;
}
@@ -3820,46 +3843,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_ker
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}

-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -8432,11 +8419,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);

list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}

@@ -8616,7 +8603,7 @@ void perf_pmu_migrate_context(struct pmu
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8665,7 +8652,7 @@ void perf_pmu_migrate_context(struct pmu
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

static void sync_child_event(struct perf_event *child_event,
- struct task_struct *child)
+ struct task_struct *child)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
@@ -8684,25 +8671,6 @@ static void sync_child_event(struct perf
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);

- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}

static void
@@ -8710,6 +8678,11 @@ __perf_event_exit_task(struct perf_event
struct perf_event_context *child_ctx,
struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
+ if (parent_event)
+ mutex_lock(&parent_event->child_mutex);
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8728,20 +8701,38 @@ __perf_event_exit_task(struct perf_event
if (!!child_event->parent)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
raw_spin_unlock_irq(&child_ctx->lock);

/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!child_event->parent) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * This is a child event, cleanup and free.
+ */
+
+ /*
+ * Fold delta back into parent counts.
+ */
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list.
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}

static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -8973,8 +8964,15 @@ inherit_event(struct perf_event *parent_
if (IS_ERR(child_event))
return child_event;

+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel().
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9022,8 +9020,6 @@ inherit_event(struct perf_event *parent_
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);