[PATCH 05/12] perf: Fix cloning

From: Peter Zijlstra
Date: Wed Feb 24 2016 - 12:56:00 EST


Alexander reported that when the 'original' context gets destroyed, no
new clones happen.

This can happen irrespective of the ctx switch optimization: any task
can die, even the parent, and we want to continue monitoring the task
hierarchy until we either close the event or no tasks are left in the
hierarchy.

perf_event_init_context() will attempt to pin the 'parent' context
during clone(). At that point current is the parent, and since current
cannot have exited while executing clone(), its context cannot have
passed through perf_event_exit_task_context(). Therefore
perf_pin_task_context() cannot observe ctx->task == TASK_TOMBSTONE.

However, since inherit_event() does:

if (parent_event->parent)
parent_event = parent_event->parent;

it looks at the 'original' event when it does: is_orphaned_event().
This can return true if the context that contains this event has
passed through perf_event_exit_task_context(). And thus we'll fail to
clone the perf context.

Fix this by adding a new state: STATE_DEAD, which is set by
perf_release() to indicate that the filedesc (or kernel reference) is
dead and there are no observers for our data left.

Only for STATE_DEAD will is_orphaned_event() be true and inhibit
cloning.

STATE_EXIT is otherwise preserved such that is_event_hup() remains
functional and will report when the observed task hierarchy becomes
empty.

Fixes: c6e5b73242d2 ("perf: Synchronously clean up child events")
Reported-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Tested-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Reviewed-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 29 ++++++++++++++---------------
2 files changed, 15 insertions(+), 15 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -397,6 +397,7 @@ struct pmu {
* enum perf_event_active_state - the states of a event
*/
enum perf_event_active_state {
+ PERF_EVENT_STATE_DEAD = -4,
PERF_EVENT_STATE_EXIT = -3,
PERF_EVENT_STATE_ERROR = -2,
PERF_EVENT_STATE_OFF = -1,
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,7 +1645,7 @@ static void perf_group_detach(struct per

static bool is_orphaned_event(struct perf_event *event)
{
- return event->state == PERF_EVENT_STATE_EXIT;
+ return event->state == PERF_EVENT_STATE_DEAD;
}

static inline int pmu_filter_match(struct perf_event *event)
@@ -1732,7 +1732,6 @@ group_sched_out(struct perf_event *group
}

#define DETACH_GROUP 0x01UL
-#define DETACH_STATE 0x02UL

/*
* Cross CPU call to remove a performance event
@@ -1752,8 +1751,6 @@ __perf_remove_from_context(struct perf_e
if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_STATE)
- event->state = PERF_EVENT_STATE_EXIT;

if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
@@ -3776,22 +3773,24 @@ int perf_event_release_kernel(struct per

ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
- perf_event_ctx_unlock(event, ctx);
+ perf_remove_from_context(event, DETACH_GROUP);

+ raw_spin_lock_irq(&ctx->lock);
/*
- * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
- * either from the above perf_remove_from_context() or through
- * perf_event_exit_event().
+ * Mark this event as STATE_DEAD, there is no external reference to it
+ * anymore.
*
- * Therefore, anybody acquiring event->child_mutex after the below
- * loop _must_ also see this, most importantly inherit_event() which
- * will avoid placing more children on the list.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
+ perf_event_ctx_unlock(event, ctx);

again:
mutex_lock(&event->child_mutex);
@@ -4004,7 +4003,7 @@ static bool is_event_hup(struct perf_eve
{
bool no_children;

- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;

mutex_lock(&event->child_mutex);
@@ -8731,7 +8730,7 @@ perf_event_exit_event(struct perf_event
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+ child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);

/*