[RFC][PATCH 10/12] perf: Fix task context scheduling

From: Peter Zijlstra
Date: Mon Jan 11 2016 - 11:36:36 EST


There is a very nasty problem wrt disabling the perf task scheduling
hooks.

Currently we {set,clear} ctx->is_active on every
__perf_event_task_sched_{in,out}, _however_ this means that if we
disable these calls we'll have task contexts with ->is_active set that
are not active and 'active' task contexts without ->is_active set.

This can result in event_function_call() looping on the ctx->is_active
condition basically indefinitely.

Resolve this by changing things such that contexts without events do
not set ->is_active like we used to. From this invariant it trivially
follows that if there are no (task) events, every task ctx is inactive
and disabling the context switch hooks is harmless.

This leaves two places that need attention (and already had
accumulated weird and wonderful hacks to work around, without
recognising this actual problem).

Namely:

- perf_install_in_context() will need to deal with installing events
in an inactive context, meaning it cannot rely on ctx-is_active for
its IPIs.

- perf_remove_from_context() will have to mark a context as inactive
when it removes the last event.

For specific detail, see the patch/comments.

Cc: Andi Kleen <andi@xxxxxxxxxxxxxx>
Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/events/core.c | 155 +++++++++++++++++++++++++++++----------------------
1 file changed, 91 insertions(+), 64 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -126,6 +126,38 @@ static int cpu_function_call(int cpu, re
return data.ret;
}

+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ * - removing the last event from a task ctx; this is relatively straight
+ * forward and is done in __perf_remove_from_context.
+ *
+ * - adding the first event to a task ctx; this is tricky because we cannot
+ * rely on ctx->is_active and therefore cannot use event_function_call().
+ * See perf_install_in_context().
+ *
+ * This is because we need a ctx->lock serialized variable (ctx->is_active)
+ * to reliably determine if a particular task/context is scheduled in. The
+ * task_curr() use in task_function_call() is racy in that a remote context
+ * switch is not a single atomic operation.
+ *
+ * As is, the situation is 'safe' because we set rq->curr before we do the
+ * actual context switch. This means that task_curr() will fail early, but
+ * we'll continue spinning on ctx->is_active until we've passed
+ * perf_event_task_sched_out().
+ *
+ * Without this ctx->lock serialized variable we could have race where we find
+ * the task (and hence the context) would not be active while in fact they are.
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
static void event_function_call(struct perf_event *event,
int (*active)(void *),
void (*inactive)(void *),
@@ -1686,9 +1718,13 @@ static int __perf_remove_from_context(vo
if (re->detach_group)
perf_group_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+
+ if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
- cpuctx->task_ctx = NULL;
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ cpuctx->task_ctx = NULL;
+ }
}
raw_spin_unlock(&ctx->lock);

@@ -2056,18 +2092,6 @@ static void perf_event_sched_in(struct p
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

-static void ___perf_install_in_context(void *info)
-{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
-
- /*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
- */
- add_event_to_ctx(event, ctx);
-}
-
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx)
{
@@ -2086,55 +2110,27 @@ static void ctx_resched(struct perf_cpu_
*/
static int __perf_install_in_context(void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
+ struct perf_event_context *ctx = info;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;

- perf_ctx_lock(cpuctx, task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- /*
- * If there was an active task_ctx schedule it out.
- */
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx);
+ if (ctx->task) {
+ /*
+ * If we hit the 'wrong' task, we've since scheduled and
+ * everything should be sorted, nothing to do!
+ */
+ if (ctx->task != current)
+ return 0;

- /*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
- */
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
- raw_spin_lock(&ctx->lock);
+ /*
+ * If task_ctx is set, it had better be to us.
+ */
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
task_ctx = ctx;
}

- if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
- }
-
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-
- update_context_time(ctx);
- /*
- * update cgrp time only if current cgrp
- * matches event->cgrp. Must be done before
- * calling add_event_to_ctx()
- */
- update_cgrp_time_from_event(event);
-
- add_event_to_ctx(event, ctx);
-
- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
-
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_lock(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx);
perf_ctx_unlock(cpuctx, task_ctx);

return 0;
@@ -2148,14 +2144,38 @@ perf_install_in_context(struct perf_even
struct perf_event *event,
int cpu)
{
+ struct task_struct *task = NULL;
+
lockdep_assert_held(&ctx->mutex);

event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;

- event_function_call(event, __perf_install_in_context,
- ___perf_install_in_context, event);
+ /*
+ * Installing events is tricky because we cannot rely on ctx->is_active
+ * to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * So what we do is we add the event to the list here, which will allow
+ * a future context switch to DTRT and then send a racy IPI. If the IPI
+ * fails to hit the right task, this means a context switch must have
+ * happened and that will have taken care of business.
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ update_context_time(ctx);
+ /*
+ * Update cgrp time only if current cgrp matches event->cgrp.
+ * Must be done before calling add_event_to_ctx().
+ */
+ update_cgrp_time_from_event(event);
+ add_event_to_ctx(event, ctx);
+ task = ctx->task;
+ raw_spin_unlock_irq(&ctx->lock);
+
+ if (task)
+ task_function_call(task, __perf_install_in_context, ctx);
+ else
+ cpu_function_call(cpu, __perf_install_in_context, ctx);
}

/*
@@ -2328,6 +2348,16 @@ static void ctx_sched_out(struct perf_ev

lockdep_assert_held(&ctx->lock);

+ if (likely(!ctx->nr_events)) {
+ /*
+ * See __perf_remove_from_context().
+ */
+ WARN_ON_ONCE(ctx->is_active);
+ if (ctx->task)
+ WARN_ON_ONCE(cpuctx->task_ctx);
+ return;
+ }
+
ctx->is_active &= ~event_type;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -2335,9 +2365,6 @@ static void ctx_sched_out(struct perf_ev
cpuctx->task_ctx = NULL;
}

- if (likely(!ctx->nr_events))
- return;
-
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
@@ -2716,6 +2743,9 @@ ctx_sched_in(struct perf_event_context *

lockdep_assert_held(&ctx->lock);

+ if (likely(!ctx->nr_events))
+ return;
+
ctx->is_active |= event_type;
if (ctx->task) {
if (!is_active)
@@ -2724,9 +2754,6 @@ ctx_sched_in(struct perf_event_context *
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}

- if (likely(!ctx->nr_events))
- return;
-
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);