Re: Perf hotplug lockup in v4.9-rc8
From: Peter Zijlstra
Date: Fri Dec 09 2016 - 08:59:14 EST
On Wed, Dec 07, 2016 at 07:34:55PM +0100, Peter Zijlstra wrote:
> @@ -2352,6 +2357,28 @@ perf_install_in_context(struct perf_event_context *ctx,
> return;
> }
> raw_spin_unlock_irq(&ctx->lock);
> +
> + raw_spin_lock_irq(&task->pi_lock);
> + if (!(task->state == TASK_RUNNING || task->state == TASK_WAKING)) {
> + /*
> + * XXX horrific hack...
> + */
> + raw_spin_lock(&ctx->lock);
> + if (task != ctx->task) {
> + raw_spin_unlock(&ctx->lock);
> + raw_spin_unlock_irq(&task->pi_lock);
> + goto again;
> + }
> +
> + add_event_to_ctx(event, ctx);
> + raw_spin_unlock(&ctx->lock);
> + raw_spin_unlock_irq(&task->pi_lock);
> + return;
> + }
> + raw_spin_unlock_irq(&task->pi_lock);
> +
> + cond_resched();
> +
> /*
> * Since !ctx->is_active doesn't mean anything, we must IPI
> * unconditionally.
So while I went back and forth trying to make that less ugly, I figured
there was another problem.
Imagine the cpu_function_call() hitting the 'right' cpu, but not finding
the task current. It will then continue to install the event in the
context. However, that doesn't stop another CPU from pulling the task in
question from our rq and scheduling it elsewhere.
This all led me to the patch below. It now has a rather large comment,
and while it represents my current thinking on the matter, I'm not at
all sure it's entirely correct. I got my brain in a fair twist while
writing it.
Please think about it carefully.
---
kernel/events/core.c | 70 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 48 insertions(+), 22 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6ee1febdf6ff..7d9ae461c535 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2252,7 +2252,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2260,27 +2260,26 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If it's not running, we don't care; ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2331,13 +2330,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(); if the IPI fails to hit the task,
+ * we know any future context switch of the task must see the
+ * perf_event_ctxp[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2351,12 +2373,16 @@ perf_install_in_context(struct perf_event_context *ctx,
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*