Re: Q: perf_install_in_context/perf_event_enable are racy?
From: Peter Zijlstra
Date: Fri Jan 28 2011 - 09:57:02 EST
On Fri, 2011-01-28 at 12:52 +0100, Peter Zijlstra wrote:
> Right, so in case the perf_event_task_sched_in() missed the assignment
> of ->perf_event_ctxp[n], then our above story falls flat on its face.
>
> Because then we cannot rely on ->is_active being set for running tasks.
>
> So we need a task_curr() test under that lock, which would need
> perf_event_task_sched_out() to be done _before_ we set rq->curr = next,
> I _think_.
Ok, so how about something like this:
If task_oncpu_function_call() managed to execute the function proper,
we're done. Otherwise, if task_curr() is true while we hold the lock,
the task is now current and we should try again; if it's not, it
cannot become current, because our holding ctx->lock blocks
perf_event_task_sched_in().
Hmm?
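In code, the idea reads roughly like this (just a flattened sketch of what
the perf_event.c hunk below ends up doing):

retry:
	if (!task_oncpu_function_call(task, __perf_install_in_context, event))
		return;		/* the IPI ran the function on the task's CPU */

	raw_spin_lock_irq(&ctx->lock);
	if (task_curr(task)) {
		/* it is (again) running; go back to the IPI */
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}
	/* not current, and it cannot become current while we hold ctx->lock */
	add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);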
---
include/linux/sched.h | 4 +-
kernel/perf_event.c | 23 ++++++++++-------
kernel/sched.c | 65 +++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 73 insertions(+), 19 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f94..b147d73 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2581,8 +2581,8 @@ static inline void inc_syscw(struct task_struct *tsk)
/*
* Call the function if the target task is executing on a CPU right now:
*/
-extern void task_oncpu_function_call(struct task_struct *p,
- void (*func) (void *info), void *info);
+extern int task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info);
#ifdef CONFIG_MM_OWNER
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 852ae8c..0d988b8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1017,6 +1017,7 @@ perf_install_in_context(struct perf_event_context *ctx,
int cpu)
{
struct task_struct *task = ctx->task;
+ int ret;
event->ctx = ctx;
@@ -1031,25 +1032,29 @@ perf_install_in_context(struct perf_event_context *ctx,
}
retry:
- task_oncpu_function_call(task, __perf_install_in_context,
- event);
+ ret = task_oncpu_function_call(task,
+ __perf_install_in_context, event);
+
+ if (!ret)
+ return;
raw_spin_lock_irq(&ctx->lock);
+
/*
- * we need to retry the smp call.
+ * If task_oncpu_function_call() failed, re-check task_curr() now that
+ * we hold ctx->lock; if the task is running, retry the IPI.
*/
- if (ctx->is_active && list_empty(&event->group_entry)) {
+ if (task_curr(task)) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
- * The lock prevents that this context is scheduled in so we
- * can add the event safely, if it the call above did not
- * succeed.
+ * Otherwise the lock prevents this context from being scheduled in, so
+ * we can add the event safely even though the call above did not succeed.
*/
- if (list_empty(&event->group_entry))
- add_event_to_ctx(event, ctx);
+ add_event_to_ctx(event, ctx);
+
raw_spin_unlock_irq(&ctx->lock);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..3686dce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -490,7 +490,10 @@ struct rq {
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
-
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ int in_ctxsw;
+#endif
+
u64 clock;
u64 clock_task;
@@ -2265,6 +2268,34 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */
+struct task_function_call {
+ struct task_struct *p;
+ void (*func)(void *info);
+ void *info;
+ int ret;
+};
+
+void task_function_trampoline(void *data)
+{
+ struct task_function_call *tfc = data;
+ struct task_struct *p = tfc->p;
+ struct rq *rq = this_rq();
+
+ tfc->ret = -EAGAIN;
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ if (rq->in_ctxsw)
+ return;
+#endif
+
+ if (rq->curr != p)
+ return;
+
+ tfc->ret = 0;
+
+ tfc->func(tfc->info);
+}
+
/**
* task_oncpu_function_call - call a function on the cpu on which a task runs
* @p: the task to evaluate
@@ -2273,17 +2304,31 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* Calls the function @func when the task is currently running. This might
* be on the current CPU, which just calls the function directly
+ *
+ * returns: 0 when @func got called, -ESRCH when the task was not running.
*/
-void task_oncpu_function_call(struct task_struct *p,
+int task_oncpu_function_call(struct task_struct *p,
void (*func) (void *info), void *info)
{
+ struct task_function_call data = {
+ .p = p,
+ .func = func,
+ .info = info,
+ };
int cpu;
preempt_disable();
+again:
+ data.ret = -ESRCH; /* No such (running) process */
cpu = task_cpu(p);
- if (task_curr(p))
- smp_call_function_single(cpu, func, info, 1);
+ if (task_curr(p)) {
+ smp_call_function_single(cpu, task_function_trampoline, &data, 1);
+ if (data.ret == -EAGAIN)
+ goto again;
+ }
preempt_enable();
+
+ return data.ret;
}
#ifdef CONFIG_SMP
@@ -2776,9 +2821,15 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ rq->in_ctxsw = 1;
+#endif
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
}
/**
@@ -2822,6 +2873,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
perf_event_task_sched_in(current);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ rq->in_ctxsw = 0;
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
@@ -2911,7 +2963,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(prev, next);
+
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3989,9 +4041,6 @@ need_resched_nonpreemptible:
rq->skip_clock_update = 0;
if (likely(prev != next)) {
- sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next);
-
rq->nr_switches++;
rq->curr = next;
++*switch_count;
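A note on the return values: -EAGAIN covers the target CPU being in the
middle of a context switch (possible with __ARCH_WANT_INTERRUPTS_ON_CTXSW,
where IRQs stay enabled across the switch) as well as the task ceasing to
be current between the task_curr() check and the IPI arriving; in both
cases task_oncpu_function_call() just loops and looks again. A caller only
ever sees 0 or -ESRCH, roughly like this (hypothetical user, made-up names,
not part of the patch):

	/*
	 * Hypothetical example: my_ipi_func() and my_locked_slow_path()
	 * are illustration-only names.
	 */
	static void my_poke(struct task_struct *p, void *my_info)
	{
		int ret = task_oncpu_function_call(p, my_ipi_func, my_info);

		if (!ret)
			return;		/* my_ipi_func() ran on p's CPU */

		/* ret == -ESRCH: p was not running when we looked */
		my_locked_slow_path(p, my_info);
	}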