Re: perf counter issue -WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));

From: Peter Zijlstra
Date: Fri May 15 2009 - 16:27:37 EST


On Fri, 2009-05-15 at 19:37 +0200, Peter Zijlstra wrote:
> On Fri, 2009-05-15 at 18:13 +0200, Peter Zijlstra wrote:
> > On Fri, 2009-05-15 at 21:28 +0530, Srivatsa Vaddagiri wrote:
> > > On Fri, May 15, 2009 at 11:51:44AM -0300, Arnaldo Carvalho de Melo wrote:
> > > > > hm, is there a reproducer perhaps? Is there some class file i could
> > > > > run with specific parameters to reproduce it?
> > > >
> > > > I'll try this with some java benchmarks we have, AMQP related, let's see
> > > > if I can reproduce it.
> > >
> > > I tried this with SPECJbb - which I am not at liberty to distribute
> > > unfortunately. I will try and recreate it with volanomark or other open
> > > source benchmarks.
> >
> > I could indeed reproduce with vmark. Am poking at it.. still clueless
> > though ;-)
>
> [root@opteron tmp]# cat foo.c
> #include <pthread.h>
> #include <unistd.h>
>
> void *thread(void *arg)
> {
> sleep(5);
> return NULL;
> }
>
> void main(void)
> {
> pthread_t thr;
> pthread_create(&thr, NULL, thread, NULL);
> }
>
> The above instantly triggers it. It appears we fail to cleanup on the
> reparent path. I'll go root around in exit.c.

OK, so the cleanup isn't solid.. I've been poking at things, and below
is the current state of my tinkering, but it seems to make things
worse...

With only the callback in do_exit() the above test works but hackbench
fails, with only the call in wait_task_zombie() hackbench works and the
above fails.

With both, we segfault the kernel on a list op on either :-)

Will continue poking tomorrow and such, unless someone beats me to it.


---
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *co
}

list_add_rcu(&counter->event_entry, &ctx->event_list);
+ ctx->nr_counters++;
}

static void
@@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *co
{
struct perf_counter *sibling, *tmp;

+ ctx->nr_counters--;
+
list_del_init(&counter->list_entry);
list_del_rcu(&counter->event_entry);

@@ -209,7 +212,6 @@ static void __perf_counter_remove_from_c
counter_sched_out(counter, cpuctx, ctx);

counter->task = NULL;
- ctx->nr_counters--;

/*
* Protect the list operation against NMI by disabling the
@@ -276,7 +278,6 @@ retry:
* succeed.
*/
if (!list_empty(&counter->list_entry)) {
- ctx->nr_counters--;
list_del_counter(counter, ctx);
counter->task = NULL;
}
@@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct pe
struct perf_counter_context *ctx)
{
list_add_counter(counter, ctx);
- ctx->nr_counters++;
counter->prev_state = PERF_COUNTER_STATE_OFF;
counter->tstamp_enabled = ctx->time;
counter->tstamp_running = ctx->time;
@@ -3252,8 +3252,8 @@ __perf_counter_exit_task(struct task_str
*/
if (child != current) {
wait_task_inactive(child, 0);
- list_del_init(&child_counter->list_entry);
update_counter_times(child_counter);
+ list_del_counter(child_counter, child_ctx);
} else {
struct perf_cpu_context *cpuctx;
unsigned long flags;
@@ -3272,9 +3272,7 @@ __perf_counter_exit_task(struct task_str
group_sched_out(child_counter, cpuctx, child_ctx);
update_counter_times(child_counter);

- list_del_init(&child_counter->list_entry);
-
- child_ctx->nr_counters--;
+ list_del_counter(child_counter, child_ctx);

perf_enable();
local_irq_restore(flags);
@@ -3288,13 +3286,6 @@ __perf_counter_exit_task(struct task_str
*/
if (parent_counter) {
sync_child_counter(child_counter, parent_counter);
- list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
- list_entry) {
- if (sub->parent) {
- sync_child_counter(sub, sub->parent);
- free_counter(sub);
- }
- }
free_counter(child_counter);
}
}
@@ -3315,9 +3306,18 @@ void perf_counter_exit_task(struct task_
if (likely(!child_ctx->nr_counters))
return;

+again:
list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
list_entry)
__perf_counter_exit_task(child, child_counter, child_ctx);
+
+ /*
+ * If the last counter was a group counter, it will have appended all
+ * its siblings to the list, but we obtained 'tmp' before that which
+ * will still point to the list head terminating the iteration.
+ */
+ if (!list_empty(&child_ctx->counter_list))
+ goto again;
}

/*
Index: linux-2.6/kernel/exit.c
===================================================================
--- linux-2.6.orig/kernel/exit.c
+++ linux-2.6/kernel/exit.c
@@ -155,7 +155,7 @@ static void delayed_put_task_struct(stru
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

#ifdef CONFIG_PERF_COUNTERS
- WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+ WARN_ON(!list_empty(&tsk->perf_counter_ctx.counter_list));
#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -1002,6 +1002,8 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);

+ perf_counter_exit_task(tsk);
+
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/