Re: 2.6.22-rc6 spurious hangs

From: Oleg Nesterov
Date: Thu Jun 28 2007 - 10:47:34 EST


On 06/28, Thomas Sattler wrote:
>
> As Ingo told me I run 'echo t > /proc/sysrq-trigger' this time. The
> corresponding part of my syslogs is attached, as well as my kernel config.

xs_connect() and release_dev() are blocked on flush_workqueue(). Perhaps
this is OK, but maybe not.

Could you try the patch below? It dumps some info when flush_workqueue()
hangs.

Oleg.

--- OLD/kernel/sched.c~TST 2007-04-05 12:20:35.000000000 +0400
+++ OLD/kernel/sched.c 2007-06-02 15:41:53.000000000 +0400
@@ -4177,6 +4177,20 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}

+struct task_struct *get_cpu_curr(int cpu) /* debug helper: returns cpu_rq(cpu)->curr with a task reference held */
+{
+ unsigned long flags;
+ struct task_struct *curr;
+ struct rq *rq = cpu_rq(cpu);
+
+ spin_lock_irqsave(&rq->lock, flags); /* rq->curr is sampled while holding rq->lock */
+ curr = rq->curr;
+ get_task_struct(curr); /* reference is taken before unlocking; callers in this patch drop it with put_task_struct() */
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ return curr;
+}
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
--- OLD/kernel/workqueue.c~TST 2007-06-02 13:34:57.000000000 +0400
+++ OLD/kernel/workqueue.c 2007-06-03 11:28:54.000000000 +0400
@@ -49,6 +49,7 @@ struct cpu_workqueue_struct {
struct task_struct *thread;

int run_depth; /* Detect run_workqueue() recursion depth */
+ int jobs;
} ____cacheline_aligned;

/*
@@ -253,6 +254,7 @@ static void run_workqueue(struct cpu_wor

cwq->current_work = work;
list_del_init(cwq->worklist.next);
+ cwq->jobs++;
spin_unlock_irq(&cwq->lock);

BUG_ON(get_wq_data(work) != cwq);
@@ -328,6 +330,47 @@ static void insert_wq_barrier(struct cpu
insert_work(cwq, &barr->work, tail);
}

+extern struct task_struct *get_cpu_curr(int cpu);
+
+static void flush_wait(struct cpu_workqueue_struct *cwq, struct completion *done) /* debug: wait on done, dumping cwq state after every 30*HZ timeout */
+{
+ const int cpu = task_cpu(cwq->thread);
+ struct task_struct *curr;
+ struct work_struct *work;
+ int old_pid, state, jobs;
+
+again:
+ state = cwq->thread->state; /* "before" snapshot; printed next to the post-timeout values below */
+ work = cwq->current_work;
+ jobs = cwq->jobs;
+
+ curr = get_cpu_curr(cpu);
+ old_pid = curr->pid; /* remember which task was current on the worker's CPU before waiting */
+ put_task_struct(curr);
+
+ if (wait_for_completion_timeout(done, HZ * 30)) /* non-zero return: leave without reporting */
+ return;
+
+ printk(KERN_ERR "ERR!! %s flush hang: %p %p %d %d %d %d\n", cwq->thread->comm, /* old/new current_work, old/new jobs, old/new state */
+ work, cwq->current_work, jobs, cwq->jobs,
+ state, (int)cwq->thread->state);
+
+ curr = get_cpu_curr(cpu);
+ printk(KERN_ERR "CURR: %d %d %s %lu %lu\n", old_pid, curr->pid, /* %lu: nivcsw/nvcsw are unsigned long counters in task_struct */
+ curr->comm, curr->nivcsw, curr->nvcsw);
+ put_task_struct(curr);
+
+ spin_lock_irq(&cwq->lock); /* walk the worklist under cwq->lock */
+ list_for_each_entry(work, &cwq->worklist, entry) /* one line per work item still on the worklist */
+ print_symbol(" %s\n", (unsigned long) work->func);
+ printk(" ----\n"); /* separator between the worklist entries and current_work */
+ if (cwq->current_work)
+ print_symbol(" %s\n", (unsigned long) cwq->current_work->func);
+ spin_unlock_irq(&cwq->lock);
+
+ goto again; /* re-sample and wait again until done completes */
+}
+
static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
int active;
@@ -351,7 +394,7 @@ static int flush_cpu_workqueue(struct cp
spin_unlock_irq(&cwq->lock);

if (active)
- wait_for_completion(&barr.done);
+ flush_wait(cwq, &barr.done);
}

return active;

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/