Re: [PATCH] avoid race condition in pick_next_task_fair inkernel/sched_fair.c

From: Peter Zijlstra
Date: Wed Dec 22 2010 - 15:36:40 EST


On Wed, 2010-12-22 at 18:08 +0100, Peter Zijlstra wrote:
> On Wed, 2010-12-22 at 16:14 +0100, Miklos Vajna wrote:
> > kernel-build is a git build using the config I already sent and after a
> > 'git checkout v2.6.36'. I can try to build master as well, so far what I
> > saw is that the bug occurs there as well, but less frequently, so maybe
> > that's a bit harder to debug.
>
> Right, so I've got v2.6.36 exploding, current -git and -tip won't
> explode for me, tried both about 20 times. So I'll try and figure out
> wth makes .36 explode and then see if further kernel still suffer that
> problem.

OK, so something goes wrong with load accounting,

Full log at: http://programming.kicks-ass.net/sekrit/log4

I print the entire runqueue before and after each op, the last good one
is a post-print:

kworker/-9 0d..2. 2013644us : pick_next_task_fair: picked: f6a8de5c, kworker/u:1/1251
kworker/-9 0d..2. 2013644us : print_runqueue <-pick_next_task_fair
kworker/-9 0d..2. 2013645us : __print_runqueue: cfs_rq: c2407c34, nr: 3, load: 3072
kworker/-9 0d..2. 2013645us : __print_runqueue: curr: f6a8de5c, comm: kworker/u:1/1251, load: 1024
kworker/-9 0d..2. 2013646us : __print_runqueue: se: f69e6300, load: 1024,
kworker/-9 0d..2. 2013646us : __print_runqueue: cfs_rq: f69e6540, nr: 2, load: 2048
kworker/-9 0d..2. 2013646us : __print_runqueue: curr: (null)
kworker/-9 0d..2. 2013647us : __print_runqueue: se: f69e65a0, load: 1024,
kworker/-9 0d..2. 2013647us : __print_runqueue: cfs_rq: f69e6240, nr: 1, load: 1024
kworker/-9 0d..2. 2013648us : __print_runqueue: curr: (null)
kworker/-9 0d..2. 2013648us : __print_runqueue: se: f725916c, comm: loadkeys/1225, load: 1024
kworker/-9 0d..2. 2013649us : __print_runqueue: se: f69e6660, load: 1024,
kworker/-9 0d..2. 2013649us : __print_runqueue: cfs_rq: f69e6600, nr: 1, load: 1024
kworker/-9 0d..2. 2013650us : __print_runqueue: curr: (null)
kworker/-9 0d..2. 2013650us : __print_runqueue: se: f6a8da0c, comm: systemd-tmpfile/1246, load: 1024
kworker/-9 0d..2. 2013651us : __print_runqueue: se: f6a8cd1c, comm: kworker/u:1/1252, load: 1024


And then the next pre-print looks like:

systemd--1251 0d..5. 2015398us : enqueue_task_fair <-enqueue_task
systemd--1251 0d..5. 2015398us : print_runqueue <-enqueue_task_fair
systemd--1251 0d..5. 2015399us : __print_runqueue: cfs_rq: c2407c34, nr: 3, load: 3072
systemd--1251 0d..5. 2015400us : __print_runqueue: curr: f6a8de5c, comm: systemd-cgroups/1251, load: 1024
systemd--1251 0d..5. 2015401us : __print_runqueue: se: f69e6300, load: 1024,
systemd--1251 0d..5. 2015401us : __print_runqueue: cfs_rq: f69e6540, nr: 2, load: 2048
systemd--1251 0d..5. 2015402us : __print_runqueue: curr: (null)
systemd--1251 0d..5. 2015402us : __print_runqueue: se: f69e65a0, load: 4137574976,
systemd--1251 0d..5. 2015402us : __print_runqueue: cfs_rq: f69e6240, nr: 1, load: 4137575360
systemd--1251 0d..5. 2015403us : __print_runqueue: curr: (null)
systemd--1251 0d..5. 2015404us : __print_runqueue: se: f725916c, comm: loadkeys/1225, load: 1024
systemd--1251 0d..5. 2015404us : __print_runqueue: se: f69e6660, load: 1024,
systemd--1251 0d..5. 2015404us : __print_runqueue: cfs_rq: f69e6600, nr: 1, load: 1024
systemd--1251 0d..5. 2015405us : __print_runqueue: curr: (null)
systemd--1251 0d..5. 2015406us : __print_runqueue: se: f6a8da0c, comm: systemd-tmpfile/1246, load: 1024
systemd--1251 0d..5. 2015406us : __print_runqueue: se: f6a8cd1c, comm: kworker/u:1/1252, load: 1024


The load value being crazy like this leads to dequeue_task_fair() making
weird decisions since it checks for load==0 to dequeue groups.

Not doing that, then leads to 'dead' groups still being enqueued and
eventually referencing freed memory etc..

I've no clue why any of this happens, it could be random scribbles,
except the se/cfs_rq combo are always off together (although not always
by the same amount).

Anyway, my brain needs sleep, I'll continue poking tomorrow.

---
kernel/sched_fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..8b8b038 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -333,6 +333,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;

+ trace_printk("%p to %p, %ld tasks\n", se, cfs_rq, cfs_rq->nr_running);
+
/*
* Find the right place in the rbtree:
*/
@@ -364,6 +366,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ trace_printk("%p from %p, %ld left\n", se, cfs_rq, cfs_rq->nr_running);
+
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;

@@ -394,6 +398,53 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
return rb_entry(last, struct sched_entity, run_node);
}

+#define INDENT(i) ((i)*2), " "
+
+static void notrace __print_runqueue(struct cfs_rq *cfs_rq, int indent);
+
+static void __always_inline
+__print_entity(struct sched_entity *se, char *name, int indent)
+{
+ if (!se) {
+ trace_printk("%.*s %s: %p\n", INDENT(indent), name, se);
+ return;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ trace_printk("%.*s %s: %p, comm: %s/%d, load: %ld\n",
+ INDENT(indent), name,
+ se, p->comm, p->pid, se->load.weight);
+ } else {
+ trace_printk("%.*s %s: %p, load: %ld, ",
+ INDENT(indent), name,
+ se, se->load.weight);
+ __print_runqueue(group_cfs_rq(se), indent+1);
+ }
+}
+
+static void notrace __print_runqueue(struct cfs_rq *cfs_rq, int indent)
+{
+ struct rb_node *n;
+
+ trace_printk("%.*s cfs_rq: %p, nr: %ld, load: %ld\n",
+ INDENT(indent),
+ cfs_rq, cfs_rq->nr_running,
+ cfs_rq->load.weight);
+
+ __print_entity(cfs_rq->curr, "curr", indent);
+
+ for (n = cfs_rq->rb_leftmost; n; n = rb_next(n)) {
+ struct sched_entity *se = rb_entry(n, struct sched_entity, run_node);
+ __print_entity(se, "se", indent);
+ }
+}
+
+void print_runqueue(struct cfs_rq *cfs_rq)
+{
+ __print_runqueue(cfs_rq, 0);
+}
+
/**************************************************************
* Scheduling class statistics methods:
*/
@@ -1047,6 +1098,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;

+ print_runqueue(&rq->cfs);
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -1055,6 +1107,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
flags = ENQUEUE_WAKEUP;
}

+ trace_printk("add task: %p, %s/%d\n", &p->se, p->comm, p->pid);
+ print_runqueue(&rq->cfs);
+
hrtick_update(rq);
}

@@ -1068,6 +1123,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;

+ print_runqueue(&rq->cfs);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -1077,6 +1133,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
flags |= DEQUEUE_SLEEP;
}

+ trace_printk("del task: %p, %s/%d\n", &p->se, p->comm, p->pid);
+ print_runqueue(&rq->cfs);
+
hrtick_update(rq);
}

@@ -1719,6 +1778,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;

+ print_runqueue(&rq->cfs);
+
if (!cfs_rq->nr_running)
return NULL;

@@ -1731,6 +1792,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
p = task_of(se);
hrtick_start_fair(rq, p);

+ trace_printk("picked: %p, %s/%d\n", se, p->comm, p->pid);
+ print_runqueue(&rq->cfs);
+
return p;
}

@@ -1742,10 +1806,15 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;

+ print_runqueue(&rq->cfs);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
+
+ trace_printk("put: %p, %s/%d\n", se, prev->comm, prev->pid);
+ print_runqueue(&rq->cfs);
+
}

#ifdef CONFIG_SMP
@@ -3819,8 +3888,10 @@ static void set_curr_task_fair(struct rq *rq)
{
struct sched_entity *se = &rq->curr->se;

+ print_runqueue(&rq->cfs);
for_each_sched_entity(se)
set_next_entity(cfs_rq_of(se), se);
+ print_runqueue(&rq->cfs);
}

#ifdef CONFIG_FAIR_GROUP_SCHED

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/