[patch] sched-smt-fixes.patch, 2.6.8.1-mm2

From: Ingo Molnar
Date: Fri Aug 20 2004 - 07:28:21 EST



While looking at HT scheduler bug reports and boot failures I discovered
a bad assumption in most of the HT scheduling code: that resched_task()
can be called without holding the task's runqueue lock.

This is most definitely not valid - doing it without locking can lead to
the task on that CPU exiting, with this CPU then corrupting the exited
task's thread_info struct. It can also lead to HT-wakeup races with task
switching on that other CPU (this_CPU marking the wrong task on
that_CPU as needing a resched - resulting in e.g. idle wakeups not
working).
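
To illustrate the rule the patch enforces, here is a minimal sketch of
what a correct caller looks like. The kick_task_example() wrapper is
hypothetical and not part of the patch; it just uses the existing
task_rq_lock()/task_rq_unlock() helpers in kernel/sched.c:

	static void kick_task_example(task_t *p)
	{
		unsigned long flags;
		runqueue_t *rq;

		/* pins p to its runqueue and disables irqs */
		rq = task_rq_lock(p, &flags);

		/* safe: p cannot exit and its thread_info cannot go away */
		resched_task(p);

		task_rq_unlock(rq, &flags);
	}

With the lock held the HT-wakeup race above also goes away, because the
runqueue's ->curr task cannot switch under us while we mark it.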

The attached patch against 2.6.8.1-mm2 fixes it all up. Changes:

- resched_task() needs to touch the task, so the runqueue lock of that
  task's CPU must be held: resched_task() now enforces this rule.

- wake_priority_sleeper() was called without holding the runqueue lock.

- wake_sleeping_dependent() needs to hold the runqueue locks of all siblings
  (typically 2). The effects of this ripple back into schedule() as well - in
  the non-SMT case the code gets compiled out, so it's fine. (A condensed
  sketch of the sibling-locking pattern follows this list.)

- dependent_sleeper() needs the runqueue locks too - and it's slightly harder
  because it wants to know the 'next task' info, which might change while the
  locks are dropped and reacquired. This too ripples back into schedule() =>
  compiled out on non-SMT, so it's fine.

- resched_task() was disabling preemption for no good reason - all paths
that called this function had either a spinlock held or irqs disabled.
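
To make the lock-ordering point concrete, here is a condensed,
illustrative-only sketch of the pattern wake_sleeping_dependent() and
dependent_sleeper() follow in the patch below. The
smt_lock_siblings_example() helper is hypothetical; cpu_rq(),
for_each_cpu_mask() and the sibling_map handling are taken from the
patch itself:

	static void smt_lock_siblings_example(int this_cpu, runqueue_t *this_rq,
					      cpumask_t sibling_map)
	{
		int i;

		/*
		 * Drop our own lock first, then take every sibling runqueue
		 * lock in ascending CPU order - that way two siblings can
		 * never lock each other's runqueues in opposite order.
		 * IRQs stay disabled throughout.
		 */
		spin_unlock(&this_rq->lock);

		for_each_cpu_mask(i, sibling_map)
			spin_lock(&cpu_rq(i)->lock);

		/*
		 * this_rq->lock is held again here (this_cpu is part of
		 * sibling_map), but anything derived from this_rq before the
		 * unlock - e.g. the 'next task' in dependent_sleeper() - must
		 * be re-established, since the runqueue may have changed.
		 */

		/* clear ourselves so the final unlock loop keeps this_rq locked */
		cpu_clear(this_cpu, sibling_map);

		/* ... per-sibling work goes here ... */

		for_each_cpu_mask(i, sibling_map)
			spin_unlock(&cpu_rq(i)->lock);

		/* we return with this_rq->lock held and IRQs still disabled */
	}

schedule() has to cope with the lock drop as well, which is why the
patch rechecks rq->nr_running after calling either function.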

Compiled & booted on x86 SMP and UP, with and without SMT. Booted the
SMT kernel on a real SMP+HT box as well. (The unpatched kernel wouldn't
even boot with the resched_task() assert in place.)

Ingo

Locking in the SMT scheduler was completely wrong:

- resched_task() needs to touch the task, so the runqueue lock of that
  task's CPU must be held. resched_task() now enforces this rule.

- resched_task() was disabling preemption for no good reason - all paths
that called this function had either a spinlock held or irqs disabled.

- wake_priority_sleeper() was called without holding the runqueue lock.

- wake_sleeping_dependent() needs to hold the runqueue locks of all siblings
  (typically 2). The effects of this ripple back into schedule() as well - in
  the non-SMT case the code gets compiled out, so it's fine.

- dependent_sleeper() needs the runqueue locks too - and it's slightly harder
  because it wants to know the 'next task' info, which might change while the
  locks are dropped and reacquired. This too ripples back into schedule() =>
  compiled out on non-SMT, so it's fine.

Compiled & booted on x86 SMP and UP, with and without SMT. Booted the SMT
kernel on a real SMP+HT box as well.

Signed-off-by: Ingo Molnar <mingo@xxxxxxx>

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -919,7 +919,8 @@ static void resched_task(task_t *p)
{
int need_resched, nrpolling;

- preempt_disable();
+ BUG_ON(!spin_is_locked(&task_rq(p)->lock));
+
/* minimise the chance of sending an interrupt to poll_idle() */
nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
@@ -927,7 +928,6 @@ static void resched_task(task_t *p)

if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
smp_send_reschedule(task_cpu(p));
- preempt_enable();
}
#else
static inline void resched_task(task_t *p)
@@ -2407,8 +2407,10 @@ void scheduler_tick(int user_ticks, int
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
+ spin_lock(&rq->lock);
if (wake_priority_sleeper(rq))
- goto out;
+ goto out_unlock;
+ spin_unlock(&rq->lock);
rebalance_tick(cpu, rq, IDLE);
return;
}
@@ -2497,23 +2499,34 @@ out:
}

#ifdef CONFIG_SCHED_SMT
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
- int i;
- struct sched_domain *sd = rq->sd;
+ struct sched_domain *sd = this_rq->sd;
cpumask_t sibling_map;
+ int i;

if (!(sd->flags & SD_SHARE_CPUPOWER))
return;

+ /*
+ * Unlock the current runqueue because we have to lock in
+ * CPU order to avoid deadlocks. Caller knows that we might
+ * unlock. We keep IRQs disabled.
+ */
+ spin_unlock(&this_rq->lock);
+
cpus_and(sibling_map, sd->span, cpu_online_map);
- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq;

- if (i == cpu)
- continue;
+ for_each_cpu_mask(i, sibling_map)
+ spin_lock(&cpu_rq(i)->lock);
+ /*
+ * We clear this CPU from the mask. This both simplifies the
+ * inner loop and keeps this_rq locked when we exit:
+ */
+ cpu_clear(this_cpu, sibling_map);

- smt_rq = cpu_rq(i);
+ for_each_cpu_mask(i, sibling_map) {
+ runqueue_t *smt_rq = cpu_rq(i);

/*
* If an SMT sibling task is sleeping due to priority
@@ -2522,27 +2535,53 @@ static inline void wake_sleeping_depende
if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
resched_task(smt_rq->idle);
}
+
+ for_each_cpu_mask(i, sibling_map)
+ spin_unlock(&cpu_rq(i)->lock);
+ /*
+ * We exit with this_cpu's rq still held and IRQs
+ * still disabled:
+ */
}

-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = rq->sd;
+ struct sched_domain *sd = this_rq->sd;
cpumask_t sibling_map;
+ prio_array_t *array;
int ret = 0, i;
+ task_t *p;

if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0;

+ /*
+ * The same locking rules and details apply as for
+ * wake_sleeping_dependent():
+ */
+ spin_unlock(&this_rq->lock);
cpus_and(sibling_map, sd->span, cpu_online_map);
- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq;
- task_t *smt_curr;
+ for_each_cpu_mask(i, sibling_map)
+ spin_lock(&cpu_rq(i)->lock);
+ cpu_clear(this_cpu, sibling_map);

- if (i == cpu)
- continue;
+ /*
+ * Establish next task to be run - it might have gone away because
+ * we released the runqueue lock above:
+ */
+ if (!this_rq->nr_running)
+ goto out_unlock;
+ array = this_rq->active;
+ if (!array->nr_active)
+ array = this_rq->expired;
+ BUG_ON(!array->nr_active);

- smt_rq = cpu_rq(i);
- smt_curr = smt_rq->curr;
+ p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+ task_t, run_list);
+
+ for_each_cpu_mask(i, sibling_map) {
+ runqueue_t *smt_rq = cpu_rq(i);
+ task_t *smt_curr = smt_rq->curr;

/*
* If a user task with lower static priority than the
@@ -2568,14 +2607,17 @@ static inline int dependent_sleeper(int
(smt_curr == smt_rq->idle && smt_rq->nr_running))
resched_task(smt_curr);
}
+out_unlock:
+ for_each_cpu_mask(i, sibling_map)
+ spin_unlock(&cpu_rq(i)->lock);
return ret;
}
#else
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
}

-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
return 0;
}
@@ -2656,13 +2698,33 @@ need_resched:

cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
+go_idle:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu, rq);
+ /*
+ * wake_sleeping_dependent() might have released
+ * the runqueue, so break out if we got new
+ * tasks meanwhile:
+ */
+ if (!rq->nr_running)
+ goto switch_tasks;
+ }
+ } else {
+ if (dependent_sleeper(cpu, rq)) {
+ schedstat_inc(rq, sched_goidle);
+ next = rq->idle;
goto switch_tasks;
}
+ /*
+ * dependent_sleeper() releases and reacquires the runqueue
+ * lock, hence go into the idle loop if the rq went
+ * empty meanwhile:
+ */
+ if (unlikely(!rq->nr_running))
+ goto go_idle;
}

array = rq->active;
@@ -2683,12 +2745,6 @@ need_resched:
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);

- if (dependent_sleeper(cpu, rq, next)) {
- schedstat_inc(rq, sched_goidle);
- next = rq->idle;
- goto switch_tasks;
- }
-
if (!rt_task(next) && next->activated > 0) {
unsigned long long delta = now - next->timestamp;