Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

From: Nick Piggin
Date: Mon Apr 05 2004 - 20:29:01 EST


Srivatsa Vaddagiri wrote:
> On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
>
>> First of all, if you're proposing this stuff for inclusion, you
>> should port it to the -mm tree, because I don't think Andrew
>> will want any other scheduler work going in just now. It wouldn't
>> be too hard.
>
> Will send out today a patch against latest -mm tree!
>
>> I think my stuff is a bit orthogonal to what you're attempting.
>> And they should probably work well together. My "lazy migrate"
>> patch means the tasklist lock does not need to be held at all,
>> only the dying runqueue's lock.
>
> Is there some place where I can download your patch (or is it in -mm tree)?

I have attached it (against 2.6.5-mm1). I haven't actually tested it
yet, because I haven't got around to finding and using the i386 test
code.

It also contains a couple of cleanups: rename double_lock_balance to
second_rq_lock, make migrate_all_tasks static, and have the hotplug
code call sched_cpu_stop.

Comments would be welcome.
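
In case the second_rq_lock rename looks like it hides something clever:
it is the same trylock-then-retake-in-address-order trick that
double_lock_balance already did, only with argument names that don't
imply load balancing. A rough standalone illustration of the idea
(userspace sketch, pthread mutexes standing in for rq locks; the names
below are mine, not from the patch):

#include <pthread.h>
#include <stdio.h>

struct fake_rq {
        pthread_mutex_t lock;
};

/* Caller already holds locked->lock; returns with both locks held. */
static void second_lock(struct fake_rq *locked, struct fake_rq *to_lock)
{
        if (pthread_mutex_trylock(&to_lock->lock) != 0) {
                if (to_lock < locked) {
                        /*
                         * Taking to_lock while holding locked would invert
                         * the address-based lock order, so drop ours and
                         * retake both in order. The caller must cope with
                         * the window where neither lock is held.
                         */
                        pthread_mutex_unlock(&locked->lock);
                        pthread_mutex_lock(&to_lock->lock);
                        pthread_mutex_lock(&locked->lock);
                } else {
                        pthread_mutex_lock(&to_lock->lock);
                }
        }
}

int main(void)
{
        struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER };
        struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(&a.lock);
        second_lock(&a, &b);    /* now holds both a.lock and b.lock */
        printf("got both locks\n");
        pthread_mutex_unlock(&b.lock);
        pthread_mutex_unlock(&a.lock);
        return 0;
}

That unlock/relock window is the reason the migrate_all_tasks loop in
the patch rechecks rq->nr_running and goes around again after calling
__migrate_task.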

Nick

 linux-2.6-npiggin/include/linux/sched.h |    4
 linux-2.6-npiggin/kernel/cpu.c          |    9 +
 linux-2.6-npiggin/kernel/sched.c        |  178 ++++++++++++++++++++------------
 3 files changed, 121 insertions(+), 70 deletions(-)

diff -puN kernel/sched.c~hotplugcpu-lazy-migrate kernel/sched.c
--- linux-2.6/kernel/sched.c~hotplugcpu-lazy-migrate 2004-04-06 11:16:31.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2004-04-06 11:23:30.000000000 +1000
@@ -41,6 +41,7 @@
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+#include <linux/list.h>

#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
@@ -710,6 +711,53 @@ static inline int wake_idle(int cpu, tas
}
#endif

+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * go_away: choose a new CPU for tsk if the one it is on has gone
+ * offline. Updates cpus_allowed affinity if it absolutely has to.
+ * Returns chosen destination CPU.
+ */
+static int go_away(struct task_struct *tsk)
+{
+ cpumask_t mask;
+ int cpu, node, dest_cpu;
+
+ /*
+ * watch out for per node tasks, let's stay on this node.
+ * TODO turn this into a sched_domain flag - np
+ */
+ cpu = task_cpu(tsk);
+ node = cpu_to_node(cpu);
+ mask = node_to_cpumask(node);
+
+ /*
+ * Figure out where this task should go (attempt to keep it on-node),
+ * and check if it can be migrated as-is. NOTE that kernel threads
+ * bound to more than one online cpu will be migrated.
+ */
+ cpus_and(mask, mask, tsk->cpus_allowed);
+ dest_cpu = any_online_cpu(mask);
+ if (dest_cpu == NR_CPUS)
+ dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ if (dest_cpu == NR_CPUS) {
+ cpus_clear(tsk->cpus_allowed);
+ cpus_complement(tsk->cpus_allowed);
+ dest_cpu = any_online_cpu(tsk->cpus_allowed);
+
+ /*
+ * Don't tell them about moving exiting tasks or kernel
+ * threads (both mm NULL), since they never leave kernel.
+ */
+ if (tsk->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ tsk->pid, tsk->comm, task_cpu(tsk));
+ }
+ }
+ return dest_cpu;
+}
+#endif
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
@@ -748,11 +796,18 @@ static int try_to_wake_up(task_t * p, un
this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+ if (unlikely(task_running(rq, p)))
goto out_activate;

new_cpu = cpu;

+#ifdef CONFIG_HOTPLUG_CPU
+ if (unlikely(cpu_is_offline(cpu))) {
+ /* Must lazy-migrate off this CPU */
+ goto out_set_cpu;
+ }
+#endif
+
if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;

@@ -1257,17 +1312,17 @@ out:
#endif /* CONFIG_NUMA */

/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * second_rq_lock - lock a second runqueue; 'locked' is held already.
*/
-static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+static void second_rq_lock(runqueue_t *locked, runqueue_t *to_lock)
{
- if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- spin_unlock(&this_rq->lock);
- spin_lock(&busiest->lock);
- spin_lock(&this_rq->lock);
+ if (unlikely(!spin_trylock(&to_lock->lock))) {
+ if (to_lock < locked) {
+ spin_unlock(&locked->lock);
+ spin_lock(&to_lock->lock);
+ spin_lock(&locked->lock);
} else
- spin_lock(&busiest->lock);
+ spin_lock(&to_lock->lock);
}
}

@@ -1592,7 +1647,7 @@ static int load_balance(int this_cpu, ru
}

/* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
+ second_rq_lock(this_rq, busiest);

nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle);
spin_unlock(&this_rq->lock);
@@ -1662,7 +1717,7 @@ static int load_balance_newidle(int this
goto out;

/* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
+ second_rq_lock(this_rq, busiest);

nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE);
@@ -1744,7 +1799,7 @@ static void active_load_balance(runqueue
}

rq = cpu_rq(push_cpu);
- double_lock_balance(busiest, rq);
+ second_rq_lock(busiest, rq);
move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
spin_unlock(&rq->lock);
next_group:
@@ -3221,6 +3276,8 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
*
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
+ *
+ * Called with this_rq locked
*/
static void __migrate_task(struct task_struct *p, int dest_cpu)
{
@@ -3228,7 +3285,7 @@ static void __migrate_task(struct task_s

rq_dest = cpu_rq(dest_cpu);

- double_rq_lock(this_rq(), rq_dest);
+ second_rq_lock(this_rq(), rq_dest);
/* Already moved. */
if (task_cpu(p) != smp_processor_id())
goto out;
@@ -3246,7 +3303,7 @@ static void __migrate_task(struct task_s
p->timestamp = rq_dest->timestamp_last_tick;

out:
- double_rq_unlock(this_rq(), rq_dest);
+ spin_unlock(&rq_dest->lock);
}

/*
@@ -3286,8 +3343,6 @@ static int migration_thread(void * data)
req = list_entry(head->next, migration_req_t, list);
list_del_init(head->next);

- spin_unlock(&rq->lock);
-
if (req->type == REQ_MOVE_TASK) {
__migrate_task(req->task, req->dest_cpu);
} else if (req->type == REQ_SET_DOMAIN) {
@@ -3296,7 +3351,7 @@ static int migration_thread(void * data)
WARN_ON(1);
}

- local_irq_enable();
+ spin_unlock_irq(&rq->lock);

complete(&req->done);
}
@@ -3304,60 +3359,53 @@ static int migration_thread(void * data)
}

#ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
-{
- struct task_struct *tsk, *t;
- int dest_cpu, src_cpu;
- unsigned int node;
-
- /* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
-
- /* Not required, but here for neatness. */
- write_lock(&tasklist_lock);
-
- /* watch out for per node tasks, let's stay on this node */
- node = cpu_to_node(src_cpu);
-
- do_each_thread(t, tsk) {
- cpumask_t mask;
- if (tsk == current)
- continue;
+/*
+ * migrate_all_tasks - function to migrate all the tasks from the
+ * current CPU. Current CPU must be marked offline.
+ */
+static void migrate_all_tasks(void)
+{
+ runqueue_t *rq;
+ int i, j;
+ int dest_cpu;

- if (task_cpu(tsk) != src_cpu)
- continue;
+ rq = this_rq_lock();

- /* Figure out where this task should go (attempting to
- * keep it on-node), and check if it can be migrated
- * as-is. NOTE that kernel threads bound to more than
- * one online cpu will be migrated. */
- mask = node_to_cpumask(node);
- cpus_and(mask, mask, tsk->cpus_allowed);
- dest_cpu = any_online_cpu(mask);
- if (dest_cpu == NR_CPUS)
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
- if (dest_cpu == NR_CPUS) {
- cpus_clear(tsk->cpus_allowed);
- cpus_complement(tsk->cpus_allowed);
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
-
- /* Don't tell them about moving exiting tasks
- or kernel threads (both mm NULL), since
- they never leave kernel. */
- if (tsk->mm && printk_ratelimit())
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- tsk->pid, tsk->comm, src_cpu);
+again:
+ /* Traverse the runqueue */
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < MAX_PRIO; j++) {
+ struct task_struct *tsk, *tmp;
+ list_for_each_entry_safe(tsk, tmp,
+ &rq->arrays[i].queue[j], run_list) {
+ if (tsk == current)
+ continue;
+
+ dest_cpu = go_away(tsk);
+ __migrate_task(tsk, dest_cpu);
+ }
}
+ }

- __migrate_task(tsk, dest_cpu);
- } while_each_thread(t, tsk);
+ /* __migrate_task can drop the lock (via second_rq_lock).
+ * Recheck and go again if we're not the only ones left. */
+ if (rq->nr_running > 1)
+ goto again;

- write_unlock(&tasklist_lock);
+ rq_unlock(rq);
}
+
+/*
+ * sched_cpu_stop is called by CPU hotplug code when it intends to take
+ * the current CPU down. It must be called after the CPU has been marked
+ * offline.
+ */
+void sched_cpu_stop(void)
+{
+ /* At the moment all we need to do is migrate tasks off */
+ migrate_all_tasks();
+}
+
#endif /* CONFIG_HOTPLUG_CPU */

/*
diff -puN kernel/cpu.c~hotplugcpu-lazy-migrate kernel/cpu.c
--- linux-2.6/kernel/cpu.c~hotplugcpu-lazy-migrate 2004-04-06 11:16:31.000000000 +1000
+++ linux-2.6-npiggin/kernel/cpu.c 2004-04-06 11:16:31.000000000 +1000
@@ -91,13 +91,16 @@ static int take_cpu_down(void *unused)
/* Take offline: makes arch_cpu_down somewhat easier. */
cpu_clear(smp_processor_id(), cpu_online_map);

+ /* Ensure all other CPUs see that we're offline */
+ wmb();
+
+ /* Everyone else gets kicked off. */
+ sched_cpu_stop();
+
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
- else
- /* Everyone else gets kicked off. */
- migrate_all_tasks();

return err;
}
diff -puN include/linux/sched.h~hotplugcpu-lazy-migrate include/linux/sched.h
--- linux-2.6/include/linux/sched.h~hotplugcpu-lazy-migrate 2004-04-06 11:16:31.000000000 +1000
+++ linux-2.6-npiggin/include/linux/sched.h 2004-04-06 11:16:31.000000000 +1000
@@ -664,8 +664,8 @@ extern void sched_balance_exec(void);
#define sched_balance_exec() {}
#endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+/* sched_cpu_stop must be called after the CPU is marked offline */
+extern void sched_cpu_stop(void);
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(task_t *p);
extern int task_nice(task_t *p);

_