[PATCH 40/52] sched: Add "task flipping" support

From: Ingo Molnar
Date: Sun Dec 02 2012 - 13:48:22 EST


NUMA balancing will make use of the new sched_rebalance_to() mode:
the ability to 'flip' two tasks.

When two tasks have a similar weight but one of them executes on
the wrong CPU or node, it is beneficial to do a quick flipping
operation. This will not change the general load of the source
and the target CPUs, so it won't disturb the scheduling balance.

With this we can do NUMA placement while the system is otherwise
in equilibrium.

The code has to be careful about races and whether the source and
target CPUs are allowed for the tasks in question.

This method is also faster: in essence it can execute two
migrations via a single migration-thread call - instead of two
such calls. The thread on the target CPU acts as the 'migration
thread' for the replaced task.

Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/sched.h | 1 -
kernel/sched/core.c | 68 +++++++++++++++++++++++++++++++++++++--------------
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 6 +++++
4 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bc3a03..696492e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2020,7 +2020,6 @@ task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
-extern void sched_rebalance_to(int dest_cpu);
#else
#define sched_exec() {}
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 93f2561..cad6c89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -963,8 +963,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
}

struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
};

static int migration_cpu_stop(void *data);
@@ -2596,22 +2596,6 @@ unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

-/*
- * sched_rebalance_to()
- *
- * Active load-balance to a target CPU.
- */
-void sched_rebalance_to(int dest_cpu)
-{
- struct task_struct *p = current;
- struct migration_arg arg = { p, dest_cpu };
-
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return;
-
- stop_one_cpu(raw_smp_processor_id(), migration_cpu_stop, &arg);
-}
-
#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -4778,6 +4762,54 @@ fail:
}

/*
+ * sched_rebalance_to()
+ *
+ * Active load-balance to a target CPU.
+ */
+void sched_rebalance_to(int dst_cpu, int flip_tasks)
+{
+ struct task_struct *p_src = current;
+ struct task_struct *p_dst;
+ int src_cpu = raw_smp_processor_id();
+ struct migration_arg arg = { p_src, dst_cpu };
+ struct rq *dst_rq;
+
+ if (!cpumask_test_cpu(dst_cpu, tsk_cpus_allowed(p_src)))
+ return;
+
+ if (flip_tasks) {
+ dst_rq = cpu_rq(dst_cpu);
+
+ local_irq_disable();
+ raw_spin_lock(&dst_rq->lock);
+
+ p_dst = dst_rq->curr;
+ get_task_struct(p_dst);
+
+ raw_spin_unlock(&dst_rq->lock);
+ local_irq_enable();
+ }
+
+ stop_one_cpu(src_cpu, migration_cpu_stop, &arg);
+ /*
+ * Task-flipping.
+ *
+ * We are now on the new CPU - check whether we can migrate
+ * the task we just preempted, to where we came from:
+ */
+ if (flip_tasks) {
+ local_irq_disable();
+ if (raw_smp_processor_id() == dst_cpu) {
+ /* Note that the arguments flip: */
+ __migrate_task(p_dst, dst_cpu, src_cpu);
+ }
+ local_irq_enable();
+
+ put_task_struct(p_dst);
+ }
+}
+
+/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f306c..f0d3876 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1176,7 +1176,7 @@ static void task_numa_placement(struct task_struct *p)
struct rq *rq = cpu_rq(p->ideal_cpu);

rq->curr_buddy = p;
- sched_rebalance_to(p->ideal_cpu);
+ sched_rebalance_to(p->ideal_cpu, 0);
}
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 810a1a0..c4d15fd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1260,3 +1260,9 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_SMP
+extern void sched_rebalance_to(int dest_cpu, int flip_tasks);
+#else
+static inline void sched_rebalance_to(int dest_cpu, int flip_tasks) { }
+#endif
--
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/