[PATCH 07/11] sched/numa: Find an alternative idle CPU if the CPU is part of an active NUMA balance

From: Mel Gorman
Date: Wed Feb 12 2020 - 04:37:11 EST


Multiple tasks can attempt to select an idle CPU but fail because
numa_migrate_on is already set and the migration fails. Instead of failing,
scan for an alternative idle CPU. select_idle_sibling is not used because
it requires IRQs to be disabled and it ignores numa_migrate_on allowing
multiple tasks to stack. Due to races, this scan may still fail even if
there are idle candidate CPUs, but if this occurs, it's better that a task
stay on an available CPU than move to a contended one.

Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 40 ++++++++++++++++++++++------------------
1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d2a58b19430e..3f518b0d9261 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1590,15 +1590,34 @@ static void task_numa_assign(struct task_numa_env *env,
{
struct rq *rq = cpu_rq(env->dst_cpu);

- /* Bail out if run-queue part of active NUMA balance. */
- if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1))
+ /* Check if run-queue part of active NUMA balance. */
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+ int cpu;
+ int start = env->dst_cpu;
+
+ /* Find alternative idle CPU. */
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+ continue;
+ }
+
+ env->dst_cpu = cpu;
+ rq = cpu_rq(env->dst_cpu);
+ if (!xchg(&rq->numa_migrate_on, 1))
+ goto assign;
+ }
+
+ /* Failed to find an alternative idle CPU */
return;
+ }

+assign:
/*
* Clear previous best_cpu/rq numa-migrate flag, since task now
* found a better CPU to move/swap.
*/
- if (env->best_cpu != -1) {
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
@@ -1772,21 +1791,6 @@ static void task_numa_compare(struct task_numa_env *env,
cpu = env->best_cpu;
}

- /*
- * Use select_idle_sibling if the previously found idle CPU is
- * not idle any more.
- */
- if (!idle_cpu(cpu)) {
- /*
- * select_idle_siblings() uses an per-CPU cpumask that
- * can be used from IRQ context.
- */
- local_irq_disable();
- cpu = select_idle_sibling(env->p, env->src_cpu,
- env->dst_cpu);
- local_irq_enable();
- }
-
env->dst_cpu = cpu;
}

--
2.16.4