[PATCH v2 09/19] sched/numa: Modify migrate_swap to accept additional params

From: Srikar Dronamraju
Date: Wed Jun 20 2018 - 13:06:39 EST


migrate_swap_stop() verifies that each task/cpu combination matches
what was recorded in migration_swap_arg before migrating.

However, at least one of the two tasks to be swapped by migrate_swap() could
have migrated to a completely different cpu before migration_swap_arg is
updated. The new cpu where the task is currently running could
be on a different node too. If the task has migrated, the numa balancer might
end up placing a task on the wrong node. Instead of achieving node
consolidation, it may end up spreading the load across nodes.

To avoid that, pass the cpus as additional parameters.

While here, place migrate_swap under CONFIG_NUMA_BALANCING.

Running SPECjbb2005 on a 4 node machine and comparing bops/JVM
JVMS LAST_PATCH WITH_PATCH %CHANGE
16 25377.3 25226.6 -0.59
1 72287 73326 1.437

Acked-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Reviewed-by: Rik van Riel <riel@xxxxxxxxxxx>
Signed-off-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
---
kernel/sched/core.c | 9 ++++++---
kernel/sched/fair.c | 3 ++-
kernel/sched/sched.h | 3 ++-
3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 211890e..36f1c7c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1197,6 +1197,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}

+#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
@@ -1278,16 +1279,17 @@ static int migrate_swap_stop(void *data)
/*
* Cross migrate two tasks
*/
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+ int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;

arg = (struct migration_swap_arg){
.src_task = cur,
- .src_cpu = task_cpu(cur),
+ .src_cpu = curr_cpu,
.dst_task = p,
- .dst_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
};

if (arg.src_cpu == arg.dst_cpu)
@@ -1312,6 +1314,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
out:
return ret;
}
+#endif /* CONFIG_NUMA_BALANCING */

/*
* wait_task_inactive - wait for a thread to unschedule.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0580a27..0d0248b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1809,7 +1809,8 @@ static int task_numa_migrate(struct task_struct *p)
return ret;
}

- ret = migrate_swap(p, env.best_task);
+ ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
put_task_struct(env.best_task);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c22..52ba2d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1068,7 +1068,8 @@ enum numa_faults_stats {
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
-extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern int migrate_swap(struct task_struct *p, struct task_struct *t,
+ int cpu, int scpu);
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_SMP
--
1.8.3.1