[PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity

From: Mel Gorman
Date: Wed Feb 12 2020 - 10:46:12 EST


The standard load balancer generally allows an imbalance to exist if
a domain has spare capacity. This patch uses similar logic within NUMA
balancing when moving a task to a preferred node. This is not a perfect
comparison with the load balancer but should be a close enough match
when the destination domain has spare capacity and the imbalance is not
too large.
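
As a rough illustration only, the destination-side check boils down to the
userspace sketch below (not kernel code). The struct, the helper name, the
112 imbalance_pct margin and all the numbers are invented for the example,
and adjust_numa_imbalance(), which is only forward-declared in this patch,
is left out, so treat this as a sketch of the intent rather than the
implementation:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the per-node stats gathered by
 * update_numa_stats(); field names follow the patch. */
struct numa_stats_sketch {
	unsigned long group_util;
	unsigned long group_capacity;
	unsigned int sum_nr_running;
	unsigned int group_weight;
};

/* Mirrors the shape of __lb_has_capacity(): spare capacity exists if there
 * are fewer runnable tasks than CPUs, or if utilisation is comfortably
 * below capacity once the imbalance_pct margin is applied. */
static bool has_capacity(unsigned int imbalance_pct,
			 unsigned int sum_nr_running,
			 unsigned int group_weight,
			 unsigned long group_capacity,
			 unsigned long group_util)
{
	if (sum_nr_running < group_weight)
		return true;

	return (group_capacity * 100) > (group_util * imbalance_pct);
}

int main(void)
{
	/* Hypothetical destination node: 4 CPUs, 2 runnable tasks. */
	struct numa_stats_sketch dst = {
		.group_util	= 1024,
		.group_capacity	= 4096,
		.sum_nr_running	= 2,
		.group_weight	= 4,
	};
	unsigned int imbalance_pct = 112;	/* illustrative margin */
	unsigned int src_nr_running = 3;	/* hypothetical source node */

	/* Account for the migrating task on both ends, as
	 * task_numa_find_cpu() does. */
	int src_running = src_nr_running - 1;
	int dst_running = dst.sum_nr_running + 1;
	int imbalance = dst_running > src_running ?
			dst_running - src_running : 0;

	bool spare = has_capacity(imbalance_pct, dst.sum_nr_running + 1,
				  dst.group_weight, dst.group_capacity,
				  dst.group_util);

	/* An idle destination CPU is only used when both hold: spare
	 * capacity and no (or negligible) imbalance after the move. */
	printf("spare capacity: %s, imbalance after move: %d\n",
	       spare ? "yes" : "no", imbalance);
	return 0;
}

When either test fails, task_numa_find_cpu() falls back to the existing
behaviour of comparing group_load via load_too_imbalanced() before
considering individual CPUs.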

Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 79 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2476ef0b056..69e41204cfae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,21 +1473,19 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
-
/* Cached statistics for all CPUs within a node */
struct numa_stats {
- unsigned long load;
+ unsigned long group_load;
+ unsigned long group_util;

/* Total compute capacity of CPUs on a node */
- unsigned long compute_capacity;
+ unsigned long group_capacity;
+
+ unsigned int sum_nr_running;

/* Details on idle CPUs */
+ unsigned int group_weight;
+ int nr_idle;
int idle_cpu;
};

@@ -1511,6 +1509,22 @@ static inline bool is_core_idle(int cpu)
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu, bool def);

+/* Forward declarations of lb helpers */
+static unsigned long cpu_load(struct rq *rq);
+static inline unsigned long cpu_util(int cpu);
+static inline bool __lb_has_capacity(unsigned int imbalance_pct,
+ unsigned int sum_nr_running, unsigned int group_weight,
+ unsigned long group_capacity, unsigned long group_util);
+static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+
+/* NUMA Balancing equivalents for LB helpers */
+static inline bool
+numa_has_capacity(unsigned int imbalance_pct, struct numa_stats *ns)
+{
+ return __lb_has_capacity(imbalance_pct, ns->sum_nr_running + 1,
+ ns->group_weight, ns->group_capacity, ns->group_util);
+}
+
/*
* Gather all necessary information to make NUMA balancing placement
* decisions that are compatible with the standard load balancer. This
@@ -1529,14 +1543,20 @@ update_numa_stats(struct numa_stats *ns, int nid,
ns->idle_cpu = -1;
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
+ unsigned int nr_running = rq->nr_running;

- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
+ ns->group_load += cpu_load(rq);
+ ns->group_util += cpu_util(cpu);
+ ns->group_capacity += capacity_of(cpu);
+ ns->group_weight++;
+ ns->sum_nr_running += nr_running;

- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (!nr_running && idle_cpu(cpu)) {
int this_llc_id;

- if (READ_ONCE(rq->numa_migrate_on) ||
+ ns->nr_idle++;
+
+ if (!find_idle || READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;

@@ -1646,13 +1666,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
* ------------ vs ---------
* src_capacity dst_capacity
*/
- src_capacity = env->src_stats.compute_capacity;
- dst_capacity = env->dst_stats.compute_capacity;
+ src_capacity = env->src_stats.group_capacity;
+ dst_capacity = env->dst_stats.group_capacity;

imb = abs(dst_load * src_capacity - src_load * dst_capacity);

- orig_src_load = env->src_stats.load;
- orig_dst_load = env->dst_stats.load;
+ orig_src_load = env->src_stats.group_load;
+ orig_dst_load = env->dst_stats.group_load;

old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);

@@ -1799,8 +1819,8 @@ static void task_numa_compare(struct task_numa_env *env,
if (!load)
goto assign;

- dst_load = env->dst_stats.load + load;
- src_load = env->src_stats.load - load;
+ dst_load = env->dst_stats.group_load + load;
+ src_load = env->src_stats.group_load - load;

if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
@@ -1838,23 +1858,38 @@ static void task_numa_find_cpu(struct task_numa_env *env,
bool maymove = false;
int cpu;

- load = task_h_load(env->p);
- dst_load = env->dst_stats.load + load;
- src_load = env->src_stats.load - load;
-
/*
- * If the improvement from just moving env->p direction is better
- * than swapping tasks around, check if a move is possible.
+ * If the load balancer is unlikely to interfere with the task after
+ * a migration then use an idle CPU.
*/
- maymove = !load_too_imbalanced(src_load, dst_load, env);
+ if (env->dst_stats.idle_cpu >= 0) {
+ unsigned int imbalance;
+ int src_running, dst_running;

- /* Use an idle CPU if one has been found already */
- if (maymove && env->dst_stats.idle_cpu >= 0) {
- env->dst_cpu = env->dst_stats.idle_cpu;
- task_numa_assign(env, NULL, 0);
- return;
+ /* Would movement cause an imbalance? */
+ src_running = env->src_stats.sum_nr_running - 1;
+ dst_running = env->dst_stats.sum_nr_running + 1;
+ imbalance = max(0, dst_running - src_running);
+ imbalance = adjust_numa_imbalance(imbalance, src_running);
+
+ /* Use idle CPU if there is spare capacity and no imbalance */
+ if (numa_has_capacity(env->imbalance_pct, &env->dst_stats) &&
+ !imbalance) {
+ env->dst_cpu = env->dst_stats.idle_cpu;
+ task_numa_assign(env, NULL, 0);
+ return;
+ }
}

+ /*
+ * If using an idle CPU would cause an imbalance that would likely
+ * be overridden by the load balancer, consider the load instead.
+ */
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.group_load + load;
+ src_load = env->src_stats.group_load - load;
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
@@ -8048,18 +8083,27 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+__lb_has_capacity(unsigned int imbalance_pct, unsigned int sum_nr_running,
+ unsigned int group_weight, unsigned long group_capacity,
+ unsigned long group_util)
{
- if (sgs->sum_nr_running < sgs->group_weight)
+ if (sum_nr_running < group_weight)
return true;

- if ((sgs->group_capacity * 100) >
- (sgs->group_util * imbalance_pct))
+ if ((group_capacity * 100) >
+ (group_util * imbalance_pct))
return true;

return false;
}

+static inline bool
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+{
+ return __lb_has_capacity(imbalance_pct, sgs->sum_nr_running,
+ sgs->group_weight, sgs->group_capacity, sgs->group_util);
+}
+
/*
* group_is_overloaded returns true if the group has more tasks than it can
* handle.
--
2.16.4