Re: [PATCH v2 4/8] sched/fair: rework load_balance

From: Valentin Schneider
Date: Tue Aug 06 2019 - 13:17:39 EST


Second batch, get it while it's hot...

On 01/08/2019 15:40, Vincent Guittot wrote:
[...]
> @@ -7438,19 +7453,53 @@ static int detach_tasks(struct lb_env *env)
> if (!can_migrate_task(p, env))
> goto next;
>
> - load = task_h_load(p);
> + switch (env->balance_type) {
> + case migrate_task:
> + /* Migrate task */
> + env->imbalance--;
> + break;
>
> - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
> - goto next;
> + case migrate_util:
> + util = task_util_est(p);
>
> - if ((load / 2) > env->imbalance)
> - goto next;
> + if (util > env->imbalance)
> + goto next;
> +
> + env->imbalance -= util;
> + break;
> +
> + case migrate_load:
> + load = task_h_load(p);
> +
> + if (sched_feat(LB_MIN) &&
> + load < 16 && !env->sd->nr_balance_failed)
> + goto next;
> +
> + if ((load / 2) > env->imbalance)
> + goto next;
> +
> + env->imbalance -= load;
> + break;
> +
> + case migrate_misfit:
> + load = task_h_load(p);
> +
> + /*
> + * utilization of misfit task might decrease a bit
> + * since it has been recorded. Be conservative in the
> + * condition.
> + */
> + if (load < env->imbalance)
> + goto next;
> +
> + env->imbalance = 0;
> + break;
> + }
>
> detach_task(p, env);
> list_add(&p->se.group_node, &env->tasks);
>
> detached++;
> - env->imbalance -= load;
>

It's missing something like this:

-----8<-----
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b0631ff2a4bd..055563e19090 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7506,7 +7506,7 @@ static int detach_tasks(struct lb_env *env)

/*
* We only want to steal up to the prescribed amount of
- * runnable load.
+ * load/util/tasks
*/
if (env->imbalance <= 0)
break;
----->8-----

[...]
> @@ -8013,19 +8059,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
> }
>
> static inline enum
> -group_type group_classify(struct sched_group *group,
> +group_type group_classify(struct lb_env *env,
> + struct sched_group *group,
> struct sg_lb_stats *sgs)
> {
> - if (sgs->group_no_capacity)
> + if (group_is_overloaded(env, sgs))
> return group_overloaded;
>
> if (sg_imbalanced(group))
> return group_imbalanced;
>
> + if (sgs->group_asym_capacity)
> + return group_asym_capacity;
> +
> if (sgs->group_misfit_task_load)
> return group_misfit_task;
>
> - return group_other;
> + if (!group_has_capacity(env, sgs))
> + return group_fully_busy;

If I got my booleans right, reaching group_fully_busy means
!group_is_overloaded() && !group_has_capacity(), which gives us:

- If nr_running > group_weight, then we had to have
sgs->group_capacity * 100 == sgs->group_util * env->sd->imbalance_pct

- If nr_running == group_weight, then we had to have
sgs->group_capacity * 100 <= sgs->group_util * env->sd->imbalance_pct

- (if nr_running < group_weight we go to group_has_spare)

That first equality seems somewhat odd considering how rarely it will
occur, but then we still want the util check to fall through to
group_has_spare when

nr_running >= group_weight &&
sgs->group_capacity * 100 > sgs->group_util * env->sd->imbalance_pct

Maybe we could change group_has_capacity()'s util comparison from '>' to
'>=' to avoid this weird "artefact".
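
Something like this (untested sketch, assuming group_has_capacity() still
uses the comparison quoted above):

-       if ((sgs->group_capacity * 100) >
+       if ((sgs->group_capacity * 100) >=
                        (sgs->group_util * env->sd->imbalance_pct))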

> +
> + return group_has_spare;
> }
>
> static bool update_nohz_stats(struct rq *rq, bool force)

[...]
> @@ -8147,11 +8223,67 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> if (sgs->group_type < busiest->group_type)
> return false;
>
> - if (sgs->avg_load <= busiest->avg_load)
> + /*
> + * The candidate and the current busiest group are the same type of
> + * group. Let check which one is the busiest according to the type.
> + */
> +
> + switch (sgs->group_type) {
> + case group_overloaded:
> + /* Select the overloaded group with highest avg_load. */
> + if (sgs->avg_load <= busiest->avg_load)
> + return false;
> + break;
> +
> + case group_imbalanced:
> + /* Select the 1st imbalanced group as we don't have
> + * any way to choose one more than another
> + */
> return false;
> + break;
>
> - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
> - goto asym_packing;
> + case group_asym_capacity:
> + /* Prefer to move from lowest priority CPU's work */
> + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
> + return false;
> + break;
> +
> + case group_misfit_task:
> + /*
> + * If we have more than one misfit sg go with the
> + * biggest misfit.
> + */
> + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
> + return false;
> + break;
> +
> + case group_fully_busy:
> + /*
> + * Select the fully busy group with highest avg_load.
> + * In theory, there is no need to pull task from such
> + * kind of group because tasks have all compute
> + * capacity that they need but we can still improve the
> + * overall throughput by reducing contention when
> + * accessing shared HW resources.
> + * XXX for now avg_load is not computed and always 0 so
> + * we select the 1st one.
> + */

What's wrong with unconditionally computing avg_load in update_sg_lb_stats()?
We already unconditionally accumulate group_load anyway.

If it's to make way for patch 6/8 (using load instead of runnable load),
then I think you are doing things in the wrong order. IMO in this patch we
should unconditionally compute avg_load (using runnable load), and then
you could tweak it up in a subsequent patch.
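
i.e. something like this in update_sg_lb_stats(), reusing the formula this
patch already has in calculate_imbalance() (just a sketch):

        sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
                                                sgs->group_capacity;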

> + if (sgs->avg_load <= busiest->avg_load)
> + return false;
> + break;
> +
> + case group_has_spare:
> + /*
> + * Select not overloaded group with lowest number of
> + * idle cpus. We could also compare the spare capacity
> + * which is more stable but it can end up that the
> + * group has less spare capacity but finally more idle
> + * cpus which means less opportunity to pull tasks.
> + */
> + if (sgs->idle_cpus >= busiest->idle_cpus)
> + return false;
> + break;
> + }
>
> /*
> * Candidate sg has no more than one task per CPU and

[...]
> @@ -8341,69 +8421,132 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> */
> static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
> {
> - unsigned long max_pull, load_above_capacity = ~0UL;
> struct sg_lb_stats *local, *busiest;
>
> local = &sds->local_stat;
> busiest = &sds->busiest_stat;
> + if (busiest->group_type == group_imbalanced) {
> + /*
> + * In the group_imb case we cannot rely on group-wide averages
> + * to ensure CPU-load equilibrium, try to move any task to fix
> + * the imbalance. The next load balance will take care of
> + * balancing back the system.
> + */
> + env->balance_type = migrate_task;
> + env->imbalance = 1;
> + return;
> + }
>
> - if (busiest->group_asym_capacity) {
> + if (busiest->group_type == group_asym_capacity) {
> + /*
> + * In case of asym capacity, we will try to migrate all load
> + * to the preferred CPU
> + */
> + env->balance_type = migrate_load;
> env->imbalance = busiest->group_load;
> return;
> }
>
> + if (busiest->group_type == group_misfit_task) {
> + /* Set imbalance to allow misfit task to be balanced. */
> + env->balance_type = migrate_misfit;
> + env->imbalance = busiest->group_misfit_task_load;

AFAICT we don't ever use this value, other than setting it to 0 in
detach_tasks(), so what we actually set it to doesn't matter (as long as
it's > 0).

I'd re-suggest folding migrate_misfit into migrate_task, which is doable if
we reinstate lb_env.src_grp_type (or rather, don't delete it in this patch),
though it does make some other places somewhat uglier. The good thing is
that the misfit case then really is implemented as a special kind of task
migration, rather than as a loosely related migration type.

Below is a diff; it's only compile-tested, but it should help show my point.
-----8<-----
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8681c32445b..b0631ff2a4bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7155,7 +7155,6 @@ enum migration_type {
migrate_task = 0,
migrate_util,
migrate_load,
- migrate_misfit,
};

#define LBF_ALL_PINNED 0x01
@@ -7188,6 +7187,7 @@ struct lb_env {
unsigned int loop_max;

enum fbq_type fbq_type;
+ enum group_type src_grp_type;
enum migration_type balance_type;
struct list_head tasks;
};
@@ -7455,6 +7455,13 @@ static int detach_tasks(struct lb_env *env)

switch (env->balance_type) {
case migrate_task:
+ /* Check for special kinds of tasks */
+ if (env->src_grp_type == group_misfit_task) {
+ /* This isn't the misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+ }
+
/* Migrate task */
env->imbalance--;
break;
@@ -7480,20 +7487,6 @@ static int detach_tasks(struct lb_env *env)

env->imbalance -= load;
break;
-
- case migrate_misfit:
- load = task_h_load(p);
-
- /*
- * utilization of misfit task might decrease a bit
- * since it has been recorded. Be conservative in the
- * condition.
- */
- if (load < env->imbalance)
- goto next;
-
- env->imbalance = 0;
- break;
}

detach_task(p, env);
@@ -8449,8 +8442,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

if (busiest->group_type == group_misfit_task) {
/* Set imbalance to allow misfit task to be balanced. */
- env->balance_type = migrate_misfit;
- env->imbalance = busiest->group_misfit_task_load;
+ env->balance_type = migrate_task;
+ env->imbalance = 1;
return;
}

@@ -8661,8 +8654,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)

force_balance:
/* Looks like there is an imbalance. Compute it */
+ env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
-
return env->imbalance ? sds.busiest : NULL;

out_balanced:
@@ -8728,7 +8721,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,

switch (env->balance_type) {
case migrate_task:
- if (busiest_nr < nr_running) {
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+ * seek the "biggest" misfit task.
+ */
+ if (env->src_grp_type == group_misfit_task &&
+ rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ } else if (busiest_nr < nr_running) {
busiest_nr = nr_running;
busiest = rq;
}
@@ -8771,19 +8772,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
busiest = rq;
}
break;
-
- case migrate_misfit:
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
- * seek the "biggest" misfit task.
- */
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- break;
-
}
}

@@ -8829,7 +8817,7 @@ voluntary_active_balance(struct lb_env *env)
return 1;
}

- if (env->balance_type == migrate_misfit)
+ if (env->src_grp_type == group_misfit_task)
return 1;

return 0;
----->8-----


> + return;
> + }
> +

Also, I think these three busiest->group_type conditions could be turned
into a switch case.
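
Something like this (sketch, keeping the same semantics as your three ifs
above):

        switch (busiest->group_type) {
        case group_imbalanced:
                /* Can't rely on group-wide averages, move any task */
                env->balance_type = migrate_task;
                env->imbalance = 1;
                return;

        case group_asym_capacity:
                /* Move all load to the preferred CPU */
                env->balance_type = migrate_load;
                env->imbalance = busiest->group_load;
                return;

        case group_misfit_task:
                /* Allow the misfit task to be balanced */
                env->balance_type = migrate_misfit;
                env->imbalance = busiest->group_misfit_task_load;
                return;

        default:
                break;
        }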

> /*
> - * Avg load of busiest sg can be less and avg load of local sg can
> - * be greater than avg load across all sgs of sd because avg load
> - * factors in sg capacity and sgs with smaller group_type are
> - * skipped when updating the busiest sg:
> + * Try to use spare capacity of local group without overloading it or
> + * emptying busiest
> */
> - if (busiest->group_type != group_misfit_task &&
> - (busiest->avg_load <= sds->avg_load ||
> - local->avg_load >= sds->avg_load)) {
> - env->imbalance = 0;
> + if (local->group_type == group_has_spare) {
> + if (busiest->group_type > group_fully_busy) {
> + /*
> + * If busiest is overloaded, try to fill spare
> + * capacity. This might end up creating spare capacity
> + * in busiest or busiest still being overloaded but
> + * there is no simple way to directly compute the
> + * amount of load to migrate in order to balance the
> + * system.
> + */
> + env->balance_type = migrate_util;
> + env->imbalance = max(local->group_capacity, local->group_util) -
> + local->group_util;
> + return;
> + }
> +
> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> + /*
> + * When prefer sibling, evenly spread running tasks on
> + * groups.
> + */
> + env->balance_type = migrate_task;
> + env->imbalance = (busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1;
> + return;
> + }
> +
> + /*
> + * If there is no overload, we just want to even the number of
> + * idle cpus.
> + */
> + env->balance_type = migrate_task;
> + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
> return;
> }
>
> /*
> - * If there aren't any idle CPUs, avoid creating some.
> + * Local is fully busy but have to take more load to relieve the
> + * busiest group
> */
> - if (busiest->group_type == group_overloaded &&
> - local->group_type == group_overloaded) {
> - load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;
> - if (load_above_capacity > busiest->group_capacity) {
> - load_above_capacity -= busiest->group_capacity;
> - load_above_capacity *= scale_load_down(NICE_0_LOAD);
> - load_above_capacity /= busiest->group_capacity;
> - } else
> - load_above_capacity = ~0UL;
> + if (local->group_type < group_overloaded) {
> + /*
> + * Local will become overvloaded so the avg_load metrics are
^^^^^^^^^^^
s/overvloaded/overloaded/

> + * finally needed
> + */
> +
> + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
> + local->group_capacity;
> +
> + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
> + sds->total_capacity;
> }
>
> /*
> - * We're trying to get all the CPUs to the average_load, so we don't
> - * want to push ourselves above the average load, nor do we wish to
> - * reduce the max loaded CPU below the average load. At the same time,
> - * we also don't want to reduce the group load below the group
> - * capacity. Thus we look for the minimum possible imbalance.
> + * Both group are or will become overloaded and we're trying to get
> + * all the CPUs to the average_load, so we don't want to push
> + * ourselves above the average load, nor do we wish to reduce the
> + * max loaded CPU below the average load. At the same time, we also
> + * don't want to reduce the group load below the group capacity.
> + * Thus we look for the minimum possible imbalance.
> */
> - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
> -
> - /* How much load to actually move to equalise the imbalance */
> + env->balance_type = migrate_load;
> env->imbalance = min(
> - max_pull * busiest->group_capacity,
> + (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
> (sds->avg_load - local->avg_load) * local->group_capacity
> ) / SCHED_CAPACITY_SCALE;
> -
> - /* Boost imbalance to allow misfit task to be balanced. */
> - if (busiest->group_type == group_misfit_task) {
> - env->imbalance = max_t(long, env->imbalance,
> - busiest->group_misfit_task_load);
> - }
> -
> }
>
> /******* find_busiest_group() helpers end here *********************/
>
> +/*
> + * Decision matrix according to the local and busiest group state
> + *
> + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
> + * has_spare nr_idle balanced N/A N/A balanced balanced
> + * fully_busy nr_idle nr_idle N/A N/A balanced balanced
> + * misfit_task force N/A N/A N/A force force
> + * asym_capacity force force N/A N/A force force
> + * imbalanced force force N/A N/A force force
> + * overloaded force force N/A N/A force avg_load
> + *
> + * N/A : Not Applicable because already filtered while updating
> + * statistics.
> + * balanced : The system is balanced for these 2 groups.
> + * force : Calculate the imbalance as load migration is probably needed.
> + * avg_load : Only if imbalance is significant enough.
> + * nr_idle : dst_cpu is not busy and the number of idle cpus is quite
> + * different in groups.
> + */
> +
> /**
> * find_busiest_group - Returns the busiest group within the sched_domain
> * if there is an imbalance.

[...]
> @@ -8459,59 +8602,71 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> goto force_balance;
>

As for their counterparts in calculate_imbalance(), maybe go for a switch
here as well?

Also, it would be nice if the ordering of these checks matched the one in
calculate_imbalance() - right now it's the exact reverse order.

Finally, would it hurt much to just move the balance_type and imbalance
computation for these group types here (see the sketch below)? It breaks
the nice partitioning you've set up between find_busiest_group() and
calculate_imbalance(), but it leads to fewer redirections for what in the
end is just a simple condition.
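
For instance, the misfit case could become something like this directly in
find_busiest_group() (rough sketch, bypassing force_balance /
calculate_imbalance() for that group type):

        if (busiest->group_type == group_misfit_task) {
                /* Misfit tasks are dealt with regardless of the avg load */
                env->balance_type = migrate_misfit;
                env->imbalance = busiest->group_misfit_task_load;
                return sds.busiest;
        }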

> /*
> - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
> - * capacities from resulting in underutilization due to avg_load.
> - */
> - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
> - busiest->group_no_capacity)
> - goto force_balance;
> -
> - /* Misfit tasks should be dealt with regardless of the avg load */
> - if (busiest->group_type == group_misfit_task)
> - goto force_balance;
> -
> - /*
> * If the local group is busier than the selected busiest group
> * don't try and pull any tasks.
> */
> - if (local->avg_load >= busiest->avg_load)
> + if (local->group_type > busiest->group_type)
> goto out_balanced;
>
> /*
> - * Don't pull any tasks if this group is already above the domain
> - * average load.
> + * When groups are overloaded, use the avg_load to ensure fairness
> + * between tasks.
> */
> - if (local->avg_load >= sds.avg_load)
> - goto out_balanced;
> + if (local->group_type == group_overloaded) {
> + /*
> + * If the local group is more loaded than the selected
> + * busiest group don't try and pull any tasks.
> + */
> + if (local->avg_load >= busiest->avg_load)
> + goto out_balanced;
> +
> + /* XXX broken for overlapping NUMA groups */
> + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
> + sds.total_capacity;
>
> - if (env->idle == CPU_IDLE) {
> /*
> - * This CPU is idle. If the busiest group is not overloaded
> - * and there is no imbalance between this and busiest group
> - * wrt idle CPUs, it is balanced. The imbalance becomes
> - * significant if the diff is greater than 1 otherwise we
> - * might end up to just move the imbalance on another group
> + * Don't pull any tasks if this group is already above the
> + * domain average load.
> */
> - if ((busiest->group_type != group_overloaded) &&
> - (local->idle_cpus <= (busiest->idle_cpus + 1)))
> + if (local->avg_load >= sds.avg_load)
> goto out_balanced;
> - } else {
> +
> /*
> - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
> - * imbalance_pct to be conservative.
> + * If the busiest group is more loaded, use imbalance_pct to be
> + * conservative.
> */
> if (100 * busiest->avg_load <=
> env->sd->imbalance_pct * local->avg_load)
> goto out_balanced;
> +
> }
>
> + /* Try to move all excess tasks to child's sibling domain */
> + if (sds.prefer_sibling && local->group_type == group_has_spare &&
> + busiest->sum_h_nr_running > local->sum_h_nr_running + 1)
> + goto force_balance;
> +
> + if (busiest->group_type != group_overloaded &&
> + (env->idle == CPU_NOT_IDLE ||

Disregarding the idle_cpus count, we never balance load when the busiest
group is < group_misfit_task and we're CPU_NOT_IDLE.

I *think* that's okay, since AFAICT that should mean

(local) nr_running < group_weight
(busiest) nr_running <= group_weight
(or that weird == case)

and if we (local) are not idle then we shouldn't pull anything. Bleh, guess
it made me scratch my head for nothing.

> + local->idle_cpus <= (busiest->idle_cpus + 1)))
> + /*
> + * If the busiest group is not overloaded
> + * and there is no imbalance between this and busiest group
> + * wrt idle CPUs, it is balanced. The imbalance
> + * becomes significant if the diff is greater than 1 otherwise
> + * we might end up to just move the imbalance on another
> + * group.
> + */
> + goto out_balanced;
> +
> force_balance:
> /* Looks like there is an imbalance. Compute it */
> - env->src_grp_type = busiest->group_type;
> calculate_imbalance(env, &sds);
> +
> return env->imbalance ? sds.busiest : NULL;
>
> out_balanced:
> +
> env->imbalance = 0;
> return NULL;
> }

[...]
> @@ -8765,7 +8942,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> env.src_rq = busiest;
>
> ld_moved = 0;
> - if (busiest->cfs.h_nr_running > 1) {
> + if (busiest->nr_running > 1) {

Shouldn't that stay h_nr_running? We can't do much if those aren't CFS
tasks.

> /*
> * Attempt to move tasks. If find_busiest_group has found
> * an imbalance but busiest->nr_running <= 1, the group is
>