Re: [PATCH 3/5] sched/fair: rework load_balance

From: Peter Zijlstra
Date: Fri Jul 19 2019 - 08:52:41 EST


On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
> @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
>  enum fbq_type { regular, remote, all };
>
>  enum group_type {
> -        group_other = 0,
> +        group_has_spare = 0,
> +        group_fully_busy,
>          group_misfit_task,
> +        group_asym_capacity,
>          group_imbalanced,
>          group_overloaded,
>  };
>
> +enum group_migration {
> +        migrate_task = 0,
> +        migrate_util,
> +        migrate_load,
> +        migrate_misfit,
> +};
> +
>  #define LBF_ALL_PINNED  0x01
>  #define LBF_NEED_BREAK  0x02
>  #define LBF_DST_PINNED  0x04
> @@ -7096,7 +7093,7 @@ struct lb_env {
>          unsigned int            loop_max;
>
>          enum fbq_type           fbq_type;
> -        enum group_type         src_grp_type;
> +        enum group_migration    src_grp_type;
>          struct list_head        tasks;
>  };
>
> @@ -7328,7 +7325,6 @@ static int detach_tasks(struct lb_env *env)
>  {
>          struct list_head *tasks = &env->src_rq->cfs_tasks;
>          struct task_struct *p;
> -        unsigned long load;
>          int detached = 0;
>
>          lockdep_assert_held(&env->src_rq->lock);
> @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)
>                  if (!can_migrate_task(p, env))
>                          goto next;
>
> -                load = task_h_load(p);
> +                if (env->src_grp_type == migrate_load) {
> +                        unsigned long load = task_h_load(p);
>
> -                if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
> -                        goto next;
> +                        if (sched_feat(LB_MIN) &&
> +                            load < 16 && !env->sd->nr_balance_failed)
> +                                goto next;
> +
> +                        if ((load / 2) > env->imbalance)
> +                                goto next;
> +
> +                        env->imbalance -= load;
> +                } else if (env->src_grp_type == migrate_util) {
> +                        unsigned long util = task_util_est(p);
> +
> +                        if (util > env->imbalance)
> +                                goto next;
> +
> +                        env->imbalance -= util;
> +                } else if (env->src_grp_type == migrate_misfit) {
> +                        unsigned long util = task_util_est(p);
> +
> +                        /*
> +                         * utilization of misfit task might decrease a bit
> +                         * since it has been recorded. Be conservative in the
> +                         * condition.
> +                         */
> +                        if (2*util < env->imbalance)
> +                                goto next;
> +
> +                        env->imbalance = 0;
> +                } else {
> +                        /* Migrate task */
> +                        env->imbalance--;
> +                }
>
> -                if ((load / 2) > env->imbalance)
> -                        goto next;
>
>                  detach_task(p, env);
>                  list_add(&p->se.group_node, &env->tasks);
>
>                  detached++;
> -                env->imbalance -= load;
>
>  #ifdef CONFIG_PREEMPT
>                  /*

Still reading through this; maybe something like so instead?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7057,7 +7057,7 @@ enum group_type {
         group_overloaded,
 };
 
-enum group_migration {
+enum migration_type {
         migrate_task = 0,
         migrate_util,
         migrate_load,
@@ -7094,7 +7094,7 @@ struct lb_env {
         unsigned int            loop_max;
 
         enum fbq_type           fbq_type;
-        enum group_migration    src_grp_type;
+        enum migration_type     balance_type;
         struct list_head        tasks;
 };
 
@@ -7325,6 +7325,7 @@ static const unsigned int sched_nr_migra
 static int detach_tasks(struct lb_env *env)
 {
         struct list_head *tasks = &env->src_rq->cfs_tasks;
+        unsigned long load, util;
         struct task_struct *p;
         int detached = 0;
 
@@ -7358,8 +7359,14 @@ static int detach_tasks(struct lb_env *e
                 if (!can_migrate_task(p, env))
                         goto next;
 
-                if (env->src_grp_type == migrate_load) {
-                        unsigned long load = task_h_load(p);
+                switch (env->balance_type) {
+                case migrate_task:
+                        /* Migrate task */
+                        env->imbalance--;
+                        break;
+
+                case migrate_load:
+                        load = task_h_load(p);
 
                         if (sched_feat(LB_MIN) &&
                             load < 16 && !env->sd->nr_balance_failed)
@@ -7369,15 +7376,20 @@ static int detach_tasks(struct lb_env *e
                                 goto next;
 
                         env->imbalance -= load;
-                } else if (env->src_grp_type == migrate_util) {
-                        unsigned long util = task_util_est(p);
+                        break;
+
+                case migrate_util:
+                        util = task_util_est(p);
 
                         if (util > env->imbalance)
                                 goto next;
 
                         env->imbalance -= util;
-                } else if (env->src_grp_type == migrate_misfit) {
-                        unsigned long util = task_util_est(p);
+                        break;
+
+
+                case migrate_misfit:
+                        util = task_util_est(p);
 
                         /*
                          * utilization of misfit task might decrease a bit
@@ -7388,9 +7400,7 @@ static int detach_tasks(struct lb_env *e
                                 goto next;
 
                         env->imbalance = 0;
-                } else {
-                        /* Migrate task */
-                        env->imbalance--;
+                        break;
                 }
 
 
@@ -8311,7 +8321,7 @@ static inline void calculate_imbalance(s
                  * In case of asym capacity, we will try to migrate all load
                  * to the preferred CPU
                  */
-                env->src_grp_type = migrate_load;
+                env->balance_type = migrate_load;
                 env->imbalance = busiest->group_load;
                 return;
         }
@@ -8323,14 +8333,14 @@ static inline void calculate_imbalance(s
                  * the imbalance. The next load balance will take care of
                  * balancing back the system.
                  */
-                env->src_grp_type = migrate_task;
+                env->balance_type = migrate_task;
                 env->imbalance = 1;
                 return;
         }
 
         if (busiest->group_type == group_misfit_task) {
                 /* Set imbalance to allow misfit task to be balanced. */
-                env->src_grp_type = migrate_misfit;
+                env->balance_type = migrate_misfit;
                 env->imbalance = busiest->group_misfit_task_load;
                 return;
         }
@@ -8346,7 +8356,7 @@ static inline void calculate_imbalance(s
                  * If there is no overload, we just want to even the number of
                  * idle cpus.
                  */
-                env->src_grp_type = migrate_task;
+                env->balance_type = migrate_task;
                 imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
 
                 if (sds->prefer_sibling)
@@ -8365,7 +8375,7 @@ static inline void calculate_imbalance(s
                          * amount of load to migrate in order to balance the
                          * system.
                          */
-                        env->src_grp_type = migrate_util;
+                        env->balance_type = migrate_util;
                         imbalance = max(local->group_capacity, local->group_util) -
                                     local->group_util;
                 }
@@ -8399,7 +8409,7 @@ static inline void calculate_imbalance(s
          * don't want to reduce the group load below the group capacity.
          * Thus we look for the minimum possible imbalance.
          */
-        env->src_grp_type = migrate_load;
+        env->balance_type = migrate_load;
         env->imbalance = min(
                 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
                 (sds->avg_load - local->avg_load) * local->group_capacity
@@ -8597,7 +8607,7 @@ static struct rq *find_busiest_queue(str
                  * For ASYM_CPUCAPACITY domains with misfit tasks we simply
                  * seek the "biggest" misfit task.
                  */
-                if (env->src_grp_type == migrate_misfit) {
+                if (env->balance_type == migrate_misfit) {
                         if (rq->misfit_task_load > busiest_load) {
                                 busiest_load = rq->misfit_task_load;
                                 busiest = rq;
@@ -8619,7 +8629,7 @@ static struct rq *find_busiest_queue(str
                     rq->nr_running == 1)
                         continue;
 
-                if (env->src_grp_type == migrate_task) {
+                if (env->balance_type == migrate_task) {
                         nr_running = rq->cfs.h_nr_running;
 
                         if (busiest_nr < nr_running) {
@@ -8630,7 +8640,7 @@ static struct rq *find_busiest_queue(str
                         continue;
                 }
 
-                if (env->src_grp_type == migrate_util) {
+                if (env->balance_type == migrate_util) {
                         util = cpu_util(cpu_of(rq));
 
                         if (busiest_util < util) {
@@ -8711,7 +8721,7 @@ voluntary_active_balance(struct lb_env *
                 return 1;
         }
 
-        if (env->src_grp_type == migrate_misfit)
+        if (env->balance_type == migrate_misfit)
                 return 1;
 
         return 0;
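
FWIW, boiled down, the per-mode accounting that switch implements looks like
the sketch below. This is a standalone illustration, not kernel code: struct
env, struct task and task_fits_imbalance() are made-up stand-ins for struct
lb_env, task_h_load() and task_util_est(), and the sched_feat(LB_MIN) cutoff
plus the loop/locking bookkeeping of detach_tasks() are left out.

/* sketch.c -- hypothetical simplified types, NOT the kernel's */
#include <stdbool.h>
#include <stdio.h>

enum migration_type {
        migrate_task = 0,       /* even out the number of tasks */
        migrate_util,           /* even out utilization */
        migrate_load,           /* even out weighted load */
        migrate_misfit,         /* evacuate one oversized task */
};

struct env {
        enum migration_type balance_type;
        unsigned long imbalance;
};

struct task {
        unsigned long h_load;   /* stand-in for task_h_load(p) */
        unsigned long util;     /* stand-in for task_util_est(p) */
};

/* Decide whether detaching @p makes progress; update the imbalance. */
static bool task_fits_imbalance(struct env *env, const struct task *p)
{
        switch (env->balance_type) {
        case migrate_task:
                /* every task counts for one */
                env->imbalance--;
                return true;

        case migrate_load:
                /* skip a task that overshoots by more than half its load */
                if (p->h_load / 2 > env->imbalance)
                        return false;
                env->imbalance -= p->h_load;
                return true;

        case migrate_util:
                if (p->util > env->imbalance)
                        return false;
                env->imbalance -= p->util;
                return true;

        case migrate_misfit:
                /* recorded misfit load may be stale; accept within 2x */
                if (2 * p->util < env->imbalance)
                        return false;
                env->imbalance = 0;     /* the one misfit task is the job */
                return true;
        }
        return false;
}

int main(void)
{
        struct env env = { .balance_type = migrate_util, .imbalance = 300 };
        struct task p = { .h_load = 512, .util = 200 };

        if (task_fits_imbalance(&env, &p))
                printf("detach; imbalance left: %lu\n", env.imbalance);
        return 0;
}

Same shape as the switch above; only the termination criterion differs per
migration_type.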