Re: [PATCH 3/5] sched/fair: rework load_balance

From: Vincent Guittot
Date: Fri Jul 19 2019 - 09:46:38 EST


On Fri, 19 Jul 2019 at 14:52, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
> > @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
> >  enum fbq_type { regular, remote, all };
> >
> >  enum group_type {
> > -        group_other = 0,
> > +        group_has_spare = 0,
> > +        group_fully_busy,
> >          group_misfit_task,
> > +        group_asym_capacity,
> >          group_imbalanced,
> >          group_overloaded,
> >  };
> >
> > +enum group_migration {
> > +        migrate_task = 0,
> > +        migrate_util,
> > +        migrate_load,
> > +        migrate_misfit,
> > +};
> > +
> >  #define LBF_ALL_PINNED  0x01
> >  #define LBF_NEED_BREAK  0x02
> >  #define LBF_DST_PINNED  0x04
> > @@ -7096,7 +7093,7 @@ struct lb_env {
> >          unsigned int            loop_max;
> >
> >          enum fbq_type           fbq_type;
> > -        enum group_type         src_grp_type;
> > +        enum group_migration    src_grp_type;
> >          struct list_head        tasks;
> >  };
> >
> > @@ -7328,7 +7325,6 @@ static int detach_tasks(struct lb_env *env)
> >  {
> >          struct list_head *tasks = &env->src_rq->cfs_tasks;
> >          struct task_struct *p;
> > -        unsigned long load;
> >          int detached = 0;
> >
> >          lockdep_assert_held(&env->src_rq->lock);
> > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)
> >                  if (!can_migrate_task(p, env))
> >                          goto next;
> >
> > -                load = task_h_load(p);
> > +                if (env->src_grp_type == migrate_load) {
> > +                        unsigned long load = task_h_load(p);
> >
> > -                if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
> > -                        goto next;
> > +                        if (sched_feat(LB_MIN) &&
> > +                            load < 16 && !env->sd->nr_balance_failed)
> > +                                goto next;
> > +
> > +                        if ((load / 2) > env->imbalance)
> > +                                goto next;
> > +
> > +                        env->imbalance -= load;
> > +                } else if (env->src_grp_type == migrate_util) {
> > +                        unsigned long util = task_util_est(p);
> > +
> > +                        if (util > env->imbalance)
> > +                                goto next;
> > +
> > +                        env->imbalance -= util;
> > +                } else if (env->src_grp_type == migrate_misfit) {
> > +                        unsigned long util = task_util_est(p);
> > +
> > +                        /*
> > +                         * utilization of misfit task might decrease a bit
> > +                         * since it has been recorded. Be conservative in the
> > +                         * condition.
> > +                         */
> > +                        if (2*util < env->imbalance)
> > +                                goto next;
> > +
> > +                        env->imbalance = 0;
> > +                } else {
> > +                        /* Migrate task */
> > +                        env->imbalance--;
> > +                }
> >
> > -                if ((load / 2) > env->imbalance)
> > -                        goto next;
> >
> >                  detach_task(p, env);
> >                  list_add(&p->se.group_node, &env->tasks);
> >
> >                  detached++;
> > -                env->imbalance -= load;
> >
> >  #ifdef CONFIG_PREEMPT
> >                  /*
>
> Still reading through this; maybe something like so instead?

Yes, that looks easier to read.
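
For anyone following along, the net effect per detached task in the
switch() version below boils down to this (a simplified sketch; locking,
can_migrate_task() and the loop bookkeeping are left out, all names are
taken from the patch):

        switch (env->balance_type) {
        case migrate_task:
                /* The imbalance is a plain count of tasks to move. */
                env->imbalance--;
                break;
        case migrate_load:
                /* Classic behaviour: account the task's hierarchical load. */
                env->imbalance -= task_h_load(p);
                break;
        case migrate_util:
                /* Account the task's estimated utilization instead. */
                env->imbalance -= task_util_est(p);
                break;
        case migrate_misfit:
                /* A single misfit task resolves the whole imbalance. */
                env->imbalance = 0;
                break;
        }

All cases except migrate_task also keep a "goto next" filter to skip
tasks that are too big (load/util above the remaining imbalance) or too
small (the LB_MIN and misfit checks) to be worth detaching.
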
>
> ---
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7057,7 +7057,7 @@ enum group_type {
>          group_overloaded,
>  };
>
> -enum group_migration {
> +enum migration_type {
>          migrate_task = 0,
>          migrate_util,
>          migrate_load,
> @@ -7094,7 +7094,7 @@ struct lb_env {
>          unsigned int            loop_max;
>
>          enum fbq_type           fbq_type;
> -        enum group_migration    src_grp_type;
> +        enum migration_type     balance_type;
>          struct list_head        tasks;
>  };
>
> @@ -7325,6 +7325,7 @@ static const unsigned int sched_nr_migra
>  static int detach_tasks(struct lb_env *env)
>  {
>          struct list_head *tasks = &env->src_rq->cfs_tasks;
> +        unsigned long load, util;
>          struct task_struct *p;
>          int detached = 0;
>
> @@ -7358,8 +7359,14 @@ static int detach_tasks(struct lb_env *e
>                  if (!can_migrate_task(p, env))
>                          goto next;
>
> -                if (env->src_grp_type == migrate_load) {
> -                        unsigned long load = task_h_load(p);
> +                switch (env->balance_type) {
> +                case migrate_task:
> +                        /* Migrate task */
> +                        env->imbalance--;
> +                        break;
> +
> +                case migrate_load:
> +                        load = task_h_load(p);
>
>                          if (sched_feat(LB_MIN) &&
>                              load < 16 && !env->sd->nr_balance_failed)
> @@ -7369,15 +7376,20 @@ static int detach_tasks(struct lb_env *e
>                                  goto next;
>
>                          env->imbalance -= load;
> -                } else if (env->src_grp_type == migrate_util) {
> -                        unsigned long util = task_util_est(p);
> +                        break;
> +
> +                case migrate_util:
> +                        util = task_util_est(p);
>
>                          if (util > env->imbalance)
>                                  goto next;
>
>                          env->imbalance -= util;
> -                } else if (env->src_grp_type == migrate_misfit) {
> -                        unsigned long util = task_util_est(p);
> +                        break;
> +
> +
> +                case migrate_misfit:
> +                        util = task_util_est(p);
>
>                          /*
>                           * utilization of misfit task might decrease a bit
> @@ -7388,9 +7400,7 @@ static int detach_tasks(struct lb_env *e
>                                  goto next;
>
>                          env->imbalance = 0;
> -                } else {
> -                        /* Migrate task */
> -                        env->imbalance--;
> +                        break;
>                  }
>
>
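
A note on the misfit case, since the comment above is terse: the
"2*util < env->imbalance" test only skips a task whose current
task_util_est() has dropped below half of the recorded misfit load.
With made-up numbers: at env->imbalance = 800, a task now at util 300
is skipped (600 < 800), while a task at util 450 is detached and clears
the imbalance.
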
> @@ -8311,7 +8321,7 @@ static inline void calculate_imbalance(s
>                   * In case of asym capacity, we will try to migrate all load
>                   * to the preferred CPU
>                   */
> -                env->src_grp_type = migrate_load;
> +                env->balance_type = migrate_load;
>                  env->imbalance = busiest->group_load;
>                  return;
>          }
> @@ -8323,14 +8333,14 @@ static inline void calculate_imbalance(s
>                   * the imbalance. The next load balance will take care of
>                   * balancing back the system.
>                   */
> -                env->src_grp_type = migrate_task;
> +                env->balance_type = migrate_task;
>                  env->imbalance = 1;
>                  return;
>          }
>
>          if (busiest->group_type == group_misfit_task) {
>                  /* Set imbalance to allow misfit task to be balanced. */
> -                env->src_grp_type = migrate_misfit;
> +                env->balance_type = migrate_misfit;
>                  env->imbalance = busiest->group_misfit_task_load;
>                  return;
>          }
> @@ -8346,7 +8356,7 @@ static inline void calculate_imbalance(s
>                   * If there is no overload, we just want to even the number of
>                   * idle cpus.
>                   */
> -                env->src_grp_type = migrate_task;
> +                env->balance_type = migrate_task;
>                  imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
>
>                  if (sds->prefer_sibling)
> @@ -8365,7 +8375,7 @@ static inline void calculate_imbalance(s
>                           * amount of load to migrate in order to balance the
>                           * system.
>                           */
> -                        env->src_grp_type = migrate_util;
> +                        env->balance_type = migrate_util;
>                          imbalance = max(local->group_capacity, local->group_util) -
>                                      local->group_util;
>                  }
> @@ -8399,7 +8409,7 @@ static inline void calculate_imbalance(s
>           * don't want to reduce the group load below the group capacity.
>           * Thus we look for the minimum possible imbalance.
>           */
> -        env->src_grp_type = migrate_load;
> +        env->balance_type = migrate_load;
>          env->imbalance = min(
>                  (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
>                  (sds->avg_load - local->avg_load) * local->group_capacity
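
The imbalance units change per case accordingly. migrate_task counts
tasks: with, say, local->idle_cpus = 6 and busiest->idle_cpus = 2, we
get imbalance = max_t(long, 0, (6 - 2) >> 1) = 2, i.e. pull two tasks
to even out the idle CPUs. migrate_util counts utilization: with
local->group_capacity = 1024 and local->group_util = 768, imbalance =
max(1024, 768) - 768 = 256, i.e. pull utilization until the local group
is full. (Numbers are made up, just to illustrate the formulas above.)
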
> @@ -8597,7 +8607,7 @@ static struct rq *find_busiest_queue(str
>                   * For ASYM_CPUCAPACITY domains with misfit tasks we simply
>                   * seek the "biggest" misfit task.
>                   */
> -                if (env->src_grp_type == migrate_misfit) {
> +                if (env->balance_type == migrate_misfit) {
>                          if (rq->misfit_task_load > busiest_load) {
>                                  busiest_load = rq->misfit_task_load;
>                                  busiest = rq;
> @@ -8619,7 +8629,7 @@ static struct rq *find_busiest_queue(str
>                      rq->nr_running == 1)
>                          continue;
>
> -                if (env->src_grp_type == migrate_task) {
> +                if (env->balance_type == migrate_task) {
>                          nr_running = rq->cfs.h_nr_running;
>
>                          if (busiest_nr < nr_running) {
> @@ -8630,7 +8640,7 @@ static struct rq *find_busiest_queue(str
>                          continue;
>                  }
>
> -                if (env->src_grp_type == migrate_util) {
> +                if (env->balance_type == migrate_util) {
>                          util = cpu_util(cpu_of(rq));
>
>                          if (busiest_util < util) {
> @@ -8711,7 +8721,7 @@ voluntary_active_balance(struct lb_env *
>                  return 1;
>          }
>
> -        if (env->src_grp_type == migrate_misfit)
> +        if (env->balance_type == migrate_misfit)
>                  return 1;
>
>          return 0;
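
For completeness: find_busiest_queue() then picks the source rq by the
matching metric, i.e. the biggest rq->misfit_task_load for
migrate_misfit, the highest cfs.h_nr_running for migrate_task and the
highest cpu_util() for migrate_util, with migrate_load presumably
keeping the existing load-based pick (not shown in the hunks above).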