Re: [PATCH 3/5] sched/fair: rework load_balance

From: Vincent Guittot
Date: Fri Jul 19 2019 - 10:02:31 EST


On Fri, 19 Jul 2019 at 14:54, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 67f0acd..472959df 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5376,18 +5376,6 @@ static unsigned long capacity_of(int cpu)
> > return cpu_rq(cpu)->cpu_capacity;
> > }
> >
> > -static unsigned long cpu_avg_load_per_task(int cpu)
> > -{
> > - struct rq *rq = cpu_rq(cpu);
> > - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
> > - unsigned long load_avg = cpu_runnable_load(rq);
> > -
> > - if (nr_running)
> > - return load_avg / nr_running;
> > -
> > - return 0;
> > -}
> > -
> > static void record_wakee(struct task_struct *p)
> > {
> > /*
>
> > @@ -7646,7 +7669,6 @@ static unsigned long task_h_load(struct task_struct *p)
> > struct sg_lb_stats {
> > unsigned long avg_load; /*Avg load across the CPUs of the group */
> > unsigned long group_load; /* Total load over the CPUs of the group */
> > - unsigned long load_per_task;
> > unsigned long group_capacity;
> > unsigned long group_util; /* Total utilization of the group */
> > unsigned int sum_nr_running; /* Nr tasks running in the group */
>
>
> > @@ -8266,76 +8293,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> > }
> >
> > /**
> > - * fix_small_imbalance - Calculate the minor imbalance that exists
> > - * amongst the groups of a sched_domain, during
> > - * load balancing.
> > - * @env: The load balancing environment.
> > - * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
> > - */
> > -static inline
> > -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
> > -{
> > - unsigned long tmp, capa_now = 0, capa_move = 0;
> > - unsigned int imbn = 2;
> > - unsigned long scaled_busy_load_per_task;
> > - struct sg_lb_stats *local, *busiest;
> > -
> > - local = &sds->local_stat;
> > - busiest = &sds->busiest_stat;
> > -
> > - if (!local->sum_h_nr_running)
> > - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
> > - else if (busiest->load_per_task > local->load_per_task)
> > - imbn = 1;
> > -
> > - scaled_busy_load_per_task =
> > - (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
> > - busiest->group_capacity;
> > -
> > - if (busiest->avg_load + scaled_busy_load_per_task >=
> > - local->avg_load + (scaled_busy_load_per_task * imbn)) {
> > - env->imbalance = busiest->load_per_task;
> > - return;
> > - }
> > -
> > - /*
> > - * OK, we don't have enough imbalance to justify moving tasks,
> > - * however we may be able to increase total CPU capacity used by
> > - * moving them.
> > - */
> > -
> > - capa_now += busiest->group_capacity *
> > - min(busiest->load_per_task, busiest->avg_load);
> > - capa_now += local->group_capacity *
> > - min(local->load_per_task, local->avg_load);
> > - capa_now /= SCHED_CAPACITY_SCALE;
> > -
> > - /* Amount of load we'd subtract */
> > - if (busiest->avg_load > scaled_busy_load_per_task) {
> > - capa_move += busiest->group_capacity *
> > - min(busiest->load_per_task,
> > - busiest->avg_load - scaled_busy_load_per_task);
> > - }
> > -
> > - /* Amount of load we'd add */
> > - if (busiest->avg_load * busiest->group_capacity <
> > - busiest->load_per_task * SCHED_CAPACITY_SCALE) {
> > - tmp = (busiest->avg_load * busiest->group_capacity) /
> > - local->group_capacity;
> > - } else {
> > - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
> > - local->group_capacity;
> > - }
> > - capa_move += local->group_capacity *
> > - min(local->load_per_task, local->avg_load + tmp);
> > - capa_move /= SCHED_CAPACITY_SCALE;
> > -
> > - /* Move if we gain throughput */
> > - if (capa_move > capa_now)
> > - env->imbalance = busiest->load_per_task;
> > -}
> > -
> > -/**
> > * calculate_imbalance - Calculate the amount of imbalance present within the
> > * groups of a given sched_domain during load balance.
> > * @env: load balance environment
>
> Maybe strip this out first, in a separate patch. It's all magic doo-doo.

If I remove that 1st, we will have commit in the branch that might
regress some performance temporarily and bisect might spot it while
looking for a culprit, isn't it ?