Re: [PATCH 1/7 v5] sched/fair: Filter false overloaded_group case for EAS
From: Vincent Guittot
Date: Wed Mar 05 2025 - 03:13:41 EST
Hi Prateek,
On Tue, 4 Mar 2025 at 05:38, K Prateek Nayak <kprateek.nayak@xxxxxxx> wrote:
>
> Hello Vincent,
>
> On 3/3/2025 2:35 AM, Vincent Guittot wrote:
> > With EAS, a group should be set overloaded if at least 1 CPU in the group
> > is overutilized, but it can happen that a CPU is fully utilized by tasks
> > because of clamping the compute capacity of the CPU. In such a case, the CPU
> > is not overutilized and as a result should not be set overloaded either.
> >
> > group_overloaded being a higher priority than group_misfit, such a group can
> > be selected as the busiest group instead of a group with a misfit task
> > and prevent load_balance from selecting the CPU with the misfit task to pull
> > the latter onto a fitting CPU.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> > Tested-by: Pierre Gondois <pierre.gondois@xxxxxxx>
> > ---
> > kernel/sched/fair.c | 12 +++++++++++-
> > 1 file changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 857808da23d8..d3d1a2ba6b1a 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9931,6 +9931,7 @@ struct sg_lb_stats {
> > unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
> > unsigned int group_smt_balance; /* Task on busy SMT be moved */
> > unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
> > + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
> > #ifdef CONFIG_NUMA_BALANCING
> > unsigned int nr_numa_running;
> > unsigned int nr_preferred_running;
> > @@ -10163,6 +10164,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> > static inline bool
> > group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> > {
> > + /*
> > + * With EAS and uclamp, 1 CPU in the group must be overutilized to
> > + * consider the group overloaded.
> > + */
> > + if (sched_energy_enabled() && !sgs->group_overutilized)
> > + return false;
> > +
> > if (sgs->sum_nr_running <= sgs->group_weight)
> > return false;
> >
> > @@ -10374,8 +10382,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> > nr_running = rq->nr_running;
> > sgs->sum_nr_running += nr_running;
> >
> > - if (cpu_overutilized(i))
> > + if (cpu_overutilized(i)) {
> > *sg_overutilized = 1;
>
> Since sgs->group_overutilized is tracking the overutilized status, can we
> avoid passing the "sg_overutilized" pointer to update_sg_lb_stats() and
> just use sgs->group_overutilized in update_sd_lb_stats()?
Yes, that makes sense.
>
> Something like below:
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 857808da23d8..de4a7e19d383 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10346,14 +10346,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
> * @group: sched_group whose statistics are to be updated.
> * @sgs: variable to hold the statistics for this group.
> * @sg_overloaded: sched_group is overloaded
> - * @sg_overutilized: sched_group is overutilized
> */
> static inline void update_sg_lb_stats(struct lb_env *env,
> struct sd_lb_stats *sds,
> struct sched_group *group,
> struct sg_lb_stats *sgs,
> - bool *sg_overloaded,
> - bool *sg_overutilized)
> + bool *sg_overloaded)
> {
> int i, nr_running, local_group, sd_flags = env->sd->flags;
> bool balancing_at_rd = !env->sd->parent;
> @@ -10375,7 +10373,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> sgs->sum_nr_running += nr_running;
>
> if (cpu_overutilized(i))
> - *sg_overutilized = 1;
> + sgs->group_overutilized = 1;
>
> /*
> * No need to call idle_cpu() if nr_running is not 0
> @@ -11046,7 +11044,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> update_group_capacity(env->sd, env->dst_cpu);
> }
>
> - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
> + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
>
> if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
> sds->busiest = sg;
> @@ -11056,6 +11054,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> /* Now, start updating sd_lb_stats */
> sds->total_load += sgs->group_load;
> sds->total_capacity += sgs->group_capacity;
> + sg_overutilized |= sgs->group_overutilized;
>
> sum_util += sgs->group_util;
> sg = sg->next;
> --
> Thanks and Regards,
> Prateek
>
> > + sgs->group_overutilized = 1;
> > + }
> >
> > /*
> > * No need to call idle_cpu() if nr_running is not 0
>