Re: [PATCH v4 7/7] sched/fair: de-entropy for SIS filter

From: Chen Yu
Date: Tue Jun 21 2022 - 14:23:57 EST


On Sun, Jun 19, 2022 at 08:04:51PM +0800, Abel Wu wrote:
> Now when updating core state, there are two main problems that could
> pollute the SIS filter:
>
> - The updating is before task migration, so if dst_cpu is
> selected to be propagated which might be fed with tasks
> soon, the efforts we paid is no more than setting a busy
> cpu into the SIS filter. While on the other hand it is
> important that we update as early as possible to keep the
> filter fresh, so it's not wise to delay the update to the
> end of load balancing.
>
> - False negative propagation hurts performance since some
> idle cpus could be out of reach. So in general we will
> aggressively propagate idle cpus but allow false positive
> continue to exist for a while, which may lead to filter
> being fully polluted.
>
> Pains can be relieved by a force correction when false positive
> continuously detected.
>
> Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
> ---
> include/linux/sched/topology.h | 7 +++++
> kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++--
> 2 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index b93edf587d84..e3552ce192a9 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -91,6 +91,12 @@ struct sched_group;
> * search, and is also used as a fallback state of the other
> * states.
> *
> + * - sd_may_idle
> + * This state implies the unstableness of the SIS filter, and
> + * some bits of it may out of date. This state is only used in
> + * SMT domains as an intermediate state between sd_has_icpus
> + * and sd_is_busy.
> + *
> * - sd_is_busy
> * This state indicates there are no unoccupied cpus in this
> * domain. So for LLC domains, it gives the hint on whether
> @@ -111,6 +117,7 @@ struct sched_group;
> enum sd_state {
> sd_has_icores,
> sd_has_icpus,
> + sd_may_idle,
> sd_is_busy
> };
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d55fdcedf2c0..9713d183d35e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8768,6 +8768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
> + bool update = update_core && (env->dst_cpu != i);
>
> sgs->group_load += cpu_load(rq);
> sgs->group_util += cpu_util_cfs(i);
> @@ -8777,7 +8778,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> nr_running = rq->nr_running;
> sgs->sum_nr_running += nr_running;
>
> - if (update_core)
> + /*
> + * The dst_cpu is not preferred since it might
> + * be fed with tasks soon.
> + */
> + if (update)
maybe if (update_core && (env->dst_cpu != i)) so that the comment would
be near the code logic and meanwhile without introducing a update variable?
> sd_classify(sds, rq, i);
>
> if (nr_running > 1)
> @@ -8801,7 +8806,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> * and fed with tasks, so we'd better choose
> * a candidate in an opposite way.
> */
> - sds->idle_cpu = i;
> + if (update)
> + sds->idle_cpu = i;
> sgs->idle_cpus++;
>
> /* Idle cpu can't have misfit task */
> @@ -9321,7 +9327,7 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
> {
> struct sched_domain_shared *sd_smt_shared = env->sd->shared;
> enum sd_state new = sds->sd_state;
> - int this = env->dst_cpu;
> + int icpu = sds->idle_cpu, this = env->dst_cpu;
>
> /*
> * Parallel updating can hardly contribute accuracy to
> @@ -9331,6 +9337,22 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
> if (cmpxchg(&sd_smt_shared->updating, 0, 1))
> return;
>
> + /*
> + * The dst_cpu is likely to be fed with tasks soon.
> + * If it is the only unoccupied cpu in this domain,
> + * we still handle it the same way as as_has_icpus
> + * but turn the SMT into the unstable state, rather
> + * than waiting to the end of load balancing since
> + * it's also important that update the filter as
> + * early as possible to keep it fresh.
> + */
> + if (new == sd_is_busy) {
> + if (idle_cpu(this) || sched_idle_cpu(this)) {
available_idle_cpu()?

thanks,
Chenyu
> + new = sd_may_idle;
> + icpu = this;
> + }
> + }
> +
> /*
> * There is at least one unoccupied cpu available, so
> * propagate it to the filter to avoid false negative
> @@ -9338,6 +9360,12 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
> * idle cpus thus throughupt downgraded.
> */
> if (new != sd_is_busy) {
> + /*
> + * The sd_may_idle state is taken into
> + * consideration as well because from
> + * here we couldn't actually know task
> + * migrations would happen or not.
> + */
> if (!test_idle_cpus(this))
> set_idle_cpus(this, true);
> } else {
> @@ -9347,9 +9375,26 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
> */
> if (sd_smt_shared->state == sd_is_busy)
> goto out;
> +
> + /*
> + * Allow false positive to exist for some time
> + * to make a tradeoff of accuracy of the filter
> + * for relieving cache traffic.
> + */
> + if (sd_smt_shared->state == sd_has_icpus) {
> + new = sd_may_idle;
> + goto update;
> + }
> +
> + /*
> + * If the false positive issue has already been
> + * there for a while, a correction of the filter
> + * is needed.
> + */
> }
>
> sd_update_icpus(this, sds->idle_cpu);
> +update:
> sd_smt_shared->state = new;
> out:
> xchg(&sd_smt_shared->updating, 0);
> --
> 2.31.1
>