Re: [PATCH 2/2] sched/fair: Update blocked load from newly idle balance

From: Vincent Guittot
Date: Mon Nov 20 2017 - 04:08:19 EST


On 24 October 2017 at 14:25, Brendan Jackman <brendan.jackman@xxxxxxx> wrote:
> We now have a NOHZ kick to avoid the load of idle CPUs becoming stale. This is
> good, but it brings about CPU wakeups, which have an energy cost. As an
> alternative to waking CPUs up to decay blocked load, we can sometimes do it
> from newly idle balance. If the newly idle balance is on a domain that covers
> all the currently nohz-idle CPUs, we push the value of nohz.next_update into the
> future. That means that if such newly idle balances happen often enough, we
> never need to wake up a CPU just to update load.
>
> Since we're doing this new update inside a for_each_domain, we need to do
> something to avoid doing multiple updates on the same CPU in the same
> idle_balance. A tick stamp is set on the rq in update_blocked_averages as a
> simple way to do this. Using a simple jiffies-based timestamp, as opposed to the
> last_update_time of the root cfs_rq's sched_avg, means we can do this without
> taking the rq lock.
>
> Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
> Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Brendan Jackman <brendan.jackman@xxxxxxx>
> ---
> kernel/sched/core.c | 1 +
> kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++------
> kernel/sched/sched.h | 1 +
> 3 files changed, 37 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d17c5da523a0..d8e71fd27806 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5923,6 +5923,7 @@ void __init sched_init(void)
> rq_attach_root(rq, &def_root_domain);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_load_update_tick = jiffies;
> + rq->last_blocked_load_update_tick = jiffies;
> rq->nohz_flags = 0;
> #endif
> #ifdef CONFIG_NO_HZ_FULL
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9085caf49c76..45e9c8056161 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7062,6 +7062,7 @@ static void update_blocked_averages(int cpu)
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> }
> + rq->last_blocked_load_update_tick = jiffies;

last_blocked_load_update_tick is defined under CONFIG_NO_HZ_COMMON and
CONFIG_SMP, whereas update_blocked_averages() is not. This generates a
compilation error.

> rq_unlock_irqrestore(rq, &rf);
> }
>
> @@ -7121,6 +7122,7 @@ static inline void update_blocked_averages(int cpu)
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> + rq->last_blocked_load_update_tick = jiffies;
> rq_unlock_irqrestore(rq, &rf);
> }
>
> @@ -7615,6 +7617,15 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
> }
> #endif /* CONFIG_NUMA_BALANCING */
>
> +#ifdef CONFIG_NO_HZ_COMMON
> +static struct {
> + cpumask_var_t idle_cpus_mask;
> + atomic_t nr_cpus;
> + unsigned long next_balance; /* in jiffy units */
> + unsigned long next_update; /* in jiffy units */
> +} nohz ____cacheline_aligned;
> +#endif
> +
> /**
> * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
> * @env: The load balancing environment.
> @@ -7633,6 +7644,30 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if (env->idle == CPU_NEWLY_IDLE) {
> + int cpu;
> +
> + /* Update the stats of NOHZ idle CPUs in the sd */
> + for_each_cpu_and(cpu, sched_domain_span(env->sd),
> + nohz.idle_cpus_mask) {
> + struct rq *rq = cpu_rq(cpu);
> +
> + /* ... Unless we've already done since the last tick */
> + if (time_after(jiffies,
> + rq->last_blocked_load_update_tick))
> + update_blocked_averages(cpu);
> + }
> + }
> + /*
> + * If we've just updated all of the NOHZ idle CPUs, then we can push
> + * back the next nohz.next_update, which will prevent an unnecessary
> + * wakeup for the nohz stats kick
> + */
> + if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> + nohz.next_update = jiffies + LOAD_AVG_PERIOD;
> +#endif
> +
> load_idx = get_sd_load_idx(env->sd, env->idle);
>
> do {
> @@ -8657,12 +8692,6 @@ static inline int on_null_domain(struct rq *rq)
> * needed, they will kick the idle load balancer, which then does idle
> * load balancing for all the idle CPUs.
> */
> -static struct {
> - cpumask_var_t idle_cpus_mask;
> - atomic_t nr_cpus;
> - unsigned long next_balance; /* in jiffy units */
> - unsigned long next_update; /* in jiffy units */
> -} nohz ____cacheline_aligned;
>
> static inline int find_new_ilb(void)
> {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6f95ef653f73..6be8938bb977 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -681,6 +681,7 @@ struct rq {
> #ifdef CONFIG_NO_HZ_COMMON
> #ifdef CONFIG_SMP
> unsigned long last_load_update_tick;
> + unsigned long last_blocked_load_update_tick;
> #endif /* CONFIG_SMP */
> unsigned long nohz_flags;
> #endif /* CONFIG_NO_HZ_COMMON */
> --
> 2.14.1
>