[PATCH v2 2/2] sched/fair: Update blocked load from newly idle balance

From: Brendan Jackman
Date: Fri Dec 01 2017 - 13:02:10 EST


We now have a NOHZ kick to avoid the load of idle CPUs becoming
stale. This is good, but it brings about CPU wakeups, which have an
energy cost. As an alternative to waking CPUs up just to decay their
blocked load, we can sometimes do it from newly idle balance. If the
newly idle balance is on a domain that covers all the currently
nohz-idle CPUs, we push nohz.next_update into the future. That means
that if such newly idle balances happen often enough, we never need
to wake up a CPU just to update load.
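
In rough C, the idea is as follows. This is only a condensed sketch
built from the names used in the patch below (nohz.idle_cpus_mask,
nohz.next_update, sched_domain_span(), LOAD_AVG_PERIOD); the real hunk
in update_sd_lb_stats() is structured slightly differently and adds a
per-rq guard, described next:

	if (env->idle == CPU_NEWLY_IDLE) {
		int cpu;

		/* Decay the blocked load of each nohz-idle CPU in this domain */
		for_each_cpu_and(cpu, sched_domain_span(env->sd),
				 nohz.idle_cpus_mask)
			update_blocked_averages(cpu);

		/*
		 * If the domain spans every nohz-idle CPU, all of their load
		 * is now fresh, so the nohz stats kick can be deferred.
		 */
		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
			nohz.next_update = jiffies + LOAD_AVG_PERIOD;
	}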

Since we're doing this new update inside a for_each_domain() loop, we
need to avoid updating the same CPU more than once during a single
idle_balance(). To that end, a tick stamp is set on the rq in
update_blocked_averages(). Using a simple jiffies-based timestamp, as
opposed to the last_update_time of the root cfs_rq's sched_avg, means
the check can be done without taking the rq lock.
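
Concretely, the handshake is roughly the following (a minimal sketch;
both halves appear in the diff below):

	/* Stamped at the end of update_blocked_averages(), under the rq lock */
	rq->last_blocked_load_update_tick = jiffies;

	/* Checked locklessly from newly idle balance before updating again */
	if (time_after(jiffies, rq->last_blocked_load_update_tick))
		update_blocked_averages(cpu);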

Change-Id: If9d4e14d7b4da86e05474b5c125d91d9b47f9e93
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Brendan Jackman <brendan.jackman@xxxxxxx>
---
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 1 +
3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..715d2b9ea7e2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5949,6 +5949,7 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
+		rq->last_blocked_load_update_tick = jiffies;
 		rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f83e8f0d4f06..011dcfa1bb5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7343,6 +7343,9 @@ static void update_blocked_averages(int cpu)
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
 	}
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }

@@ -7402,6 +7405,9 @@ static inline void update_blocked_averages(int cpu)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }

@@ -7896,6 +7902,15 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_NO_HZ_COMMON
+static struct {
+	cpumask_var_t idle_cpus_mask;
+	atomic_t nr_cpus;
+	unsigned long next_balance; /* in jiffy units */
+	unsigned long next_update; /* in jiffy units */
+} nohz ____cacheline_aligned;
+#endif
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
@@ -7913,6 +7928,29 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if (env->idle == CPU_NEWLY_IDLE) {
+		int cpu;
+
+		/* Update the stats of NOHZ idle CPUs in the sd */
+		for_each_cpu_and(cpu, sched_domain_span(env->sd),
+				 nohz.idle_cpus_mask) {
+			struct rq *rq = cpu_rq(cpu);
+
+			/* ... Unless we've already done so since the last tick */
+			if (time_after(jiffies, rq->last_blocked_load_update_tick))
+				update_blocked_averages(cpu);
+		}
+	}
+	/*
+	 * If we've just updated all of the NOHZ idle CPUs, we can push back
+	 * nohz.next_update, which will prevent an unnecessary wakeup for the
+	 * nohz stats kick.
+	 */
+	if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
+		nohz.next_update = jiffies + LOAD_AVG_PERIOD;
+#endif
+
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
@@ -8931,12 +8969,6 @@ static inline int on_null_domain(struct rq *rq)
  * needed, they will kick the idle load balancer, which then does idle
  * load balancing for all the idle CPUs.
  */
-static struct {
-	cpumask_var_t idle_cpus_mask;
-	atomic_t nr_cpus;
-	unsigned long next_balance; /* in jiffy units */
-	unsigned long next_update; /* in jiffy units */
-} nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7e48b6ee68c..40fa579a4649 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -696,6 +696,7 @@ struct rq {
 #ifdef CONFIG_NO_HZ_COMMON
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
+	unsigned long last_blocked_load_update_tick;
 #endif /* CONFIG_SMP */
 	unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
--
2.14.1