Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

From: Dietmar Eggemann
Date: Mon Jan 22 2018 - 04:40:52 EST


On 01/15/2018 08:26 AM, Vincent Guittot wrote:
On Wednesday 03 Jan 2018 at 10:16:00 (+0100), Vincent Guittot wrote:
Hi Peter,

On 22 December 2017 at 21:42, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
Right; but I figured we'd try and do it 'right' and see how horrible it
is before we try and do funny things.

So now it should have a 32ms tick for up to .5s when the system goes
completely idle.

No idea how bad that is..
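
For reference, a back-of-the-envelope estimate of the .5s figure, assuming the
PELT half-life of 32ms:

	512ms / 32ms = 16 half-lives
	remaining blocked load ~ (1/2)^16 ~ 0.0015%

i.e. after roughly half a second the blocked load can be treated as fully
decayed and the periodic 32ms wake-up can stop.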

I have tested your branch, but the timer doesn't seem to fire correctly
because I can still see blocked load in the use case I have run.
I haven't found the reason yet.

Isn't the issue with the timer-based implementation that rq->has_blocked_load is never set to 1?

That is something you changed in your implementation by adding rq->has_blocked_load = 1 in nohz_balance_enter_idle().
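
For reference, the line I mean (illustration only; the actual hunk is in the
patch further down):

	void nohz_balance_enter_idle(int cpu)
	{
		struct rq *rq = cpu_rq(cpu);
		...
		rq->nohz_tick_stopped = 1;
		/* mark this rq as still carrying blocked load to decay */
		rq->has_blocked_load = 1;
		...
	}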


Hi Peter,

With the patch below on top of your branch, the blocked loads are updated and
decayed regularly. The main differences are:
- It doesn't use a timer to trigger the ilb but relies on the tick and on a cpu
becoming newly idle. The main drawback of this solution is that the load stays
blocked while the system is fully idle, with the advantage of not waking up a
fully idle system. We have to wait for the next tick or the next newly idle
event to update the blocked load once the system leaves the idle state, which
can take up to a tick. If this is too long, we can check whether to kick the
ilb when a task wakes up, so the blocked load is updated as soon as the system
leaves the idle state. The main advantage is that we don't wake up a fully idle
system every 32ms to update blocked load that will not be used.
- I'm working on one more improvement: use nohz_idle_balance() in the newly
idle case when the system is not overloaded and
(this_rq->avg_idle > sysctl_sched_migration_cost). In that case, we can try to
run nohz_idle_balance() with NOHZ_STATS_KICK and abort as soon as it exceeds
this_rq->avg_idle. This will remove some calls to kick_ilb() and some wake-ups
of idle cpus; a rough sketch follows below.
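
To make that second point concrete, here is a minimal sketch of how the newly
idle path could look. Only the surrounding names come from the existing code;
the _nohz_idle_balance() helper taking an abort budget is hypothetical and not
part of the patch below:

	static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
	{
		...
		if (!this_rq->rd->overload &&
		    this_rq->avg_idle > sysctl_sched_migration_cost) {
			/*
			 * No balancing work to do, but some idle cpus may
			 * still carry blocked load: decay it from here
			 * instead of kicking an ilb, and abort as soon as
			 * we have spent more than this_rq->avg_idle ns.
			 */
			_nohz_idle_balance(this_rq, NOHZ_STATS_KICK,
					   this_rq->avg_idle);
			goto out;
		}
		...
	}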

---
kernel/sched/fair.c | 72 +++++++++++++++++++++++++----------------------------
1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52114c6..898785d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5386,7 +5386,6 @@ static struct {
atomic_t stats_state;
unsigned long next_balance; /* in jiffy units */
unsigned long next_stats;
- struct timer_list timer;
} nohz ____cacheline_aligned;
#endif /* CONFIG_NO_HZ_COMMON */
@@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
prefer_sibling = 1;
#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
env->flags |= LBF_NOHZ_STATS;
+ }
#endif
load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
*next_balance = next;
}
+static void kick_ilb(unsigned int flags);
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
+ unsigned long next = READ_ONCE(nohz.next_stats);
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
+ if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
+ kick_ilb(NOHZ_STATS_KICK);
+
goto out;
}
@@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
smp_send_reschedule(ilb_cpu);
}
-void nohz_balance_timer(struct timer_list *timer)
-{
- unsigned long next = READ_ONCE(nohz.next_stats);
-
- if (time_before(jiffies, next)) {
- mod_timer(timer, next);
- return;
- }
-
- kick_ilb(NOHZ_STATS_KICK);
-}
-
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
+ if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
+ flags = NOHZ_STATS_KICK;
+
if (time_before(now, nohz.next_balance))
goto out;
@@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned int val, new;
SCHED_WARN_ON(cpu != smp_processor_id());
@@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
return;
rq->nohz_tick_stopped = 1;
+ rq->has_blocked_load = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
@@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
set_cpu_sd_state_idle(cpu);
/*
- * implies a barrier such that if the stats_state update is observed
- * the above updates are also visible. Pairs with stuff in
- * update_sd_lb_stats() and nohz_idle_balance().
+ * Each time a cpu enters idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
*/
- val = atomic_read(&nohz.stats_state);
- do {
- new = val + 2;
- new |= 1;
- } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
+ atomic_set(&nohz.stats_state, 1);
- /*
- * If the timer was stopped, restart the thing.
- */
- if (!(val & 1))
- mod_timer(&nohz.timer, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int stats_seq;
unsigned int flags;
int balance_cpu;
struct rq *rq;
@@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return false;
}
- stats_seq = atomic_read(&nohz.stats_state);
/*
* barrier, pairs with nohz_balance_enter_idle(), ensures ...
*/
@@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ /*
+ * We assume there will be no idle load after this update and clear
+ * the stats state. If a cpu enters idle in the meantime, it will
+ * set the stats state and trigger another update of idle load.
+ * Because a cpu that becomes idle is added to idle_cpus_mask before
+ * setting the stats state, we are sure not to clear the state and
+ * miss the load of an idle cpu.
+ */
+ atomic_set(&nohz.stats_state, 0);
+
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched())
+ if (need_resched()) {
+ has_blocked_load = true;
break;
+ }
rq = cpu_rq(balance_cpu);
@@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
- if (has_blocked_load ||
- !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
- WRITE_ONCE(nohz.next_stats,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
- mod_timer(&nohz.timer, nohz.next_stats);
- }
+ WRITE_ONCE(nohz.next_stats,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ atomic_set(&nohz.stats_state, 1);
/*
* next_balance will be updated only when there is a need.
@@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
nohz.next_balance = jiffies;
nohz.next_stats = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- timer_setup(&nohz.timer, nohz_balance_timer, 0);
#endif
#endif /* SMP */