Re: [PATCH v3 3/3] sched: update blocked load when newly idle
From: Peter Zijlstra
Date: Mon Feb 12 2018 - 07:04:22 EST
On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> !this_rq->rd->overload) {
> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> + unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> if (sd)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
>
> + /*
> + * Update blocked idle load if it has not been done for a
> + * while. Try to do it locally before entering idle but kick a
> + * ilb if it takes too much time and/or might delay next local
> + * wake up
> + */
> + if (has_blocked && time_after_eq(jiffies, next_blocked) &&
> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
> + kick_ilb(NOHZ_STATS_KICK);
> +
> goto out;
> }
So I really hate this one, also I suspect its broken, because we do this
check before dropping rq->lock and _nohz_idle_balance() will take
rq->lock.
Aside from the above being an unreadable mess, I dislike that it breaks
the various isolation crud, we should not touch CPUs outside of our
domain.
Maybe something like the below? (unfinished)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7839,7 +7839,7 @@ group_type group_classify(struct sched_g
return group_other;
}
-static bool update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq, bool force)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
- if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+ if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
update_blocked_averages(cpu);
@@ -7883,7 +7883,7 @@ static inline void update_sg_lb_stats(st
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
/* Bias balancing toward cpus of our domain */
@@ -8857,6 +8857,29 @@ update_next_balance(struct sched_domain
*next_balance = next;
}
+static int nohz_age(struct sched_domain *sd)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+ bool has_blocked_load;
+
+ WRITE_ONCE(nohz.has_blocked, 0);
+
+ smp_mb();
+
+ cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
+
+ has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
+
+ for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
+ has_blocked_load |= update_nohz_stats(rq, true);
+ }
+
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -8868,6 +8891,7 @@ static int idle_balance(struct rq *this_
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ bool nohz_blocked = false;
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
@@ -8889,8 +8913,8 @@ static int idle_balance(struct rq *this_
*/
rq_unpin_lock(this_rq, rf);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+short_idle:
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -8900,6 +8924,18 @@ static int idle_balance(struct rq *this_
goto out;
}
+ if (!this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned int has_blocked = READ_ONCE(nohz.has_blocked);
+ unsigned long next_blocked = READ_ONE(nohz.next_blocked);
+
+ if (has_blocked && time_after_eq(jiffies, next_blocked))
+ nohz_blocked = true;
+ else
+#endif
+ goto short_idle;
+ }
+
raw_spin_unlock(&this_rq->lock);
update_blocked_averages(this_cpu);
@@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
+ if (nohz_blocked) {
+ nohz_age(sd);
+ } else {
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+ }
domain_cost = sched_clock_cpu(this_cpu) - t0;
if (domain_cost > sd->max_newidle_lb_cost)
@@ -9497,8 +9537,7 @@ static bool nohz_idle_balance(struct rq
rq = cpu_rq(balance_cpu);
- update_blocked_averages(rq->cpu);
- has_blocked_load |= rq->has_blocked_load;
+ has_blocked_load |= update_nohz_stats(rq, true);
/*
* If time for next balance is due,