[tip: sched/core] sched/fair: Wait before decaying max_newidle_lb_cost

From: tip-bot2 for Vincent Guittot
Date: Sun Oct 31 2021 - 06:21:29 EST


The following commit has been merged into the sched/core branch of tip:

Commit-ID: e60b56e46b384cee1ad34e6adc164d883049c6c3
Gitweb: https://git.kernel.org/tip/e60b56e46b384cee1ad34e6adc164d883049c6c3
Author: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
AuthorDate: Tue, 19 Oct 2021 14:35:35 +02:00
Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Sun, 31 Oct 2021 11:11:38 +01:00

sched/fair: Wait before decaying max_newidle_lb_cost

Decay max_newidle_lb_cost only when it has not been updated for a while
and ensure to not decay a recently changed value.

Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Acked-by: Mel Gorman <mgorman@xxxxxxx>
Link: https://lore.kernel.org/r/20211019123537.17146-4-vincent.guittot@xxxxxxxxxx
---
include/linux/sched/topology.h | 2 +-
kernel/sched/fair.c | 36 ++++++++++++++++++++++++---------
kernel/sched/topology.c | 2 +-
3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2f9166f..c07bfa2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -105,7 +105,7 @@ struct sched_domain {

/* idle_balance() stats */
u64 max_newidle_lb_cost;
- unsigned long next_decay_max_lb_cost;
+ unsigned long last_decay_max_lb_cost;

u64 avg_scan_cost; /* select_idle_sibling */

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c4c3686..e50fd75 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10239,6 +10239,30 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}

+static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+{
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = jiffies;
+ } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+ sd->last_decay_max_lb_cost = jiffies;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -10262,14 +10286,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
+ * visit to all the domains.
*/
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
+ need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;

/*
@@ -10911,8 +10930,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
+ update_newidle_cost(sd, domain_cost);

curr_cost += domain_cost;
t0 = t1;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e812467..30169c7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1568,7 +1568,7 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
.max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
+ .last_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,