On 3/13/2025 5:37 PM, K Prateek Nayak wrote:
The load balancer will start caching the sg_lb_stats during load
balancing and propagate it up the sched domain hierarchy in the
subsequent commits.
Increase the probability that load balancing intervals across domains are
aligned, to improve the reuse efficiency of the propagated stats.
Go one step further and proactively explore balancing at a higher domain
if the next update time for a higher domain is before the next update
time for its children.
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/fair.c | 18 +++++++-----------
1 file changed, 7 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b1ed14e4b5e..60517a732c10 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11956,15 +11956,6 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
-
- /*
- * Reduce likelihood of busy balancing at higher domains racing with
- * balancing at lower domains by preventing their balancing periods
- * from being multiples of each other.
- */
- if (cpu_busy)
- interval -= 1;
-
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -12126,7 +12117,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
- unsigned long interval;
+ unsigned long interval, prev_sd_next_balance = 0;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
@@ -12136,6 +12127,8 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock();
for_each_domain(cpu, sd) {
+ unsigned long next_interval;
+
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains.
@@ -12162,7 +12155,9 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
goto out;
}
- if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ next_interval = sd->last_balance + interval;
+ if (time_after_eq(jiffies, next_interval) ||
+ (prev_sd_next_balance && time_after(prev_sd_next_balance, next_interval))) {
Should this condition instead be
(prev_sd_next_balance && time_after(jiffies, prev_sd_next_balance))?
thanks,
Chenyu
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
@@ -12174,6 +12169,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
+ prev_sd_next_balance = sd->last_balance + interval;
}
if (need_serialize)
atomic_set_release(&sched_balance_running, 0);