[PATCH 05/11 v3] sched/fair: Use the new cfs_rq.h_nr_runnable

From: Vincent Guittot
Date: Mon Dec 02 2024 - 12:53:44 EST


Use the new h_nr_runnable, which tracks only the queued and runnable tasks
(i.e. it excludes the delayed dequeued tasks), in the statistics that are
used to balance the system:
- PELT runnable_avg
- deciding if a group is overloaded or has spare capacity
- numa stats
- reduced capacity management
- load balance
- nohz kick

Note that rq->nr_running still counts the delayed dequeued tasks, because
delayed dequeue is a fair-class feature that is meaningless at the core
level.

Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
---
kernel/sched/fair.c | 18 +++++++++---------
kernel/sched/pelt.c | 4 ++--
kernel/sched/sched.h | 7 ++-----
3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46cf1c72598c..e3c89aeda73f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2128,7 +2128,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_queued;
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);

if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -5394,7 +5394,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_queued of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5534,7 +5534,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_queued of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -10335,7 +10335,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_queued != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;

return check_cpu_capacity(rq, sd);
@@ -10370,7 +10370,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_queued;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;

nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
@@ -10685,7 +10685,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;

nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11467,7 +11467,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;

- nr_running = rq->cfs.h_nr_queued;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;

@@ -11626,7 +11626,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_queued == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -12369,7 +12369,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2bad0b508dfc..7a8534a2deff 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_queued
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {

___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 869d5d3521f2..4374c660f5c7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -900,11 +900,8 @@ struct dl_rq {

static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se)) {
- struct cfs_rq *cfs_rq = se->my_q;
-
- se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed;
- }
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_runnable;
}

static inline long se_runnable(struct sched_entity *se)
--
2.43.0