[PATCH 5.11 210/775] sched/fair: Avoid stale CPU util_est value for schedutil in task dequeue

From: Greg Kroah-Hartman
Date: Mon Mar 01 2021 - 19:59:39 EST


From: Xuewen Yan <xuewen.yan@xxxxxxxxxx>

[ Upstream commit 8c1f560c1ea3f19e22ba356f62680d9d449c9ec2 ]

CPU (root cfs_rq) estimated utilization (util_est) is currently used in
dequeue_task_fair() to drive frequency selection before it is updated.

with:

CPU_util : rq->cfs.avg.util_avg
CPU_util_est : rq->cfs.avg.util_est
CPU_utilization : max(CPU_util, CPU_util_est)
task_util : p->se.avg.util_avg
task_util_est : p->se.avg.util_est

dequeue_task_fair():

/* (1) CPU_util and task_util update + inform schedutil about
CPU_utilization changes */
for_each_sched_entity() /* 2 loops */
(dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
-> cpufreq_update_util() ->...-> sugov_update_[shared\|single]
-> sugov_get_util() -> cpu_util_cfs()

/* (2) CPU_util_est and task_util_est update */
util_est_dequeue()

cpu_util_cfs() uses CPU_utilization which could lead to a false (too
high) utilization value for schedutil in task ramp-down or ramp-up
scenarios during task dequeue.

To mitigate the issue split the util_est update (2) into:

(A) CPU_util_est update in util_est_dequeue()
(B) task_util_est update in util_est_update()

Place (A) before (1) and keep (B) where (2) is. The latter is necessary
since (B) relies on task_util update in (1).

Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")
Signed-off-by: Xuewen Yan <xuewen.yan@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Reviewed-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Link: https://lkml.kernel.org/r/1608283672-18240-1-git-send-email-xuewen.yan94@xxxxxxxxx
Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
---
kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++---------------
1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04a3ce20da671..6918adaf74150 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3943,6 +3943,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}

+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
@@ -3956,23 +3972,16 @@ static inline bool within_margin(int value, int margin)
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
- int cpu;

if (!sched_feat(UTIL_EST))
return;

- /* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
-
- trace_sched_util_est_cfs_tp(cfs_rq);
-
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4012,8 +4021,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;

/*
@@ -4096,8 +4104,11 @@ static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}

static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}

#endif /* CONFIG_SMP */
@@ -5609,6 +5620,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);

+ util_est_dequeue(&rq->cfs, p);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -5659,7 +5672,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;

dequeue_throttle:
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}

--
2.27.0