[PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

From: Qais Yousef
Date: Thu Dec 07 2023 - 20:53:05 EST


Due to the way the code is structured, it makes a lot of sense to trigger
cpufreq_update_util() from update_load_avg(). But this is too aggressive,
as in most cases we are iterating through entities in a loop, calling
update_load_avg() for every level of the hierarchy. So we end up sending
too many requests in a single loop as we're updating the hierarchy.

Combined with the rate limit in schedutil, we could end up prematurely
sending a wrong frequency update before we have actually updated all
entities appropriately.
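
For context (not part of this patch), the reason close-in-time requests
get dropped is schedutil's rate limit check, roughly as below (paraphrased
from sugov_should_update_freq() in kernel/sched/cpufreq_schedutil.c, other
checks omitted):

	/*
	 * Any request arriving within freq_update_delay_ns of the
	 * previous one is ignored, so a premature request sent mid-way
	 * through the hierarchy walk can mask the correct one that
	 * follows right after it.
	 */
	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;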

Be smarter about it by limiting the trigger to perform frequency updates
only after all accounting logic has been done. This ended up being at the
following points (a condensed sketch follows the list):

1. enqueue/dequeue_task_fair()
2. throttle/unthrottle_cfs_rq()
3. attach/detach_task_cfs_rq()
4. task_tick_fair()
5. __sched_group_set_shares()
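
Condensed from the diff below, the change in shape is: instead of issuing
a cpufreq request once per level of the hierarchy walk, a single request
is issued at the end of the operation, e.g. on the enqueue path:

	/* Before: once per update_load_avg() call during the walk */
	update_load_avg()
	  -> cfs_rq_util_change()
	    -> cpufreq_update_util(rq, flags)

	/* After: once, after all accounting is done */
	enqueue_task_fair()
	  ...
	  cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);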

This is still not 100% ideal due to other limitations that might be
a bit harder to handle. Namely, we can end up with a premature update
request in the following situations:

a. Simultaneous task enqueues on the same CPU where the 2nd task is bigger
and requires a higher freq. The trigger of cpufreq_update_util() by the
first task will lead to dropping the 2nd request until the tick, or until
another CPU in the same policy triggers a freq update.

b. CPUs sharing a policy can end up with the same race as in (a), but with
the simultaneous enqueues happening on different CPUs in the same policy.

The above, though, are limitations in the governor/hardware, and from the
scheduler's point of view at least that's the best we can do. The governor
might consider smarter logic to aggregate near-simultaneous requests and
honour the higher one.
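
Purely as an illustration of that idea (hypothetical, not implemented by
this patch, and the names below do not exist in schedutil today), such
aggregation could defer the hardware request briefly and apply the
maximum request seen in that window:

	/* Hypothetical: collect requests for a short window, honour the max */
	sg_policy->pending_freq = max(sg_policy->pending_freq, next_freq);
	if (!timer_pending(&sg_policy->aggregate_timer))
		mod_timer(&sg_policy->aggregate_timer,
			  jiffies + usecs_to_jiffies(AGGREGATE_WINDOW_US));
	/* the timer handler would then program sg_policy->pending_freq */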

Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx>
---
kernel/sched/fair.c | 55 ++++++++++++---------------------------------
1 file changed, 14 insertions(+), 41 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b83448be3f79..f99910fc6705 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3997,29 +3997,6 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
-{
- struct rq *rq = rq_of(cfs_rq);
-
- if (&rq->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util_cfs().
- */
- cpufreq_update_util(rq, flags);
- }
-}
-
#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
@@ -4648,8 +4625,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

- cfs_rq_util_change(cfs_rq, 0);
-
trace_pelt_cfs_tp(cfs_rq);
}

@@ -4678,8 +4653,6 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

- cfs_rq_util_change(cfs_rq, 0);
-
trace_pelt_cfs_tp(cfs_rq);
}

@@ -4726,11 +4699,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
- } else if (decayed) {
- cfs_rq_util_change(cfs_rq, 0);
-
- if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq);
+ } else if (decayed && (flags & UPDATE_TG)) {
+ update_tg_load_avg(cfs_rq);
}
}

@@ -5114,7 +5084,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)

static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cfs_rq_util_change(cfs_rq, 0);
}

static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -5807,6 +5776,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
sub_nr_running(rq, task_delta);

done:
+ cpufreq_update_util(rq, 0);
+
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5899,6 +5870,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

+ cpufreq_update_util(rq, 0);
+
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
@@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);

- /*
- * If in_iowait is set, the code below may not trigger any cpufreq
- * utilization updates, so do it here explicitly with the IOWAIT flag
- * passed.
- */
- if (p->in_iowait)
- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
-
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

+ cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
+
hrtick_update(rq);
}

@@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
+ cpufreq_update_util(rq, 0);
hrtick_update(rq);
}

@@ -8482,6 +8450,7 @@ done: __maybe_unused;

update_misfit_status(p, rq);
sched_fair_update_stop_tick(rq, p);
+ cpufreq_update_util(rq, 0);

return p;

@@ -12615,6 +12584,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+ cpufreq_update_util(rq, 0);

task_tick_core(rq, curr);
}
@@ -12739,6 +12709,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

detach_entity_cfs_rq(se);
+ cpufreq_update_util(task_rq(p), 0);
}

static void attach_task_cfs_rq(struct task_struct *p)
@@ -12746,6 +12717,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

attach_entity_cfs_rq(se);
+ cpufreq_update_util(task_rq(p), 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12991,6 +12963,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
update_cfs_group(se);
}
+ cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}

--
2.34.1