Re: [RFC PATCH v3 2/6] sched/uclamp: Track a new util_avg_bias signal

From: Dietmar Eggemann
Date: Sun May 26 2024 - 18:52:52 EST


On 07/05/2024 14:50, Hongyan Xia wrote:
> Add a util_avg_bias signal in sched_avg, which is obtained by:
>
> util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg
>
> The task utilization after considering uclamp is;
>
> util_avg_uclamp = util_avg + util_avg_bias
>
> We then sum up all biases on the same rq and use the total bias to bias
> the rq utilization. This is the core idea of uclamp sum aggregation. The
> rq utilization will be
>
> rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias
>
> Signed-off-by: Hongyan Xia <hongyan.xia2@xxxxxxx>
> ---
> include/linux/sched.h | 4 +++-
> kernel/sched/debug.c | 2 +-
> kernel/sched/fair.c | 16 ++++++++++++++++
> kernel/sched/pelt.c | 39 +++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 28 ++++++++++++++++++++++++++++
> 5 files changed, 87 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c75fd46506df..4ea4b8b30f54 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -476,8 +476,10 @@ struct sched_avg {
> u32 period_contrib;
> unsigned long load_avg;
> unsigned long runnable_avg;
> - unsigned long util_avg;
> + unsigned int util_avg;
> + int util_avg_bias;
> unsigned int util_est;
> + unsigned int util_est_uclamp;

Looks like you only introduce uclamped util_est functions in 3/6. It's
easy to grasp when data changes go together with new functions. Maybe
introduce a specific util_est patch before 3/6 "Use util biases for
utilization and frequency"?


> } ____cacheline_aligned;
>
> /*
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 8d5d98a5834d..c4dadb740e96 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -682,7 +682,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> cfs_rq->avg.load_avg);
> SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
> cfs_rq->avg.runnable_avg);
> - SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
> + SEQ_printf(m, " .%-30s: %u\n", "util_avg",
> cfs_rq->avg.util_avg);
> SEQ_printf(m, " .%-30s: %u\n", "util_est",
> cfs_rq->avg.util_est);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ef5bb576ac65..571c8de59508 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1083,6 +1083,7 @@ void post_init_entity_util_avg(struct task_struct *p)
> }
>
> sa->runnable_avg = sa->util_avg;
> + sa->util_avg_bias = 0;
> }
>
> #else /* !CONFIG_SMP */
> @@ -6801,6 +6802,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> /* At this point se is NULL and we are at root level*/
> add_nr_running(rq, 1);
> + util_bias_enqueue(&rq->cfs.avg, p);
> + /* XXX: We should skip the update above and only do it once here. */

By 'above' you're referring to the update in enqueue_entity() ->
update_load_avg(..., DO_ATTACH) -> attach_entity_load_avg() ->
cfs_rq_util_change() ?

I assume you won't have the problem of having to add a
cpufreq_update_util(rq, 0) call after the util_bias enqueue or dequeue
with "[PATCH v4] sched: Consolidate cpufreq updates"
https://lkml.kernel.org/r/20240516204802.846520-1-qyousef@xxxxxxxxxxx
anymore?

> + cpufreq_update_util(rq, 0);
>
> /*
> * Since new tasks are assigned an initial util_avg equal to
> @@ -6892,6 +6896,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> /* At this point se is NULL and we are at root level*/
> sub_nr_running(rq, 1);
> + util_bias_dequeue(&rq->cfs.avg, p);
> + /* XXX: We should skip the update above and only do it once here. */
> + cpufreq_update_util(rq, 0);
>
> /* balance early to pull high priority tasks */
> if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
> @@ -6900,6 +6907,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> dequeue_throttle:
> util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
> +
> +#ifdef CONFIG_UCLAMP_TASK
> + if (rq->cfs.h_nr_running == 0) {
> + WARN_ONCE(rq->cfs.avg.util_avg_bias,
> + "0 tasks on CFS of CPU %d, but util_avg_bias is %d\n",
> + rq->cpu, rq->cfs.avg.util_avg_bias);
> + WRITE_ONCE(rq->cfs.avg.util_avg_bias, 0);
> + }
> +#endif
> }
>
> #ifdef CONFIG_SMP
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index ef00382de595..56346988182f 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -266,6 +266,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
> WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
> }
>
> +#ifdef CONFIG_UCLAMP_TASK
> +/* avg must belong to the queue this se is on. */
> +static void update_util_bias(struct sched_avg *avg, struct task_struct *p)

You could pass a `struct rq *rq` here? I assume this code is already
there to include rt-tasks (via per-entity load-tracking in rt class)?

> +{
> + unsigned int util, uclamp_min, uclamp_max;
> + int old, new;
> +
> + /*
> + * We MUST update the rq summed total every time we update the
> + * util_avg_bias of a task. If we are on a rq but we are not given the
> + * rq, then the best thing is to just skip this update.
> + */
> + if (p->se.on_rq && !avg)
> + return;
> +
> + util = READ_ONCE(p->se.avg.util_avg);
> + uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> + uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> + if (uclamp_max == SCHED_CAPACITY_SCALE)
> + uclamp_max = UINT_MAX;

This is done to not cap a task util_avg > 1024 in case the task has
default uclamp_max?

> + old = READ_ONCE(p->se.avg.util_avg_bias);
> + new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
> +
> + WRITE_ONCE(p->se.avg.util_avg_bias, new);
> + if (!p->se.on_rq)
> + return;
> + WRITE_ONCE(avg->util_avg_bias, READ_ONCE(avg->util_avg_bias) + new - old);
> +}
> +#else /* !CONFIG_UCLAMP_TASK */
> +static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
> +{
> +}
> +#endif
> +
> /*
> * sched_entity:
> *
> @@ -296,6 +330,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
> {
> if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
> ___update_load_avg(&se->avg, se_weight(se));
> + if (entity_is_task(se))
> + update_util_bias(NULL, task_of(se));

IMHO,

update_util_bias(struct sched_avg *avg, struct sched_entity *se)

if (!entity_is_task(se))
return;

...

would be easier to read.

> trace_pelt_se_tp(se);
> return 1;
> }
> @@ -310,6 +346,9 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>
> ___update_load_avg(&se->avg, se_weight(se));
> cfs_se_util_change(&se->avg);
> + if (entity_is_task(se))
> + update_util_bias(&rq_of(cfs_rq)->cfs.avg,
> + task_of(se));
> trace_pelt_se_tp(se);
> return 1;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index cb3792c04eea..aec812e6c6ba 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3095,6 +3095,24 @@ static inline bool uclamp_is_used(void)
> {
> return static_branch_likely(&sched_uclamp_used);
> }
> +
> +static inline void util_bias_enqueue(struct sched_avg *avg,
> + struct task_struct *p)
> +{
> + int avg_val = READ_ONCE(avg->util_avg_bias);
> + int p_val = READ_ONCE(p->se.avg.util_avg_bias);
> +
> + WRITE_ONCE(avg->util_avg_bias, avg_val + p_val);
> +}
> +
> +static inline void util_bias_dequeue(struct sched_avg *avg,
> + struct task_struct *p)
> +{
> + int avg_val = READ_ONCE(avg->util_avg_bias);
> + int p_val = READ_ONCE(p->se.avg.util_avg_bias);
> +
> + WRITE_ONCE(avg->util_avg_bias, avg_val - p_val);
> +}

Maybe enqueue_util_bias() and dequeue_util_bias() since there is the
related update_util_bias()? I know that there is util_est_enqueue() but
util_est also has util_est_update().

[...]