Re: [RFC PATCH v3 2/6] sched/uclamp: Track a new util_avg_bias signal

From: Hongyan Xia
Date: Tue May 28 2024 - 07:09:53 EST


On 26/05/2024 23:52, Dietmar Eggemann wrote:
On 07/05/2024 14:50, Hongyan Xia wrote:
Add a util_avg_bias signal in sched_avg, which is obtained by:

util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg

The task utilization after considering uclamp is:

util_avg_uclamp = util_avg + util_avg_bias

We then sum up all biases on the same rq and use the total bias to bias
the rq utilization. This is the core idea of uclamp sum aggregation. The
rq utilization will be

rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias
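
For example (illustrative numbers only): a task with util_avg = 200,
uclamp_min = 512 and the default uclamp_max = 1024 gets
util_avg_bias = clamp(200, 512, 1024) - 200 = 312. If it is the only
task on the rq and rq_util_avg = 200, the biased rq utilization becomes
rq_util_avg_uclamp = 200 + 312 = 512.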

Signed-off-by: Hongyan Xia <hongyan.xia2@xxxxxxx>
---
include/linux/sched.h | 4 +++-
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 16 ++++++++++++++++
kernel/sched/pelt.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 28 ++++++++++++++++++++++++++++
5 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c75fd46506df..4ea4b8b30f54 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -476,8 +476,10 @@ struct sched_avg {
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_avg;
- unsigned long util_avg;
+ unsigned int util_avg;
+ int util_avg_bias;
unsigned int util_est;
+ unsigned int util_est_uclamp;

Looks like you only introduce the uclamped util_est functions in 3/6. It's
easier to grasp when data changes go together with the new functions that
use them. Maybe introduce a specific util_est patch before 3/6 "Use util
biases for utilization and frequency"?

Makes sense. I can do that in the next rev.


} ____cacheline_aligned;
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..c4dadb740e96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -682,7 +682,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
cfs_rq->avg.runnable_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ SEQ_printf(m, " .%-30s: %u\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est",
cfs_rq->avg.util_est);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef5bb576ac65..571c8de59508 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1083,6 +1083,7 @@ void post_init_entity_util_avg(struct task_struct *p)
}
sa->runnable_avg = sa->util_avg;
+ sa->util_avg_bias = 0;
}
#else /* !CONFIG_SMP */
@@ -6801,6 +6802,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
+ util_bias_enqueue(&rq->cfs.avg, p);
+ /* XXX: We should skip the update above and only do it once here. */

By 'above' you're referring to the update in enqueue_entity() ->
update_load_avg(..., DO_ATTACH) -> attach_entity_load_avg() ->
cfs_rq_util_change() ?

I assume that with "[PATCH v4] sched: Consolidate cpufreq updates"
https://lkml.kernel.org/r/20240516204802.846520-1-qyousef@xxxxxxxxxxx
you won't have to add a cpufreq_update_util(rq, 0) call after the
util_bias enqueue or dequeue anymore?

Yes. That series would solve this problem nicely.
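
For reference, a rough sketch of the two frequency-update points being
discussed here, assuming the call chain quoted above (function names as in
current mainline):

	/*
	 * enqueue_task_fair()
	 *   enqueue_entity()
	 *     update_load_avg(..., DO_ATTACH)
	 *       attach_entity_load_avg()
	 *         cfs_rq_util_change()
	 *           cpufreq_update_util()       <- first update
	 *   ...
	 *   util_bias_enqueue(&rq->cfs.avg, p)
	 *   cpufreq_update_util(rq, 0)          <- second update (the XXX above)
	 */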

+ cpufreq_update_util(rq, 0);
/*
* Since new tasks are assigned an initial util_avg equal to
@@ -6892,6 +6896,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, 1);
+ util_bias_dequeue(&rq->cfs.avg, p);
+ /* XXX: We should skip the update above and only do it once here. */
+ cpufreq_update_util(rq, 0);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -6900,6 +6907,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+ if (rq->cfs.h_nr_running == 0) {
+ WARN_ONCE(rq->cfs.avg.util_avg_bias,
+ "0 tasks on CFS of CPU %d, but util_avg_bias is %d\n",
+ rq->cpu, rq->cfs.avg.util_avg_bias);
+ WRITE_ONCE(rq->cfs.avg.util_avg_bias, 0);
+ }
+#endif
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index ef00382de595..56346988182f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
+#ifdef CONFIG_UCLAMP_TASK
+/* avg must belong to the queue this se is on. */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)

You could pass a `struct rq *rq` here? I assume this code is already
there to include rt-tasks (via per-entity load-tracking in rt class)?

The intention is definitely to make things easier for RT tasks later. We can handle CFS like:

update_util_bias(&rq->cfs.avg, p);

and then do RT like this:

update_util_bias(&rq->avg_rt, p);

If the signature is update_util_bias(rq, p), then inside this function we need to check the class of p to know whether to fetch &rq->cfs.avg or &rq->avg_rt, and I think the former idea is simpler.

Unless I misunderstood something?
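
Just to illustrate (a sketch only, not part of the patch; rt_task() here
stands in for however the class would actually be detected), an rq-based
signature would have to do something like:

static void update_util_bias(struct rq *rq, struct task_struct *p)
{
	struct sched_avg *avg;

	/* Dispatch on the class of p to find the right per-rq sched_avg. */
	if (rt_task(p))
		avg = &rq->avg_rt;
	else
		avg = &rq->cfs.avg;

	/* ... same body as the sched_avg-based version ... */
}

whereas passing the sched_avg pointer directly lets the caller make that
choice.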

+{
+ unsigned int util, uclamp_min, uclamp_max;
+ int old, new;
+
+ /*
+ * We MUST update the rq summed total every time we update the
+ * util_avg_bias of a task. If we are on a rq but we are not given the
+ * rq, then the best thing is to just skip this update.
+ */
+ if (p->se.on_rq && !avg)
+ return;
+
+ util = READ_ONCE(p->se.avg.util_avg);
+ uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ if (uclamp_max == SCHED_CAPACITY_SCALE)
+ uclamp_max = UINT_MAX;

This is done so that a task util_avg > 1024 is not capped in case the task
has the default uclamp_max?

Yes. In some corner cases you can end up with util_avg > 1024, and if the task has the default uclamp_max of 1024, that certainly doesn't mean we want a negative bias here.

I can add some comments.
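
Something along these lines, perhaps (just a sketch of the wording):

	/*
	 * util_avg can exceed SCHED_CAPACITY_SCALE in corner cases. A task
	 * with the default uclamp_max (SCHED_CAPACITY_SCALE) is not clamped
	 * at all, so lift the upper bound to UINT_MAX to avoid creating a
	 * spurious negative bias for such tasks.
	 */
	if (uclamp_max == SCHED_CAPACITY_SCALE)
		uclamp_max = UINT_MAX;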

+ old = READ_ONCE(p->se.avg.util_avg_bias);
+ new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
+
+ WRITE_ONCE(p->se.avg.util_avg_bias, new);
+ if (!p->se.on_rq)
+ return;
+ WRITE_ONCE(avg->util_avg_bias, READ_ONCE(avg->util_avg_bias) + new - old);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
+{
+}
+#endif
+
/*
* sched_entity:
*
@@ -296,6 +330,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se));
+ if (entity_is_task(se))
+ update_util_bias(NULL, task_of(se));

IMHO,

update_util_bias(struct sched_avg *avg, struct sched_entity *se)

if (!entity_is_task(se))
return;

...

would be easier to read.

Yeah, that would work, and might be a good way to prepare for group clamping if it ever becomes a thing.
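
Presumably something like the following (sketch only, with the same body as
the current version):

static void update_util_bias(struct sched_avg *avg, struct sched_entity *se)
{
	struct task_struct *p;

	if (!entity_is_task(se))
		return;

	p = task_of(se);
	/* ... rest identical to the task_struct-based version ... */
}

and the callers in __update_load_avg_blocked_se() / __update_load_avg_se()
would then no longer need the entity_is_task() check.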

trace_pelt_se_tp(se);
return 1;
}
@@ -310,6 +346,9 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
+ if (entity_is_task(se))
+ update_util_bias(&rq_of(cfs_rq)->cfs.avg,
+ task_of(se));
trace_pelt_se_tp(se);
return 1;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb3792c04eea..aec812e6c6ba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3095,6 +3095,24 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
+
+static inline void util_bias_enqueue(struct sched_avg *avg,
+ struct task_struct *p)
+{
+ int avg_val = READ_ONCE(avg->util_avg_bias);
+ int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+ WRITE_ONCE(avg->util_avg_bias, avg_val + p_val);
+}
+
+static inline void util_bias_dequeue(struct sched_avg *avg,
+ struct task_struct *p)
+{
+ int avg_val = READ_ONCE(avg->util_avg_bias);
+ int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+ WRITE_ONCE(avg->util_avg_bias, avg_val - p_val);
+}

Maybe enqueue_util_bias() and dequeue_util_bias() since there is the
related update_util_bias()? I know that there is util_est_enqueue() but
util_est also has util_est_update().

I don't have much of a strong preference here, so either works for me.

[...]