On 07/05/2024 14:50, Hongyan Xia wrote:
Add a util_avg_bias signal in sched_avg, which is obtained by:
util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg
The task utilization after considering uclamp is:
util_avg_uclamp = util_avg + util_avg_bias
We then sum up all biases on the same rq and use the total bias to bias
the rq utilization. This is the core idea of uclamp sum aggregation. The
rq utilization will be
rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias
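As a worked example (numbers purely illustrative): a task with
util_avg = 200 and uclamp_min = 512 carries a bias of
clamp(200, 512, 1024) - 200 = 312, while a task with util_avg = 800
and uclamp_max = 256 carries a bias of clamp(800, 0, 256) - 800 = -544.
If both are on a rq with rq_util_avg = 1000, the total bias is
312 - 544 = -232 and rq_util_avg_uclamp = 1000 - 232 = 768.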
Signed-off-by: Hongyan Xia <hongyan.xia2@xxxxxxx>
---
include/linux/sched.h | 4 +++-
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 16 ++++++++++++++++
kernel/sched/pelt.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 28 ++++++++++++++++++++++++++++
5 files changed, 87 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c75fd46506df..4ea4b8b30f54 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -476,8 +476,10 @@ struct sched_avg {
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_avg;
- unsigned long util_avg;
+ unsigned int util_avg;
+ int util_avg_bias;
unsigned int util_est;
+ unsigned int util_est_uclamp;
Looks like you only introduce the uclamped util_est functions in 3/6.
It's easier to grasp data changes when they go together with the new
functions that use them. Maybe introduce a specific util_est patch
before 3/6 "Use util biases for utilization and frequency"?
} ____cacheline_aligned;
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..c4dadb740e96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -682,7 +682,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
cfs_rq->avg.runnable_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ SEQ_printf(m, " .%-30s: %u\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est",
cfs_rq->avg.util_est);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef5bb576ac65..571c8de59508 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1083,6 +1083,7 @@ void post_init_entity_util_avg(struct task_struct *p)
}
sa->runnable_avg = sa->util_avg;
+ sa->util_avg_bias = 0;
}
#else /* !CONFIG_SMP */
@@ -6801,6 +6802,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
+ util_bias_enqueue(&rq->cfs.avg, p);
+ /* XXX: We should skip the update above and only do it once here. */
By 'above' you're referring to the update in enqueue_entity() ->
update_load_avg(..., DO_ATTACH) -> attach_entity_load_avg() ->
cfs_rq_util_change()?

I assume that with "[PATCH v4] sched: Consolidate cpufreq updates"
https://lkml.kernel.org/r/20240516204802.846520-1-qyousef@xxxxxxxxxxx
you would no longer have to add a cpufreq_update_util(rq, 0) call after
the util_bias enqueue or dequeue?
+ cpufreq_update_util(rq, 0);
/*
* Since new tasks are assigned an initial util_avg equal to
@@ -6892,6 +6896,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, 1);
+ util_bias_dequeue(&rq->cfs.avg, p);
+ /* XXX: We should skip the update above and only do it once here. */
+ cpufreq_update_util(rq, 0);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -6900,6 +6907,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+ if (rq->cfs.h_nr_running == 0) {
+ WARN_ONCE(rq->cfs.avg.util_avg_bias,
+ "0 tasks on CFS of CPU %d, but util_avg_bias is %d\n",
+ rq->cpu, rq->cfs.avg.util_avg_bias);
+ WRITE_ONCE(rq->cfs.avg.util_avg_bias, 0);
+ }
+#endif
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index ef00382de595..56346988182f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
+#ifdef CONFIG_UCLAMP_TASK
+/* avg must belong to the queue this se is on. */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
Could you pass a `struct rq *rq` here instead? I assume the sched_avg
parameter is shaped like this to later include rt tasks as well (via
per-entity load tracking in the rt class)?
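Something like the below is what I mean (only a sketch; which
class-specific sched_avg to pick for !cfs is hypothetical and depends
on how per-entity rt tracking ends up looking):

	static void update_util_bias(struct rq *rq, struct task_struct *p)
	{
		struct sched_avg *avg = NULL;

		if (rq)
			avg = &rq->cfs.avg;	/* cfs only for now */
		...
	}

with the call sites passing rq_of(cfs_rq), or NULL for the blocked-se
case.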
+{
+ unsigned int util, uclamp_min, uclamp_max;
+ int old, new;
+
+ /*
+ * We MUST update the rq summed total every time we update the
+ * util_avg_bias of a task. If we are on a rq but we are not given the
+ * rq, then the best thing is to just skip this update.
+ */
+ if (p->se.on_rq && !avg)
+ return;
+
+ util = READ_ONCE(p->se.avg.util_avg);
+ uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ if (uclamp_max == SCHED_CAPACITY_SCALE)
+ uclamp_max = UINT_MAX;
Is this done so that a task with util_avg > 1024 is not capped in case
it has the default uclamp_max?
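If so, a quick check of the arithmetic (numbers made up): a task with
util_avg = 1100 and the default range [0, 1024] would otherwise get a
bias of clamp(1100, 0, 1024) - 1100 = -76, whereas with uclamp_max
raised to UINT_MAX the bias stays 0 and such a task is left unbiased.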
+ old = READ_ONCE(p->se.avg.util_avg_bias);
+ new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
+
+ WRITE_ONCE(p->se.avg.util_avg_bias, new);
+ if (!p->se.on_rq)
+ return;
+ WRITE_ONCE(avg->util_avg_bias, READ_ONCE(avg->util_avg_bias) + new - old);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
+{
+}
+#endif
+
/*
* sched_entity:
*
@@ -296,6 +330,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se));
+ if (entity_is_task(se))
+ update_util_bias(NULL, task_of(se));
IMHO,

	static void update_util_bias(struct sched_avg *avg, struct sched_entity *se)
	{
		if (!entity_is_task(se))
			return;
		...
	}

would be easier to read.
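The call sites would then look like e.g. (only a sketch;
update_util_bias() does p = task_of(se) internally after the
entity_is_task() check):

	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
		___update_load_avg(&se->avg, se_weight(se));
		update_util_bias(NULL, se);
		...

without the entity_is_task() checks at each caller.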
trace_pelt_se_tp(se);
return 1;
}
@@ -310,6 +346,9 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
+ if (entity_is_task(se))
+ update_util_bias(&rq_of(cfs_rq)->cfs.avg,
+ task_of(se));
trace_pelt_se_tp(se);
return 1;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb3792c04eea..aec812e6c6ba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3095,6 +3095,24 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
+
+static inline void util_bias_enqueue(struct sched_avg *avg,
+ struct task_struct *p)
+{
+ int avg_val = READ_ONCE(avg->util_avg_bias);
+ int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+ WRITE_ONCE(avg->util_avg_bias, avg_val + p_val);
+}
+
+static inline void util_bias_dequeue(struct sched_avg *avg,
+ struct task_struct *p)
+{
+ int avg_val = READ_ONCE(avg->util_avg_bias);
+ int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+ WRITE_ONCE(avg->util_avg_bias, avg_val - p_val);
+}
Maybe enqueue_util_bias() and dequeue_util_bias(), to match the related
update_util_bias()? I know there is util_est_enqueue(), but util_est
also has util_est_update().
[...]