[RFC PATCH v2 2/7] sched/uclamp: Track uclamped util_avg in sched_avg

From: Hongyan Xia
Date: Thu Feb 01 2024 - 08:12:48 EST


Track a uclamped version of util_avg in sched_avg, which clamps util_avg
within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
updated. At the root CFS rq level, just like util_est,
rq->cfs.avg.util_avg_uclamp must always be the sum of all
util_avg_uclamp of CFS tasks on this rq. So, each time the
util_avg_uclamp of a task gets updated, we also track the delta and
update the root cfs_rq. When a CFS task gets enqueued or dequeued, the
rq->cfs.avg.util_avg_uclamp also needs to add or subtract the
util_avg_uclamp of this task.

Signed-off-by: Hongyan Xia <hongyan.xia2@xxxxxxx>
---
include/linux/sched.h | 3 +++
kernel/sched/fair.c | 21 +++++++++++++++++++
kernel/sched/pelt.c | 48 +++++++++++++++++++++++++++++++++++--------
kernel/sched/pelt.h | 5 +++--
kernel/sched/sched.h | 27 ++++++++++++++++++++++++
5 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03bfe9ab2951..f28eeff169ff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -470,6 +470,9 @@ struct sched_avg {
unsigned long runnable_avg;
unsigned long util_avg;
unsigned int util_est;
+#ifdef CONFIG_UCLAMP_TASK
+ unsigned int util_avg_uclamp;
+#endif
} ____cacheline_aligned;

/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d5cc87db4845..4f535c96463b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1089,6 +1089,9 @@ void post_init_entity_util_avg(struct task_struct *p)
}

sa->runnable_avg = sa->util_avg;
+#ifdef CONFIG_UCLAMP_TASK
+ sa->util_avg_uclamp = sa->util_avg;
+#endif
}

#else /* !CONFIG_SMP */
@@ -6763,6 +6766,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
+#ifdef CONFIG_UCLAMP_TASK
+	util_uclamp_enqueue(&rq->cfs.avg, p);
+	update_util_uclamp(&rq->cfs.avg, p);
+	/* TODO: Better skip the frequency update in the for loop above. */
+	cpufreq_update_util(rq, 0);
+#endif

/*
* Since new tasks are assigned an initial util_avg equal to
@@ -6854,6 +6863,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, 1);
+#ifdef CONFIG_UCLAMP_TASK
+ util_uclamp_dequeue(&rq->cfs.avg, p);
+#endif

/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -6862,6 +6874,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+ if (rq->cfs.h_nr_running == 0) {
+ WARN_ONCE(rq->cfs.avg.util_avg_uclamp,
+ "0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n",
+ rq->cpu, rq->cfs.avg.util_avg_uclamp);
+ WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0);
+ }
+#endif
}

#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 63b6cf898220..eca45a863f9f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,39 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}

+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Re-clamp p's util_avg into p->se.avg.util_avg_uclamp and apply the change
+ * to avg->util_avg_uclamp. @avg must belong to the queue this se is on; the
+ * call is a no-op when p is not on a rq or when @avg is NULL.
+ */
+void update_util_uclamp(struct sched_avg *avg, struct task_struct *p)
+{
+	unsigned int clamped, lo, hi, sum;
+	int diff;
+
+	if (!p->se.on_rq || !avg)
+		return;
+
+	clamped = READ_ONCE(p->se.avg.util_avg);
+	lo = uclamp_eff_value(p, UCLAMP_MIN);
+	hi = uclamp_eff_value(p, UCLAMP_MAX);
+	clamped = clamp(clamped, lo, hi);
+
+	diff = clamped - READ_ONCE(p->se.avg.util_avg_uclamp);
+	if (!diff)
+		return;
+
+	WRITE_ONCE(p->se.avg.util_avg_uclamp, clamped);
+	sum = READ_ONCE(avg->util_avg_uclamp) + diff;
+	WRITE_ONCE(avg->util_avg_uclamp, sum);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+void update_util_uclamp(struct sched_avg *avg, struct task_struct *p)
+{
+}
+#endif
+
/*
* sched_entity:
*
@@ -292,29 +325,28 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
* load_avg = \Sum se->avg.load_avg
*/

-int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
+void __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se));
+ if (entity_is_task(se))
+ update_util_uclamp(NULL, task_of(se));
trace_pelt_se_tp(se);
- return 1;
}
-
- return 0;
}

-int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
+void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
cfs_rq->curr == se)) {

___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
+ if (entity_is_task(se))
+ update_util_uclamp(&rq_of(cfs_rq)->cfs.avg,
+ task_of(se));
trace_pelt_se_tp(se);
- return 1;
}
-
- return 0;
}

int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465fbc..6862f79e0fcd 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
#ifdef CONFIG_SMP
#include "sched-pelt.h"

-int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
-int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+void update_util_uclamp(struct sched_avg *avg, struct task_struct *p);
+void __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..35036246824b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3085,6 +3085,33 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
+
+/* Add p's util_avg_uclamp to the running sum kept in avg->util_avg_uclamp. */
+static inline void util_uclamp_enqueue(struct sched_avg *avg, struct task_struct *p)
+{
+	unsigned int sum = READ_ONCE(avg->util_avg_uclamp);
+
+	sum += READ_ONCE(p->se.avg.util_avg_uclamp);
+	WRITE_ONCE(avg->util_avg_uclamp, sum);
+}
+
+/*
+ * Remove p's util_avg_uclamp from the sum kept in avg->util_avg_uclamp. The
+ * result saturates at 0, with a one-time warning if p's share exceeds the sum.
+ */
+static inline void util_uclamp_dequeue(struct sched_avg *avg, struct task_struct *p)
+{
+	unsigned int total = READ_ONCE(avg->util_avg_uclamp);
+	unsigned int share = READ_ONCE(p->se.avg.util_avg_uclamp);
+	unsigned int remaining = 0;
+
+	WARN_ONCE(total < share,
+		  "avg_val underflow. avg_val %u is even less than p_val %u before subtraction\n",
+		  total, share);
+	if (total > share)
+		remaining = total - share;
+	WRITE_ONCE(avg->util_avg_uclamp, remaining);
+}
#else /* CONFIG_UCLAMP_TASK */
static inline unsigned long uclamp_eff_value(struct task_struct *p,
enum uclamp_id clamp_id)
--
2.34.1