[PATCH 1/2] sched/fair: pelt: use u32 for util_avg

From: Patrick Bellasi
Date: Mon Jun 04 2018 - 12:06:22 EST


The util_avg signal is used to track the utilization (i.e. RUNNING time)
of SEs and RQs. Its values are computed according to the PELT algorithm
and thus, for an SE, they are bounded by an (internal) representation
which uses 20 bits. For RQs instead they are technically unbounded,
since when tasks are migrated across RQs we add their utilization to
the destination RQ.
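
For reference, the unbounded accumulation comes from the attach path;
a simplified sketch (not the exact code) of what attach_entity_load_avg()
does on the destination RQ:

  /*
   * Each SE attached to (or migrated towards) a RQ adds its own
   * contribution to the destination cfs_rq, hence the RQ-level
   * util_avg has no hard upper bound by construction.
   */
  cfs_rq->avg.util_avg += se->avg.util_avg;
  cfs_rq->avg.util_sum += se->avg.util_sum;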

We currently use an unsigned long to track util_avg, which maps to
64-bit storage on 64-bit systems. However, 32 bits should be good
enough for all practical usages. Indeed, even for RQs, the remaining
12 bits allow tracking up to 4K fully utilized tasks concurrently
RUNNABLE on a single CPU.
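
Spelled out, the headroom is: 32 bits of storage minus the 20 bits
required by a single SE leaves 2^(32 - 20) = 2^12 = 4096 maximally
utilized SE contributions before the RQ-level value could overflow.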

Since the sched_avg data structure already completely fills a 64B cache
line, let's get back 4B by using a u32 to track util_avg. The recovered
space could then be conveniently used to fit other load tracking
related metrics into the same cache line.
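
For completeness, a rough accounting of where the 4B come from, assuming
the struct sched_avg layout this patch is based on (64-bit build, exact
padding being up to the compiler):

  /*
   * u64              last_update_time      8B
   * u64              load_sum              8B
   * u64              runnable_load_sum     8B
   * u32              util_sum              4B
   * u32              period_contrib        4B
   * unsigned long    load_avg              8B
   * unsigned long    runnable_load_avg     8B
   * u32              util_avg              4B  (was 8B as unsigned long)
   * struct util_est  util_est              8B
   *
   * Total: 60B of data in a 64B ____cacheline_aligned structure, i.e.
   * a 4B hole which another u32 metric could reuse without spilling
   * into a second cache line.
   */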

Signed-off-by: Patrick Bellasi <patrick.bellasi@xxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Todd Kjos <tkjos@xxxxxxxxxx>
Cc: Joel Fernandes <joelaf@xxxxxxxxxx>
Cc: Steve Muckle <smuckle@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-pm@xxxxxxxxxxxxxxx
---
include/linux/sched.h | 2 +-
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 17 ++++++++++-------
3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 28ff3ca9f752..9d8732dab264 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,7 +402,7 @@ struct sched_avg {
 	u32				period_contrib;
 	unsigned long			load_avg;
 	unsigned long			runnable_load_avg;
-	unsigned long			util_avg;
+	u32				util_avg;
 	struct util_est			util_est;
 } ____cacheline_aligned;

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 15b10e210a6b..a985789eeb9c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,7 +541,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
 			cfs_rq->avg.runnable_load_avg);
-	SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+	SEQ_printf(m, " .%-30s: %u\n", "util_avg",
 			cfs_rq->avg.util_avg);
 	SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
 			cfs_rq->avg.util_est.enqueued);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e497c05aab7f..f74441be3f44 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -750,19 +750,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
 void post_init_entity_util_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	struct sched_avg *sa = &se->avg;
 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;

 	if (cap > 0) {
-		if (cfs_rq->avg.util_avg != 0) {
-			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
-			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+		struct sched_avg *sa = &se->avg;
+		u64 util_avg = READ_ONCE(sa->util_avg);

-			if (sa->util_avg > cap)
-				sa->util_avg = cap;
+		if (cfs_rq->avg.util_avg != 0) {
+			util_avg = cfs_rq->avg.util_avg * se->load.weight;
+			util_avg /= (cfs_rq->avg.load_avg + 1);
+			if (util_avg > cap)
+				util_avg = cap;
 		} else {
-			sa->util_avg = cap;
+			util_avg = cap;
 		}
+
+		WRITE_ONCE(sa->util_avg, util_avg);
 	}

 	if (entity_is_task(se)) {
--
2.15.1