[PATCH 4/4] sched/fair: Implement flat hierarchical structure for util_avg

From: Yuyang Du
Date: Mon Apr 11 2016 - 02:18:51 EST


The utilization part of the sched averages follows task group's
hierarchy, but the utils of all group entities and all cfs_rqs except
the top cfs_rq are needless to update and are never used. And more
importantly, the top cfs_rq's util will not reflect migration of a
group task.

So this patch propose a flat task hierarchy for util - all tasks
affiliated to a rq are flat, ignoring their task group levels, and
the rq's util is the sum of all the tasks.

Overhead wise, the rg's util update may be more costly, but we don't
update any group entity's util, so the net overhead should not be
formidable. In addition, rq's util updates can be very frequent,
but the updates in a period will be skipped, this is mostly effective
in update_blocked_averages().

Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
kernel/sched/debug.c | 11 ++---
kernel/sched/fair.c | 123 +++++++++++++++++++++++++++++++--------------------
kernel/sched/sched.h | 5 ++-
3 files changed, 85 insertions(+), 54 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fbc3bd..ba08d1c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -395,7 +395,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
- P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -469,6 +468,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
print_task(m, rq, p);
}
rcu_read_unlock();
+
+ SEQ_printf(m, "\nutilization: \n");
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&rq->removed_util_avg));
}

void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -517,12 +522,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
- cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 49e9f1a..67c2730 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -686,45 +686,27 @@ void init_entity_runnable_average(struct sched_entity *se)

/*
* With new tasks being created, their initial util_avgs are extrapolated
- * based on the cfs_rq's current util_avg:
+ * based on the rq's current util_avg. To make the util_avg converge, we
+ * cap the util_avg of successive tasks to only 1/2 of the left utilization
+ * budget:
*
- * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- *
- * However, in many cases, the above util_avg does not give a desired
- * value. Moreover, the sum of the util_avgs may be divergent, such
- * as when the series is a harmonic series.
- *
- * To solve this problem, we also cap the util_avg of successive tasks to
- * only 1/2 of the left utilization budget:
- *
- * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ * util_avg = (1024 - rq->avg.util_avg) / 2^n
*
* where n denotes the nth task.
*
* For example, a simplest series from the beginning would be like:
*
- * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
- * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- *
- * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- * if util_avg > util_avg_cap.
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
*/
void post_init_entity_util_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq_of(se));
struct sched_avg *sa = &se->avg;
- long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+ long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - rq->avg.util_avg) / 2;

if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
+ sa->util_avg = cap;
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
}
@@ -2697,6 +2679,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
u64 delta;
u32 contrib, periods;
unsigned long scale_freq, scale_cpu;
+ int update_util = 0;

/*
* now rolls down to a period boundary
@@ -2720,15 +2703,19 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;

+ /*
+ * We update util_sum together with load_sum only if it is a task
+ */
+ if (!cfs_rq && entity_is_task(container_of(sa, struct sched_entity, avg)))
+ update_util = 1;
+
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);

/*
* Now we know we're crossing period boundaries, and the *_sum accrues by
* two steps.
- */
-
- /*
+ *
* Step 1: decay old *_sum
*/
sa->load_sum = __decay_sum(sa->load_sum, periods);
@@ -2736,7 +2723,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
cfs_rq->runnable_load_sum =
__decay_sum(cfs_rq->runnable_load_sum, periods);
}
- sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+ if (update_util)
+ sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);

/*
* Step 2: accumulate and meanwhile decay new *_sum by periods since
@@ -2749,7 +2737,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
- if (running)
+ if (update_util && running)
sa->util_sum += contrib * scale_cpu;

/*
@@ -2760,6 +2748,41 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
cfs_rq->runnable_load_avg =
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ if (update_util)
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+
+ if (!cfs_rq)
+ return 1;
+
+ /*
+ * Update rq's util_sum and util_avg
+ */
+ sa = &rq_of(cfs_rq)->avg;
+
+ /*
+ * Of course, we have new delta and periods.
+ */
+ delta = now - sa->last_update_time;
+ periods = delta >> 20;
+
+ /*
+ * The rq's util should be updated much more frequently
+ * than any cfs_rq. If we are here, child cfs_rq should have
+ * already updated load_avg, so we return 1 anyway.
+ */
+ if (!periods)
+ return 1;
+ sa->last_update_time = now;
+
+ /* Step 1 */
+ sa->load_sum = __decay_sum(sa->load_sum, periods);
+
+ /* Step 2 */
+ contrib = __accumulate_sum(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (cfs_rq->curr != NULL) /* new running */
+ sa->util_sum += contrib * scale_cpu;
+
sa->util_avg = sa->util_sum / LOAD_AVG_MAX;

return 1;
@@ -2843,6 +2866,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed = 0;
+ struct rq *rq = rq_of(cfs_rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2851,10 +2875,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
removed = 1;
}

- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sa->util_avg = max_t(long, sa->util_avg - r, 0);
- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ if (atomic_long_read(&rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&rq->removed_util_avg, 0);
+ rq->avg.util_avg = max_t(long, rq->avg.util_avg - r, 0);
+ rq->avg.util_sum = max_t(s32, rq->avg.util_sum - r * LOAD_AVG_MAX, 0);
}

decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2907,12 +2931,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
* See cpu_util().
*/
cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ min(rq->avg.util_avg, max), max);
}
}

static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct rq *rq = rq_of(cfs_rq);
+
if (!sched_feat(ATTACH_AGE_LOAD))
goto skip_aging;

@@ -2934,20 +2960,22 @@ skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
- cfs_rq->avg.util_avg += se->avg.util_avg;
- cfs_rq->avg.util_sum += se->avg.util_sum;
+ rq->avg.util_avg += se->avg.util_avg;
+ rq->avg.util_sum += se->avg.util_sum;
}

static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct rq *rq = rq_of(cfs_rq);
+
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);

cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+ rq->avg.util_avg = max_t(long, rq->avg.util_avg - se->avg.util_avg, 0);
+ rq->avg.util_sum = max_t(s32, rq->avg.util_sum - se->avg.util_sum, 0);
}

/* Add the load generated by se into cfs_rq's load average */
@@ -3017,6 +3045,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq);
u64 last_update_time;

/*
@@ -3030,7 +3059,7 @@ void remove_entity_load_avg(struct sched_entity *se)

__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ atomic_long_add(se->avg.util_avg, &rq->removed_util_avg);
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -5145,7 +5174,7 @@ done:
* compare the utilization with the capacity of the CPU that is available for
* CFS task (ie cpu_capacity).
*
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * rq->avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
@@ -5154,9 +5183,9 @@ done:
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * Nevertheless, rq->avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * rq->avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
@@ -5167,7 +5196,7 @@ done:
*/
static int cpu_util(int cpu)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long util = cpu_rq(cpu)->avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);

return (util >= capacity) ? capacity : util;
@@ -8334,7 +8363,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#endif
#ifdef CONFIG_SMP
atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}

@@ -8399,7 +8427,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
- post_init_entity_util_avg(se);
}

return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7cbad7..c64f8b8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -390,7 +390,7 @@ struct cfs_rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
#endif
- atomic_long_t removed_load_avg, removed_util_avg;
+ atomic_long_t removed_load_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
@@ -652,6 +652,9 @@ struct rq {

/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
+
+ struct sched_avg avg;
+ atomic_long_t removed_util_avg;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
--
2.1.4