[PATCH v1 10/10] sched/fair: Implement flat hierarchical structure for util_avg

From: Yuyang Du
Date: Wed Aug 10 2016 - 14:37:18 EST


The util_avg is currently updated following the task group hierarchy,
but the util_avgs of all group entities and of all cfs_rqs other than
the top cfs_rq are never used. More importantly, the top cfs_rq's
util_avg does not promptly reflect the migration of a group task,
because the task's util_avg is accounted to its direct cfs_rq and only
slowly trickles down to the top cfs_rq.

This patch therefore makes the util_avg accounting flat: all cfs tasks
attached to a rq are accounted directly at the rq level, bypassing
their task group hierarchy, so the rq's util is simply the sum of the
utils of all its cfs tasks.
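
For illustration, a toy user-space model of the flat accounting (not
the kernel code; all names below are made up): the rq-level util is the
plain sum over tasks, no matter how deeply the tasks are nested in
groups.

/* Toy model of flat util accounting (illustrative only). */
#include <stdio.h>

struct toy_task {
	unsigned long util_avg;	/* contribution of one task */
	int depth;		/* group nesting depth; irrelevant for flat accounting */
};

/* Flat accounting: the rq util is the plain sum of all task contributions. */
static unsigned long toy_rq_util(const struct toy_task *tasks, int nr)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < nr; i++)
		sum += tasks[i].util_avg;	/* depth plays no role */
	return sum;
}

int main(void)
{
	/* Two tasks in nested groups, one in the root group. */
	struct toy_task tasks[] = {
		{ .util_avg = 200, .depth = 2 },
		{ .util_avg = 100, .depth = 3 },
		{ .util_avg = 300, .depth = 0 },
	};

	/*
	 * With flat accounting, a task migrating in or out changes the rq
	 * util in one step; with per-group accounting the change would have
	 * to trickle through every intermediate cfs_rq first.
	 */
	printf("rq util_avg = %lu\n", toy_rq_util(tasks, 3));	/* 600 */
	return 0;
}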

Regarding overhead: the rq's util updates may become more costly, since
they can be very frequent, but we no longer update any group entity's
util, so the net overhead should roughly even out.
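
The prompt reflection of migrations relies on the removed-util handoff
that the patch moves to the rq (rq->removed_util_avg), mirroring the
existing removed_load_avg pattern: a departing task parks its util in
an atomic, and the next rq util update folds it out in one step. A
minimal user-space sketch of that handoff, with illustrative names and
C11 atomics standing in for the kernel's atomic_long_t helpers:

/* Sketch of the removed-util handoff (simplified; not kernel code). */
#include <stdatomic.h>
#include <stdio.h>

struct toy_rq {
	unsigned long util_avg;		/* rq-level utilization */
	atomic_long removed_util_avg;	/* util of tasks that migrated away */
};

/* Dequeue side: a migrating task parks its util for the old rq. */
static void toy_remove_task_util(struct toy_rq *rq, unsigned long task_util)
{
	atomic_fetch_add(&rq->removed_util_avg, (long)task_util);
}

/* Update side: the old rq folds the parked util out of its average. */
static void toy_update_rq_util(struct toy_rq *rq)
{
	long r = atomic_exchange(&rq->removed_util_avg, 0);

	/* Subtract without underflowing, in the spirit of sub_positive(). */
	rq->util_avg -= (r > (long)rq->util_avg) ? rq->util_avg : (unsigned long)r;
}

int main(void)
{
	struct toy_rq rq = { .util_avg = 600 };

	atomic_init(&rq.removed_util_avg, 0);
	toy_remove_task_util(&rq, 200);	/* task with util 200 migrates away */
	toy_update_rq_util(&rq);	/* next update drops it at once */
	printf("rq util_avg = %lu\n", rq.util_avg);	/* 400 */
	return 0;
}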

Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
kernel/sched/core.c | 1 +
kernel/sched/debug.c | 13 +++--
kernel/sched/fair.c | 140 +++++++++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 5 +-
4 files changed, 101 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 91fe97f9..d215d04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7475,6 +7475,7 @@ void __init sched_init(void)
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
#endif
+ atomic_long_set(&rq->removed_util_avg, 0);
#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a999..14dc121 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -395,7 +395,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
- P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -462,6 +461,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
print_task(m, rq, p);
}
rcu_read_unlock();
+
+#ifdef CONFIG_SMP
+ SEQ_printf(m, "\nutilization: \n");
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&rq->removed_util_avg));
+#endif
}

void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -510,12 +517,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
- cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d4640c..b514129 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -703,47 +703,30 @@ static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *

/*
* With new tasks being created, their initial util_avgs are extrapolated
- * based on the cfs_rq's current util_avg:
+ * based on the rq's current util_avg. To make the util_avg converge, we
+ * cap the util_avg of successive tasks to only 1/2 of the left utilization
+ * budget:
*
- * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- *
- * However, in many cases, the above util_avg does not give a desired
- * value. Moreover, the sum of the util_avgs may be divergent, such
- * as when the series is a harmonic series.
- *
- * To solve this problem, we also cap the util_avg of successive tasks to
- * only 1/2 of the left utilization budget:
- *
- * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ * util_avg = (1024 - rq->avg.util_avg) / 2^n
*
* where n denotes the nth task.
*
* For example, a simplest series from the beginning would be like:
*
- * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
- * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- *
- * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- * if util_avg > util_avg_cap.
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
*/
void post_init_entity_sched_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq);
struct sched_avg *sa = &se->avg;
- long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ long cap = (long)(SCHED_CAPACITY_SCALE - rq->avg.util_avg) / 2;
u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;

if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
+ sa->util_avg = cap;
sa->util_sum = sa->util_avg * SCHED_AVG_MAX;
}

@@ -2676,8 +2659,45 @@ __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)

#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

-static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
- struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+static __always_inline void
+__update_rq_util_avg(struct rq *rq, unsigned long scale_freq, unsigned long scale_cpu)
+{
+ u32 contrib;
+ struct sched_avg *sa = &rq->avg;
+ u64 delta, periods, now = rq_clock_task(rq);
+
+ /*
+ * The raw delta is in ns; >> 10 gives ~us, and a period is 1024us (~1ms).
+ */
+ delta = (now - sa->last_update_time) >> 10;
+ if (!delta)
+ return;
+ sa->last_update_time = now;
+
+ delta += sa->period_contrib;
+ periods = delta >> 10;
+
+ /* Step 1: decay */
+ if (periods)
+ sa->util_sum = __decay_sum(sa->util_sum, periods);
+
+ /* Step 2: accumulate */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (rq->cfs.curr != NULL) /* a cfs task is running on this rq */
+ sa->util_sum += contrib * scale_cpu;
+
+ /* Step 3: update avg */
+ if (periods)
+ sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+}
+
+static __always_inline u32
+accumulate_sum(u64 delta, struct sched_avg *sa, struct cfs_rq *cfs_rq, int cpu,
+ unsigned long weight, int running, int update_util)
{
u32 contrib;
u64 periods;
@@ -2696,11 +2716,11 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = __decay_sum(sa->load_sum, periods);
- if (cfs_rq) {
+ if (cfs_rq)
cfs_rq->runnable_load_sum =
__decay_sum(cfs_rq->runnable_load_sum, periods);
- }
- sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+ else if (update_util)
+ sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
}

/*
@@ -2720,9 +2740,12 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
- if (running)
+ if (running && update_util)
sa->util_sum += contrib * scale_cpu;

+ if (cfs_rq)
+ __update_rq_util_avg(rq_of(cfs_rq), scale_freq, scale_cpu);
+
return periods;
}

@@ -2759,6 +2782,7 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta;
+ int update_util = 0;

delta = now - sa->last_update_time;
/*
@@ -2780,24 +2804,30 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time = now;

/*
+ * We update util_sum together with load_sum iff the entity is a task.
+ */
+ if (!cfs_rq && entity_is_task(container_of(sa, struct sched_entity, avg)))
+ update_util = 1;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+ if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running, update_util))
return 0;

/*
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
- if (cfs_rq) {
+ if (cfs_rq)
cfs_rq->runnable_load_avg =
div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+ else if (update_util)
+ sa->util_avg = sa->util_sum / SCHED_AVG_MAX;

return 1;
}
@@ -2897,8 +2927,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ cpufreq_update_util(rq_clock(rq), min(rq->avg.util_avg, max), max);
}
}

@@ -2941,18 +2970,21 @@ update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
+ struct rq *rq = rq_of(cfs_rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * SCHED_AVG_MAX);
removed_load = 1;
}

- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * SCHED_AVG_MAX);
+ if (atomic_long_read(&rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&rq->removed_util_avg, 0);
+
+ sub_positive(&rq->avg.util_avg, r);
+ sub_positive(&rq->avg.util_sum, r * SCHED_AVG_MAX);
removed_util = 1;
}

@@ -2999,6 +3031,8 @@ static inline void update_sched_avg(struct sched_entity *se, int update_tg)
*/
static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct rq *rq = rq_of(cfs_rq);
+
if (!sched_feat(ATTACH_AGE_LOAD))
goto skip_aging;

@@ -3022,8 +3056,8 @@ skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
- cfs_rq->avg.util_avg += se->avg.util_avg;
- cfs_rq->avg.util_sum += se->avg.util_sum;
+ rq->avg.util_avg += se->avg.util_avg;
+ rq->avg.util_sum += se->avg.util_sum;

cfs_rq_util_change(cfs_rq);
}
@@ -3038,14 +3072,16 @@ skip_aging:
*/
static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct rq *rq = rq_of(cfs_rq);
+
__update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);

sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ sub_positive(&rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&rq->avg.util_sum, se->avg.util_sum);

cfs_rq_util_change(cfs_rq);
}
@@ -3117,6 +3153,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
static void remove_entity_sched_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq);
u64 last_update_time;

/*
@@ -3134,7 +3171,7 @@ static void remove_entity_sched_avg(struct sched_entity *se)
__update_sched_avg(last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ atomic_long_add(se->avg.util_avg, &rq->removed_util_avg);
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -5332,7 +5369,7 @@ done:
* compare the utilization with the capacity of the CPU that is available for
* CFS task (ie cpu_capacity).
*
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * rq->avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
@@ -5341,9 +5378,9 @@ done:
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * Nevertheless, rq->avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * rq->avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
@@ -5354,7 +5391,7 @@ done:
*/
static int cpu_util(int cpu)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long util = cpu_rq(cpu)->avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);

return (util >= capacity) ? capacity : util;
@@ -8533,7 +8570,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#endif
#ifdef CONFIG_SMP
atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4723d4a..f6785c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -398,7 +398,7 @@ struct cfs_rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
#endif
- atomic_long_t removed_load_avg, removed_util_avg;
+ atomic_long_t removed_load_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
@@ -662,6 +662,9 @@ struct rq {

/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
+
+ struct sched_avg avg;
+ atomic_long_t removed_util_avg;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
--
1.7.9.5