Re: [PATCH 4/4] sched/fair: Implement flat hierarchical structure for util_avg

From: Vincent Guittot
Date: Mon Apr 11 2016 - 08:29:56 EST


Hi Yuyang,

On 11 April 2016 at 00:36, Yuyang Du <yuyang.du@xxxxxxxxx> wrote:
> The utilization part of the sched averages follows task group's
> hierarchy, but the utils of all group entities and all cfs_rqs except
> the top cfs_rq are needless to update and are never used. And more
> importantly, the top cfs_rq's util will not reflect migration of a
> group task.
>
> So this patch propose a flat task hierarchy for util - all tasks
> affiliated to a rq are flat, ignoring their task group levels, and
> the rq's util is the sum of all the tasks.

In fact, it's only about cfs tasks, not all tasks of the rq, so your
explanation is misleading, and so is the creation of a new sched_avg
struct at the rq level.

>
> Overhead wise, the rg's util update may be more costly, but we don't

s/rg/rq/

> update any group entity's util, so the net overhead should not be
> formidable. In addition, rq's util updates can be very frequent,
> but the updates in a period will be skipped, this is mostly effective
> in update_blocked_averages().

I'm not sure I catch your point here, but the rq->avg.util_avg of an
idle CPU will never be updated before an (idle) load balance, because
we only call update_cfs_rq_load_avg(), which no longer updates
cfs_rq->avg.util_avg nor rq->avg.util_avg.
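
To illustrate (a rough sketch only, the helper name is made up and it
mirrors the period handling of your rq update in __update_load_avg()):
with util living in rq->avg, something would have to age it explicitly
for an idle CPU, e.g. from update_blocked_averages():

/*
 * Hypothetical helper, not in the patch: decay the rq's util with no
 * running contribution, so an idle CPU's utilization keeps decreasing.
 */
static void decay_rq_util(struct rq *rq, u64 now)
{
	struct sched_avg *sa = &rq->avg;
	u64 periods = (now - sa->last_update_time) >> 20;

	if (!periods)
		return;

	sa->last_update_time = now;

	/* Step 1 only: decay the old sum, there is nothing to accumulate */
	sa->util_sum = __decay_sum((u64)sa->util_sum, periods);
	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
}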

>
> Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
> ---
> kernel/sched/debug.c | 11 ++---
> kernel/sched/fair.c | 123 +++++++++++++++++++++++++++++++--------------------
> kernel/sched/sched.h | 5 ++-
> 3 files changed, 85 insertions(+), 54 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 4fbc3bd..ba08d1c 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -395,7 +395,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
> P(se->load.weight);
> #ifdef CONFIG_SMP
> P(se->avg.load_avg);
> - P(se->avg.util_avg);
> #endif
> #undef PN
> #undef P
> @@ -469,6 +468,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
> print_task(m, rq, p);
> }
> rcu_read_unlock();
> +
> + SEQ_printf(m, "\nutilization: \n");
> + SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
> + rq->avg.util_avg);
> + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
> + atomic_long_read(&rq->removed_util_avg));
> }
>
> void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> @@ -517,12 +522,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> cfs_rq->avg.load_avg);
> SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
> cfs_rq->runnable_load_avg);
> - SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
> - cfs_rq->avg.util_avg);
> SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
> atomic_long_read(&cfs_rq->removed_load_avg));
> - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
> - atomic_long_read(&cfs_rq->removed_util_avg));
> #ifdef CONFIG_FAIR_GROUP_SCHED
> SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
> cfs_rq->tg_load_avg_contrib);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 49e9f1a..67c2730 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -686,45 +686,27 @@ void init_entity_runnable_average(struct sched_entity *se)
>
> /*
> * With new tasks being created, their initial util_avgs are extrapolated
> - * based on the cfs_rq's current util_avg:
> + * based on the rq's current util_avg. To make the util_avg converge, we
> + * cap the util_avg of successive tasks to only 1/2 of the left utilization
> + * budget:
> *
> - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
> - *
> - * However, in many cases, the above util_avg does not give a desired
> - * value. Moreover, the sum of the util_avgs may be divergent, such
> - * as when the series is a harmonic series.
> - *
> - * To solve this problem, we also cap the util_avg of successive tasks to
> - * only 1/2 of the left utilization budget:
> - *
> - * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
> + * util_avg = (1024 - rq->avg.util_avg) / 2^n
> *
> * where n denotes the nth task.
> *
> * For example, a simplest series from the beginning would be like:
> *
> - * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
> - * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
> - *
> - * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
> - * if util_avg > util_avg_cap.
> + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
> + * rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
> */
> void post_init_entity_util_avg(struct sched_entity *se)
> {
> - struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + struct rq *rq = rq_of(cfs_rq_of(se));
> struct sched_avg *sa = &se->avg;
> - long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
> + long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - rq->avg.util_avg) / 2;
>
> if (cap > 0) {
> - if (cfs_rq->avg.util_avg != 0) {
> - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
> - sa->util_avg /= (cfs_rq->avg.load_avg + 1);
> -
> - if (sa->util_avg > cap)
> - sa->util_avg = cap;
> - } else {
> - sa->util_avg = cap;
> - }
> + sa->util_avg = cap;
> sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
> }
> }
> @@ -2697,6 +2679,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> u64 delta;
> u32 contrib, periods;
> unsigned long scale_freq, scale_cpu;
> + int update_util = 0;
>
> /*
> * now rolls down to a period boundary
> @@ -2720,15 +2703,19 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> return 0;
> sa->last_update_time = now;
>
> + /*
> + * We update util_sum together with load_sum only if it is a task
> + */
> + if (!cfs_rq && entity_is_task(container_of(sa, struct sched_entity, avg)))
> + update_util = 1;
> +
> scale_freq = arch_scale_freq_capacity(NULL, cpu);
> scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
>
> /*
> * Now we know we're crossing period boundaries, and the *_sum accrues by
> * two steps.
> - */
> -
> - /*
> + *
> * Step 1: decay old *_sum
> */
> sa->load_sum = __decay_sum(sa->load_sum, periods);
> @@ -2736,7 +2723,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> cfs_rq->runnable_load_sum =
> __decay_sum(cfs_rq->runnable_load_sum, periods);
> }
> - sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
> + if (update_util)
> + sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
>
> /*
> * Step 2: accumulate and meanwhile decay new *_sum by periods since
> @@ -2749,7 +2737,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> if (cfs_rq)
> cfs_rq->runnable_load_sum += weight * contrib;
> }
> - if (running)
> + if (update_util && running)
> sa->util_sum += contrib * scale_cpu;
>
> /*
> @@ -2760,6 +2748,41 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> cfs_rq->runnable_load_avg =
> div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> }
> + if (update_util)
> + sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> +
> + if (!cfs_rq)
> + return 1;
> +
> + /*
> + * Update rq's util_sum and util_avg
> + */

Do we really have to update the utilization of the rq each time we
update a sched_entity? IMHO, we don't need to do this update so often,
even less so when the se is a task group. And even when it's a task, we
don't especially need to update it right now: we can wait for the
update of rq->cfs like previously, or we explicitly update it when we
have to do a direct sub/add on the util_avg value.
See also my comment on removed_util_avg below.
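
Something like this is what I have in mind (hypothetical helpers, names
made up, assuming the util stays on the root cfs_rq as suggested
below): only touch the util on a direct add/sub and leave the decay to
the existing update of the root cfs_rq:

static void add_task_util(struct cfs_rq *root, struct sched_entity *se)
{
	root->avg.util_avg += se->avg.util_avg;
	root->avg.util_sum += se->avg.util_sum;
}

static void sub_task_util(struct cfs_rq *root, struct sched_entity *se)
{
	root->avg.util_avg = max_t(long, root->avg.util_avg - se->avg.util_avg, 0);
	root->avg.util_sum = max_t(s32, root->avg.util_sum - se->avg.util_sum, 0);
}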

> + sa = &rq_of(cfs_rq)->avg;


Why not use the sched_avg of rq->cfs to track the utilization of the
root cfs_rq, instead of adding a new sched_avg into the rq? Then you
call update_cfs_rq_load_avg() on &rq->cfs when you want to update/sync
the utilization of rq->cfs, and with one call you update both the
load_avg and the util_avg of the root cfs_rq instead of duplicating the
sequence in __update_load_avg().
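
i.e. keep the readers on the root cfs_rq (sketch of the alternative,
this is simply the current code before your patch):

static int cpu_util(int cpu)
{
	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
	unsigned long capacity = capacity_orig_of(cpu);

	return (util >= capacity) ? capacity : util;
}

and a single update_cfs_rq_load_avg() call on &rq->cfs is then enough
to sync both the load and the util of the root.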

> +
> + /*
> + * Of course, we have new delta and periods.
> + */
> + delta = now - sa->last_update_time;
> + periods = delta >> 20;
> +
> + /*
> + * The rq's util should be updated much more frequently
> + * than any cfs_rq. If we are here, child cfs_rq should have
> + * already updated load_avg, so we return 1 anyway.
> + */
> + if (!periods)
> + return 1;
> + sa->last_update_time = now;
> +
> + /* Step 1 */
> + sa->load_sum = __decay_sum(sa->load_sum, periods);
> +
> + /* Step 2 */
> + contrib = __accumulate_sum(periods);
> + contrib = cap_scale(contrib, scale_freq);
> + if (cfs_rq->curr != NULL) /* new running */
> + sa->util_sum += contrib * scale_cpu;
> +
> sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
>
> return 1;
> @@ -2843,6 +2866,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> {
> struct sched_avg *sa = &cfs_rq->avg;
> int decayed, removed = 0;
> + struct rq *rq = rq_of(cfs_rq);
>
> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> @@ -2851,10 +2875,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> removed = 1;
> }
>
> - if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> - sa->util_avg = max_t(long, sa->util_avg - r, 0);
> - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> + if (atomic_long_read(&rq->removed_util_avg)) {
> + long r = atomic_long_xchg(&rq->removed_util_avg, 0);
> + rq->avg.util_avg = max_t(long, rq->avg.util_avg - r, 0);
> + rq->avg.util_sum = max_t(s32, rq->avg.util_sum - r * LOAD_AVG_MAX, 0);

I see one potential issue here: rq->avg may (surely) already have been
updated and decayed during the update of a sched_entity before we
subtract the removed_util_avg, so the contribution we remove no longer
matches what remains in rq->avg.util_avg.
> }
>
> decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> @@ -2907,12 +2931,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
> * See cpu_util().
> */
> cpufreq_update_util(rq_clock(rq),
> - min(cfs_rq->avg.util_avg, max), max);
> + min(rq->avg.util_avg, max), max);
> }
> }
>
> static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> + struct rq *rq = rq_of(cfs_rq);
> +
> if (!sched_feat(ATTACH_AGE_LOAD))
> goto skip_aging;
>
> @@ -2934,20 +2960,22 @@ skip_aging:
> se->avg.last_update_time = cfs_rq->avg.last_update_time;
> cfs_rq->avg.load_avg += se->avg.load_avg;
> cfs_rq->avg.load_sum += se->avg.load_sum;
> - cfs_rq->avg.util_avg += se->avg.util_avg;
> - cfs_rq->avg.util_sum += se->avg.util_sum;
> + rq->avg.util_avg += se->avg.util_avg;
> + rq->avg.util_sum += se->avg.util_sum;
> }
>
> static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> + struct rq *rq = rq_of(cfs_rq);
> +
> __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> &se->avg, se->on_rq * scale_load_down(se->load.weight),
> cfs_rq->curr == se, NULL);
>
> cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
> + rq->avg.util_avg = max_t(long, rq->avg.util_avg - se->avg.util_avg, 0);
> + rq->avg.util_sum = max_t(s32, rq->avg.util_sum - se->avg.util_sum, 0);
> }
>
> /* Add the load generated by se into cfs_rq's load average */
> @@ -3017,6 +3045,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
> void remove_entity_load_avg(struct sched_entity *se)
> {
> struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + struct rq *rq = rq_of(cfs_rq);
> u64 last_update_time;
>
> /*
> @@ -3030,7 +3059,7 @@ void remove_entity_load_avg(struct sched_entity *se)
>
> __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
> atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
> - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
> + atomic_long_add(se->avg.util_avg, &rq->removed_util_avg);
> }
>
> static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
> @@ -5145,7 +5174,7 @@ done:
> * compare the utilization with the capacity of the CPU that is available for
> * CFS task (ie cpu_capacity).
> *
> - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
> + * rq->avg.util_avg is the sum of running time of runnable tasks plus the
> * recent utilization of currently non-runnable tasks on a CPU. It represents
> * the amount of utilization of a CPU in the range [0..capacity_orig] where
> * capacity_orig is the cpu_capacity available at the highest frequency
> @@ -5154,9 +5183,9 @@ done:
> * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
> * the running time on this CPU scaled by capacity_curr.
> *
> - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
> + * Nevertheless, rq->avg.util_avg can be higher than capacity_curr or even
> * higher than capacity_orig because of unfortunate rounding in
> - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
> + * rq->avg.util_avg or just after migrating tasks and new task wakeups until
> * the average stabilizes with the new running time. We need to check that the
> * utilization stays within the range of [0..capacity_orig] and cap it if
> * necessary. Without utilization capping, a group could be seen as overloaded
> @@ -5167,7 +5196,7 @@ done:
> */
> static int cpu_util(int cpu)
> {
> - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
> + unsigned long util = cpu_rq(cpu)->avg.util_avg;
> unsigned long capacity = capacity_orig_of(cpu);
>
> return (util >= capacity) ? capacity : util;
> @@ -8334,7 +8363,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> #endif
> #ifdef CONFIG_SMP
> atomic_long_set(&cfs_rq->removed_load_avg, 0);
> - atomic_long_set(&cfs_rq->removed_util_avg, 0);
> #endif
> }
>
> @@ -8399,7 +8427,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
> init_cfs_rq(cfs_rq);
> init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
> init_entity_runnable_average(se);
> - post_init_entity_util_avg(se);
> }
>
> return 1;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a7cbad7..c64f8b8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -390,7 +390,7 @@ struct cfs_rq {
> #ifdef CONFIG_FAIR_GROUP_SCHED
> unsigned long tg_load_avg_contrib;
> #endif
> - atomic_long_t removed_load_avg, removed_util_avg;
> + atomic_long_t removed_load_avg;
> #ifndef CONFIG_64BIT
> u64 load_last_update_time_copy;
> #endif
> @@ -652,6 +652,9 @@ struct rq {
>
> /* This is used to determine avg_idle's max value */
> u64 max_idle_balance_cost;
> +
> + struct sched_avg avg;
> + atomic_long_t removed_util_avg;
> #endif
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> --
> 2.1.4
>