Re: [RFC][PATCH 03/14] sched/fair: Remove se->load.weight from se->avg.load_sum
From: Vincent Guittot
Date: Wed May 17 2017 - 03:05:20 EST
Hi Peter,
On 12 May 2017 at 18:44, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> Remove the load from the load_sum for sched_entities, basically
> turning load_sum into runnable_sum. This prepares for better
> reweighting of group entities.
>
> Since we now have different rules for computing load_avg, split
> ___update_load_avg() into two parts, ___update_load_sum() and
> ___update_load_avg().
>
> So for se:
>
> ___update_load_sum(.weight = 1)
> ___upate_load_avg(.weight = se->load.weight)
>
> and for cfs_rq:
>
> ___update_load_sum(.weight = cfs_rq->load.weight)
> ___upate_load_avg(.weight = 1)
>
> Since the primary consumable is load_avg, most things will not be
> affected. Only those few sites that initialize/modify load_sum need
> attention.
I wonder if there is a problem with this new way to compute se's
load_avg and cfs_rq's load_avg when a task changes is nice prio before
migrating to another CPU.
se load_avg is now: runnable x current weight
but cfs_rq load_avg keeps the history of the previous weight of the se
When we detach se, we will remove an up to date se's load_avg from
cfs_rq which doesn't have the up to date load_avg in its own load_avg.
So if se's prio decreases just before migrating, some load_avg stays
in prev cfs_rq and if se's prio increases, we will remove too much
load_avg and possibly make the cfs_rq load_avg null whereas other
tasks are running.
Thought ?
I'm able to reproduce the problem with a simple rt-app use case (after
adding a new feature in rt-app)
Vincent
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> kernel/sched/fair.c | 91 ++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 64 insertions(+), 27 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -744,7 +744,7 @@ void init_entity_runnable_average(struct
> */
> if (entity_is_task(se))
> sa->load_avg = scale_load_down(se->load.weight);
> - sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
> + sa->load_sum = LOAD_AVG_MAX;
> /*
> * At this point, util_avg won't be used in select_task_rq_fair anyway
> */
> @@ -1967,7 +1967,7 @@ static u64 numa_get_avg_runtime(struct t
> delta = runtime - p->last_sum_exec_runtime;
> *period = now - p->last_task_numa_placement;
> } else {
> - delta = p->se.avg.load_sum / p->se.load.weight;
> + delta = p->se.avg.load_sum;
> *period = LOAD_AVG_MAX;
> }
>
> @@ -2872,8 +2872,8 @@ accumulate_sum(u64 delta, int cpu, struc
> * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> */
> static __always_inline int
> -___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> - unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> + unsigned long weight, int running, struct cfs_rq *cfs_rq)
> {
> u64 delta;
>
> @@ -2907,39 +2907,80 @@ ___update_load_avg(u64 now, int cpu, str
> if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
> return 0;
>
> + return 1;
> +}
> +
> +static __always_inline void
> +___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
> +{
> + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> +
> /*
> * Step 2: update *_avg.
> */
> - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
> + sa->load_avg = div_u64(weight * sa->load_sum, divider);
> if (cfs_rq) {
> cfs_rq->runnable_load_avg =
> - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
> + div_u64(cfs_rq->runnable_load_sum, divider);
> }
> - sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
> + sa->util_avg = sa->util_sum / divider;
> +}
>
> - return 1;
> +/*
> + * XXX we want to get rid of this helper and use the full load resolution.
> + */
> +static inline long se_weight(struct sched_entity *se)
> +{
> + return scale_load_down(se->load.weight);
> }
>
> +/*
> + * sched_entity:
> + *
> + * load_sum := runnable_sum
> + * load_avg = se_weight(se) * runnable_avg
> + *
> + * cfq_rs:
> + *
> + * load_sum = \Sum se_weight(se) * se->avg.load_sum
> + * load_avg = \Sum se->avg.load_avg
> + */
> +
> static int
> __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> {
> - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
> + if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
> + ___update_load_avg(&se->avg, se_weight(se), NULL);
> + return 1;
> + }
> +
> + return 0;
> }
>
> static int
> __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - return ___update_load_avg(now, cpu, &se->avg,
> - se->on_rq * scale_load_down(se->load.weight),
> - cfs_rq->curr == se, NULL);
> + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
> + cfs_rq->curr == se, NULL)) {
> +
> + ___update_load_avg(&se->avg, se_weight(se), NULL);
> + return 1;
> + }
> +
> + return 0;
> }
>
> static int
> __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> {
> - return ___update_load_avg(now, cpu, &cfs_rq->avg,
> - scale_load_down(cfs_rq->load.weight),
> - cfs_rq->curr != NULL, cfs_rq);
> + if (___update_load_sum(now, cpu, &cfs_rq->avg,
> + scale_load_down(cfs_rq->load.weight),
> + cfs_rq->curr != NULL, cfs_rq)) {
> + ___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
> + return 1;
> + }
> +
> + return 0;
> }
>
> /*
> @@ -3110,7 +3151,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
>
> /* Set new sched_entity's load */
> se->avg.load_avg = load;
> - se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
> + se->avg.load_sum = LOAD_AVG_MAX;
>
> /* Update parent cfs_rq load */
> add_positive(&cfs_rq->avg.load_avg, delta);
> @@ -3340,7 +3381,7 @@ static void attach_entity_load_avg(struc
> {
> se->avg.last_update_time = cfs_rq->avg.last_update_time;
> cfs_rq->avg.load_avg += se->avg.load_avg;
> - cfs_rq->avg.load_sum += se->avg.load_sum;
> + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
> cfs_rq->avg.util_avg += se->avg.util_avg;
> cfs_rq->avg.util_sum += se->avg.util_sum;
> set_tg_cfs_propagate(cfs_rq);
> @@ -3360,7 +3401,7 @@ static void detach_entity_load_avg(struc
> {
>
> sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
> - sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
> + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
> sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
> sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
> set_tg_cfs_propagate(cfs_rq);
> @@ -3372,12 +3413,10 @@ static void detach_entity_load_avg(struc
> static inline void
> enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - struct sched_avg *sa = &se->avg;
> -
> - cfs_rq->runnable_load_avg += sa->load_avg;
> - cfs_rq->runnable_load_sum += sa->load_sum;
> + cfs_rq->runnable_load_avg += se->avg.load_avg;
> + cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
>
> - if (!sa->last_update_time) {
> + if (!se->avg.last_update_time) {
> attach_entity_load_avg(cfs_rq, se);
> update_tg_load_avg(cfs_rq, 0);
> }
> @@ -3387,10 +3426,8 @@ enqueue_entity_load_avg(struct cfs_rq *c
> static inline void
> dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - cfs_rq->runnable_load_avg =
> - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
> - cfs_rq->runnable_load_sum =
> - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
> + sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
> + sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
> }
>
> #ifndef CONFIG_64BIT
>
>