Re: [PATCH 4/7 v3] sched: propagate load during synchronous attach/detach

From: Wanpeng Li
Date: Sun Sep 18 2016 - 23:19:45 EST


2016-09-12 15:47 GMT+08:00 Vincent Guittot <vincent.guittot@xxxxxxxxxx>:
> When a task moves from/to a cfs_rq, we set a flag which is then used to
> propagate the change at parent level (sched_entity and cfs_rq) during
> next update. If the cfs_rq is throttled, the flag will stay pending until
> the cfs_rq is unthrottled.
>
> For propagating the utilization, we copy the utilization of the child cfs_rq to
> the sched_entity.
>
> For propagating the load, we have to take into account the load of the
> whole task group in order to evaluate the load of the sched_entity.
> Similarly to what was done before the rewrite of PELT, we add a correction
> factor in case the task group's load is less than its shares, so it will
> contribute the same load as a task of equal weight.
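
Just to check that I follow the scaling this last paragraph describes, here is
how I read it as a stand-alone userspace sketch (scale_group_se_load(),
gcfs_load, tg_load and shares are illustrative names only, and I am ignoring
the +1 / tg_load_avg_contrib adjustments the patch applies to keep tg_load
current and non-zero):

#include <stdio.h>

/*
 * Rescale a child cfs_rq's load into its task group's shares, with the
 * correction applied when the whole group's load is below its shares so
 * the group entity contributes the same load as a task of equal weight.
 */
static long scale_group_se_load(long gcfs_load, long tg_load, long shares)
{
	long load;

	if (!gcfs_load)
		return 0;

	/* scale gcfs_rq's load into tg's shares */
	load = gcfs_load * shares;
	load /= tg_load;

	/* correction when the whole group consumes < 1 cpu */
	if (tg_load < shares) {
		load *= tg_load;
		load /= shares;
	}

	return load;
}

int main(void)
{
	/* group busy on two cpus, half of its load here: se gets half the shares */
	printf("%ld\n", scale_group_se_load(512, 1024, 1024));	/* 512 */

	/* group running one light task: se contributes the task's own load */
	printf("%ld\n", scale_group_se_load(256, 256, 1024));	/* 256 */

	return 0;
}
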
>
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> ---
> kernel/sched/fair.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/sched.h | 1 +
> 2 files changed, 170 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0aa1d7d..e4015f6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3017,6 +3017,132 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
> }
> }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +/* Take into account change of utilization of a child task group */
> +static inline void
> +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
> +
> + /* Nothing to update */
> + if (!delta)
> + return;
> +
> + /* Set new sched_entity's utilizaton */

s/utilizaton/utilization

> + se->avg.util_avg = gcfs_rq->avg.util_avg;
> + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
> +
> + /* Update parent cfs_rq utilization */
> + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg + delta, 0);
> + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
> +}
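
If I read this right: when the child cfs_rq's util_avg has moved from, say,
200 to 350 since the last sync, delta is 150, the group se's util_avg is set
to 350 and the parent cfs_rq gains 150 (a large negative delta is clamped at
0), with util_sum resynced as util_avg * LOAD_AVG_MAX at both levels.
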
> +
> +/* Take into account change of load of a child task group */
> +static inline void
> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> + long delta, load = gcfs_rq->avg.load_avg;
> +
> + /* If the load of group cfs_rq is null, the load of the
> + * sched_entity will also be null so we can skip the formula
> + */
> + if (load) {
> + long tg_load;
> +
> + /* Get tg's load and ensure tg_load > 0 */
> + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
> +
> + /* Ensure tg_load >= load and updated with current load*/
> + tg_load -= gcfs_rq->tg_load_avg_contrib;
> + tg_load += load;
> +
> + /* scale gcfs_rq's load into tg's shares*/
> + load *= scale_load_down(gcfs_rq->tg->shares);
> + load /= tg_load;
> +
> + /*
> + * we need to compute a correction term in the case that the
> + * task group is consuming <1 cpu so that we would contribute
> + * the same load as a task of equal weight.
> + */
> + if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {
> + load *= tg_load;
> + load /= scale_load_down(gcfs_rq->tg->shares);
> + }
> + }
> +
> + delta = load - se->avg.load_avg;
> +
> + /* Nothing to update */
> + if (!delta)
> + return;
> +
> + /* Set new sched_entity's load */
> + se->avg.load_avg = load;
> + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
> +
> + /* Update parent cfs_rq load */
> + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg + delta, 0);
> + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
> +}
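
Same pattern on the load side: if the group se previously carried
load_avg = 700 and the rescaled child load is now 200, delta = -500 is applied
to the parent, the max_t() keeps the parent's load_avg at 0 if it was below
500, and the sums are rebuilt from the avgs.
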
> +
> +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
> +{
> + /* set cfs_rq's flag */
> + cfs_rq->propagate_avg = 1;
> +}
> +
> +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
> +{
> + /* Get my cfs_rq */
> + struct cfs_rq *cfs_rq = group_cfs_rq(se);
> +
> + /* Nothing to propagate */
> + if (!cfs_rq->propagate_avg)
> + return 0;
> +
> + /* Clear my cfs_rq's flag */
> + cfs_rq->propagate_avg = 0;
> +
> + return 1;
> +}
> +
> +/* Update task and its cfs_rq load average */
> +static inline int propagate_entity_load_avg(struct sched_entity *se)
> +{
> + struct cfs_rq *cfs_rq;
> +
> + if (entity_is_task(se))
> + return 0;
> +
> + if (!test_and_clear_tg_cfs_propagate(se))
> + return 0;
> +
> + /* Get parent cfs_rq */
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Propagate to parent */
> + set_tg_cfs_propagate(cfs_rq);
> +
> + /* Update utilization */
> + update_tg_cfs_util(cfs_rq, se);
> +
> + /* Update load */
> + update_tg_cfs_load(cfs_rq, se);
> +
> + return 1;
> +}
> +#else
> +static inline int propagate_entity_load_avg(struct sched_entity *se)
> +{
> + return 0;
> +}
> +
> +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
> +#endif
> +
> /*
> * Unsigned subtract and clamp on underflow.
> *
> @@ -3093,6 +3219,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg,
> u64 now = cfs_rq_clock_task(cfs_rq);
> struct rq *rq = rq_of(cfs_rq);
> int cpu = cpu_of(rq);
> + int decayed;
>
> /*
> * Track task load average for carrying it to new CPU after migrated, and
> @@ -3103,7 +3230,11 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg,
> se->on_rq * scale_load_down(se->load.weight),
> cfs_rq->curr == se, NULL);
>
> - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
> + decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
> +
> + decayed |= propagate_entity_load_avg(se);
> +
> + if (decayed && update_tg)
> update_tg_load_avg(cfs_rq, 0);
> }
>
> @@ -3122,6 +3253,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> cfs_rq->avg.load_sum += se->avg.load_sum;
> cfs_rq->avg.util_avg += se->avg.util_avg;
> cfs_rq->avg.util_sum += se->avg.util_sum;
> + set_tg_cfs_propagate(cfs_rq);
>
> cfs_rq_util_change(cfs_rq);
> }
> @@ -3141,6 +3273,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
> sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
> sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
> + set_tg_cfs_propagate(cfs_rq);
>
> cfs_rq_util_change(cfs_rq);
> }
> @@ -8499,6 +8632,22 @@ static void detach_task_cfs_rq(struct task_struct *p)
> update_load_avg(se, 0, 0);
> detach_entity_load_avg(cfs_rq, se);
> update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + /*
> + * Propagate the detach across the tg tree to make it visible to the
> + * root
> + */
> + se = se->parent;
> + for_each_sched_entity(se) {
> + cfs_rq = cfs_rq_of(se);
> +
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> +
> + update_load_avg(se, 1, 0);
> + }
> +#endif
> }
>
> static void attach_entity_cfs_rq(struct sched_entity *se)
> @@ -8517,6 +8666,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
> update_load_avg(se, 0, !sched_feat(ATTACH_AGE_LOAD));
> attach_entity_load_avg(cfs_rq, se);
> update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + /*
> + * Propagate the attach across the tg tree to make it visible to the
> + * root
> + */
> + se = se->parent;
> + for_each_sched_entity(se) {
> + cfs_rq = cfs_rq_of(se);
> +
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> +
> + update_load_avg(se, 1, 0);
> + }
> +#endif
> }
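
One small remark: this walk up to the root is now duplicated between
detach_task_cfs_rq() and attach_entity_cfs_rq(). Something like the helper
below could keep the two paths in sync; propagate_entity_cfs_rq() is only a
placeholder name, not something from this series:

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Make a pending attach/detach visible up to the root: update every
 * ancestor above se, stopping at a throttled cfs_rq as the patch does.
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq;

	for (se = se->parent; se; se = se->parent) {
		cfs_rq = cfs_rq_of(se);

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, 1, 0);
	}
}
#else
static inline void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif

and then both #ifdef blocks above reduce to a single
propagate_entity_cfs_rq(se) call.
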
>
> static void attach_task_cfs_rq(struct task_struct *p)
> @@ -8578,6 +8743,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
> #endif
> #ifdef CONFIG_SMP
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + cfs_rq->propagate_avg = 0;
> +#endif
> atomic_long_set(&cfs_rq->removed_load_avg, 0);
> atomic_long_set(&cfs_rq->removed_util_avg, 0);
> #endif
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 483616a..0517a9e 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -397,6 +397,7 @@ struct cfs_rq {
> unsigned long runnable_load_avg;
> #ifdef CONFIG_FAIR_GROUP_SCHED
> unsigned long tg_load_avg_contrib;
> + unsigned long propagate_avg;
> #endif
> atomic_long_t removed_load_avg, removed_util_avg;
> #ifndef CONFIG_64BIT
> --
> 1.9.1
>