Re: [PATCH v3 2/5] sched/fair: Skip detach and attach new group task

From: Vincent Guittot
Date: Wed Jun 01 2016 - 08:20:35 EST


On 1 June 2016 at 05:41, Yuyang Du <yuyang.du@xxxxxxxxx> wrote:
> Vincent reported that the first task to a new task group's cfs_rq will
> be attached in attach_task_cfs_rq() and once more when it is enqueued
> (see https://lkml.org/lkml/2016/5/25/388).
>
> Actually, it is much worse. The load is currently attached mostly twice
> every time when we switch to fair class or change task groups. These two
> scenarios are concerned, which we will describe in the following
> respectively

AFAICT, and according to tests that I have done around these 2 use
cases, the task is attached only once during a switch to the fair class
and during a sched_move_task. Have you faced such a situation during
tests? What is the sequence that generates this issue?

>
> 1) Switch to fair class:
>
> The sched class change is done like this:
>
> if (queued)
> enqueue_task();
> check_class_changed()
> switched_from()
> switched_to()
>
> If the task is on_rq, it should have already been enqueued, which
> MAY have attached the load to the cfs_rq, if so, we shouldn't attach

No, it can't. The only way to attach a task during enqueue is if
last_update_time has been reset, which is not the case during a
switched_to_fair.

> it again in switched_to(), otherwise, we will attach it twice. This is
> what the current situation is.
>
> So to cover both the on_rq and !on_rq cases, as well as both the task
> was switched from fair and otherwise, the simplest solution is to reset
> the task's last_update_time to 0, when the task is switched from fair.
> Then let task enqueue do the load attachment.
>
> 2) Change between fair task groups:
>
> The task groups are changed like this:
>
> if (queued)
> dequeue_task()
> task_move_group()
> if (queued)
> enqueue_task()
>
> Unlike the switch to fair class, if the task is on_rq, it will be enqueued
> after we move task groups, so the simplest solution is to reset the
> task's last_update_time when we do task_move_group(), and then let
> enqueue_task() do the load attachment.

Same for this sequence: the task is explicitly attached only once,
during task_move_group, and never during the enqueue.

So you want to delay the attach until the enqueue? But what happens
if the task was not enqueued when it was moved between groups?
The load_avg of the task stays frozen during that period because its
last_update_time has been reset.

>
> Reported-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
> ---
> kernel/sched/fair.c | 47 +++++++++++++++++++++--------------------------
> 1 file changed, 21 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3270598..89513b6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2959,7 +2959,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
> update_tg_load_avg(cfs_rq, 0);
> }
>
> -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +/* Virtually synchronize task with its cfs_rq */
> +static inline void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> se->avg.last_update_time = cfs_rq->avg.last_update_time;
> cfs_rq->avg.load_avg += se->avg.load_avg;
> @@ -2970,19 +2971,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> cfs_rq_util_change(cfs_rq);
> }
>
> -static inline void attach_age_load_task(struct rq *rq, struct task_struct *p)
> -{
> - struct sched_entity *se = &p->se;
> -
> - if (!sched_feat(ATTACH_AGE_LOAD))
> - return;
> -
> - if (se->avg.last_update_time) {
> - __update_load_avg(cfs_rq_of(se)->avg.last_update_time, cpu_of(rq),
> - &se->avg, 0, 0, NULL);
> - }
> -}
> -
> static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> @@ -3057,6 +3045,11 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
> }
> #endif
>
> +static inline void reset_task_last_update_time(struct task_struct *p)
> +{
> + p->se.avg.last_update_time = 0;
> +}
> +
> /*
> * Task first catches up with cfs_rq, and then subtract
> * itself from the cfs_rq (task must be off the queue now).
> @@ -3109,10 +3102,8 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> static inline void remove_entity_load_avg(struct sched_entity *se) {}
>
> static inline void
> -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> -static inline void
> detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> -static inline void attach_age_load_task(struct rq *rq, struct task_struct *p) {}
> +static inline void reset_task_last_update_time(struct task_struct *p) {}
>
> static inline int idle_balance(struct rq *rq)
> {
> @@ -8400,9 +8391,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
> se->depth = se->parent ? se->parent->depth + 1 : 0;
> #endif
>
> - /* Synchronize task with its cfs_rq */
> - attach_entity_load_avg(cfs_rq, se);
> -
> if (!vruntime_normalized(p))
> se->vruntime += cfs_rq->min_vruntime;
> }
> @@ -8410,16 +8398,18 @@ static void attach_task_cfs_rq(struct task_struct *p)
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> detach_task_cfs_rq(p);
> + reset_task_last_update_time(p);
> + /*
> + * If we change back to fair class, we will attach the sched
> + * avgs when we are enqueued, which will be done only once. We
> + * won't have the chance to consistently age the avgs before
> + * attaching them, so we have to continue with the last updated
> + * sched avgs when we were detached.
> + */
> }
>
> static void switched_to_fair(struct rq *rq, struct task_struct *p)
> {
> - /*
> - * If we change between classes, age the averages before attaching them.
> - * XXX: we could have just aged the entire load away if we've been
> - * absent from the fair class for too long.
> - */
> - attach_age_load_task(rq, p);
> attach_task_cfs_rq(p);
>
> if (task_on_rq_queued(p)) {
> @@ -8472,6 +8462,11 @@ static void task_move_group_fair(struct task_struct *p)
> detach_task_cfs_rq(p);
> set_task_rq(p, task_cpu(p));
> attach_task_cfs_rq(p);
> + /*
> + * This assures we will attach the sched avgs when we are enqueued,
> + * which will be done only once.
> + */
> + reset_task_last_update_time(p);
> }
>
> void free_fair_sched_group(struct task_group *tg)
> --
> 1.7.9.5
>