Re: [PATCH 2/2] sched/fair: Skip detach and attach load avgs for new group task

From: Vincent Guittot
Date: Thu May 26 2016 - 07:51:22 EST


On 26 May 2016 at 03:14, Yuyang Du <yuyang.du@xxxxxxxxx> wrote:
> Vincent reported that the first task to a new task group's cfs_rq will
> be attached in attach_task_cfs_rq() and once more when it is enqueued
> (see https://lkml.org/lkml/2016/5/25/388).
>
> Actually, it is worse, attach_task_cfs_rq() is called for new task even
> way before init_entity_runnable_average().
>
> Solve this by avoiding attach as well as detach new task's sched avgs
> in task_move_group_fair(). To do it, we need to know whether the task
> is forked or not, so we pass this info all the way from sched_move_task()
> to attach_task_cfs_rq().

I'm not sure that this is the right way to solve this problem, because you
still attach the task twice without detaching it in the meantime:
- once during the copy of the process in cpu_cgroup_fork() (you skip the
attach of the load average, but the task is still attached to the local
cpu)
In the meantime, the sched_entity is initialized and the last_update_time is reset
- once more when the task is enqueued, because the last_update_time
has been reset (this time you don't skip the attach of load_avg)

Wouldn't it be better to detach the sched_entity with a copy of its parent's
metrics before initializing it and attaching it to the new cpu?

>
> Reported-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
> ---
> kernel/sched/auto_group.c | 2 +-
> kernel/sched/core.c | 8 ++++----
> kernel/sched/fair.c | 23 ++++++++++++-----------
> kernel/sched/sched.h | 4 ++--
> 4 files changed, 19 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
> index a5d966c..e5f0be2 100644
> --- a/kernel/sched/auto_group.c
> +++ b/kernel/sched/auto_group.c
> @@ -143,7 +143,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
> goto out;
>
> for_each_thread(p, t)
> - sched_move_task(t);
> + sched_move_task(t, 0);
> out:
> unlock_task_sighand(p, &flags);
> autogroup_kref_put(prev);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7f2cae4..8585032 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7724,7 +7724,7 @@ void sched_offline_group(struct task_group *tg)
> * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
> * reflect its new group.
> */
> -void sched_move_task(struct task_struct *tsk)
> +void sched_move_task(struct task_struct *tsk, int fork)
> {
> struct task_group *tg;
> int queued, running;
> @@ -7753,7 +7753,7 @@ void sched_move_task(struct task_struct *tsk)
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> if (tsk->sched_class->task_move_group)
> - tsk->sched_class->task_move_group(tsk);
> + tsk->sched_class->task_move_group(tsk, fork);
> else
> #endif
> set_task_rq(tsk, task_cpu(tsk));
> @@ -8186,7 +8186,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
>
> static void cpu_cgroup_fork(struct task_struct *task)
> {
> - sched_move_task(task);
> + sched_move_task(task, 1);
> }
>
> static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -8213,7 +8213,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
> struct cgroup_subsys_state *css;
>
> cgroup_taskset_for_each(task, css, tset)
> - sched_move_task(task);
> + sched_move_task(task, 0);
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e89c39b..e5a61b1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2970,6 +2970,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> cfs_rq_util_change(cfs_rq);
> }
>
> +/* Catch up with the cfs_rq and then remove our sched avgs from it */
> static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> @@ -8368,9 +8369,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
> place_entity(cfs_rq, se, 0);
> se->vruntime -= cfs_rq->min_vruntime;
> }
> -
> - /* Catch up with the cfs_rq and remove our load when we leave */
> - detach_entity_load_avg(cfs_rq, se);
> }
>
> static void attach_task_cfs_rq(struct task_struct *p)
> @@ -8386,9 +8384,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
> se->depth = se->parent ? se->parent->depth + 1 : 0;
> #endif
>
> - /* Synchronize task with its cfs_rq */
> - attach_entity_load_avg(cfs_rq, se);
> -
> if (!vruntime_normalized(p))
> se->vruntime += cfs_rq->min_vruntime;
> }
> @@ -8396,6 +8391,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> detach_task_cfs_rq(p);
> + detach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
> }
>
> static void switched_to_fair(struct rq *rq, struct task_struct *p)
> @@ -8422,6 +8418,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
> skip_aging:
> #endif
> attach_task_cfs_rq(p);
> + attach_entity_load_avg(cfs_rq_of(se), se);
>
> if (task_on_rq_queued(p)) {
> /*
> @@ -8468,16 +8465,20 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> -static void task_move_group_fair(struct task_struct *p)
> +static void task_move_group_fair(struct task_struct *p, int fork)
> {
> detach_task_cfs_rq(p);
> + /*
> + * New task does not need detach or attach load (see below)
> + */
> + if (!fork)
> + detach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
> +
> set_task_rq(p, task_cpu(p));
>
> -#ifdef CONFIG_SMP
> - /* Tell se's cfs_rq has been changed -- migrated */
> - p->se.avg.last_update_time = 0;
> -#endif
> attach_task_cfs_rq(p);
> + if (!fork)
> + attach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
> }
>
> void free_fair_sched_group(struct task_group *tg)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 72f1f30..58b1259 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -343,7 +343,7 @@ extern void sched_online_group(struct task_group *tg,
> extern void sched_destroy_group(struct task_group *tg);
> extern void sched_offline_group(struct task_group *tg);
>
> -extern void sched_move_task(struct task_struct *tsk);
> +extern void sched_move_task(struct task_struct *tsk, int fork);
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
> @@ -1247,7 +1247,7 @@ struct sched_class {
> void (*update_curr) (struct rq *rq);
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> - void (*task_move_group) (struct task_struct *p);
> + void (*task_move_group) (struct task_struct *p, int fork);
> #endif
> };
>
> --
> 1.7.9.5
>