[PATCH v2 2/3] sched/fair: Skip detach and attach new group task

From: Yuyang Du
Date: Tue May 31 2016 - 02:30:30 EST


Vincent reported that the first task added to a new task group's cfs_rq
is attached twice: once in attach_task_cfs_rq() and once more when it is
enqueued (see https://lkml.org/lkml/2016/5/25/388).

Actually, it is worse: attach_task_cfs_rq() is invoked for a new task
well before that task is initialized in init_entity_runnable_average().

Solve this by skipping both the detach and the attach of a new task in
task_move_group_fair(). To do so, we need to know whether the task is
being forked or not, so we pass this information all the way from
sched_move_task() to task_move_group_fair(), which then skips
detach_task_cfs_rq() and attach_task_cfs_rq() for a forked task.

Reported-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
kernel/sched/auto_group.c | 2 +-
kernel/sched/core.c | 8 ++++----
kernel/sched/fair.c | 17 ++++++++++++-----
kernel/sched/sched.h | 4 ++--
4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966c..e5f0be2 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,7 +143,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
goto out;

for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, 0);
out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4..8585032 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7724,7 +7724,7 @@ void sched_offline_group(struct task_group *tg)
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
* reflect its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, int fork)
{
struct task_group *tg;
int queued, running;
@@ -7753,7 +7753,7 @@ void sched_move_task(struct task_struct *tsk)

#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ tsk->sched_class->task_move_group(tsk, fork);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -8186,7 +8186,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)

static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ sched_move_task(task, 1);
}

static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -8213,7 +8213,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;

cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, 0);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a5bdbeb..5b34286 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2970,6 +2970,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq_util_change(cfs_rq);
}

+/* Catch up with the cfs_rq and then remove our sched avgs from it */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -8369,7 +8370,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}

- /* Catch up with the cfs_rq and remove our load when we leave */
detach_entity_load_avg(cfs_rq, se);
}

@@ -8386,7 +8386,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif

- /* Synchronize task with its cfs_rq */
+ /* Synchronize and attach task to its cfs_rq */
attach_entity_load_avg(cfs_rq, se);

if (!vruntime_normalized(p))
@@ -8468,11 +8468,18 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p)
+static void task_move_group_fair(struct task_struct *p, int fork)
{
- detach_task_cfs_rq(p);
+ /*
+ * New task does not need detach or attach (see below)
+ */
+ if (!fork)
+ detach_task_cfs_rq(p);
+
set_task_rq(p, task_cpu(p));
- attach_task_cfs_rq(p);
+
+ if (!fork)
+ attach_task_cfs_rq(p);
}

void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f30..58b1259 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -343,7 +343,7 @@ extern void sched_online_group(struct task_group *tg,
extern void sched_destroy_group(struct task_group *tg);
extern void sched_offline_group(struct task_group *tg);

-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, int fork);

#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
@@ -1247,7 +1247,7 @@ struct sched_class {
void (*update_curr) (struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_move_group) (struct task_struct *p, int fork);
#endif
};

--
1.7.9.5