[PATCH 3/4] sched,cgroup: Fix cpu_cgroup_fork()

From: Peter Zijlstra
Date: Fri Jun 17 2016 - 08:07:46 EST


From: Vincent Guittot <vincent.guittot@xxxxxxxxxx>

A new fair task is detached and attached from/to task_group with:

cgroup_post_fork()
ss->fork(child) := cpu_cgroup_fork()
sched_move_task()
task_move_group_fair()

Which is wrong, because at this point in fork() the task isn't fully
initialized and it cannot 'move' to another group, because its not
attached to any group as yet.

In fact, cpu_cgroup_fork needs a small part of sched_move_task so we
can just call this small part directly instead sched_move_task. And
the task doesn't really migrate because it is not yet attached so we
need the sequence:

do_fork()
sched_fork()
__set_task_cpu()

cgroup_post_fork()
set_task_rq() # set task group and runqueue

wake_up_new_task()
select_task_rq() can select a new cpu
__set_task_cpu
post_init_entity_util_avg
attach_task_cfs_rq()
activate_task
enqueue_task

This patch makes that happen.

Maybe-Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 67 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 47 insertions(+), 20 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7743,27 +7743,17 @@ void sched_offline_group(struct task_gro
spin_unlock_irqrestore(&task_group_lock, flags);
}

-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
+/*
+ * Set task's runqueue and group.
+ *
+ * In case of a move between group, we update src and dst group thanks to
+ * sched_class->task_move_group. Otherwise, we just need to set runqueue and
+ * group pointers. The task will be attached to the runqueue during its wake
+ * up.
*/
-void sched_move_task(struct task_struct *tsk)
+static void sched_set_group(struct task_struct *tsk, bool move)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);

/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7776,11 +7766,37 @@ void sched_move_task(struct task_struct
tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
+ if (move && tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_set_group(tsk, true);

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8208,9 +8224,20 @@ static void cpu_cgroup_css_free(struct c
sched_free_group(tg);
}

+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_set_group(task, false);
+
+ task_rq_unlock(rq, task, &rf);
}

static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)