[PATCH 4/4] sched,fair: Fix PELT integrity for new tasks

From: Peter Zijlstra
Date: Fri Jun 17 2016 - 08:07:08 EST


Vincent and Yuyang found another few scenarios in which entity
tracking goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it
includes its blocked contribution) and are explicitly
detached/attached on migration to another cfs_rq.

Scenario 1: switch to fair class

p->sched_class = fair_class;
if (queued)
enqueue_task(p);
...
enqueue_entity()
enqueue_entity_load_avg()
migrated = !sa->last_update_time (true)
if (migrated)
attach_entity_load_avg()
check_class_changed()
switched_from() (!fair)
switched_to() (fair)
switched_to_fair()
attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in
attach_entity_load_avg() _twice_.

Scenario 2: change between cgroups

sched_move_group(p)
if (queued)
dequeue_task()
task_move_group_fair()
detach_task_cfs_rq()
detach_entity_load_avg()
set_task_rq()
attach_task_cfs_rq()
attach_entity_load_avg()
if (queued)
enqueue_task();
...
enqueue_entity()
enqueue_entity_load_avg()
migrated = !sa->last_update_time (true)
if (migrated)
attach_entity_load_avg()

Similar as with scenario 1, if @p is a new task, it will have
!load_update_time and we'll end up in attach_entity_load_avg()
_twice_.

Furthermore, notice how we do a detach_entity_load_avg() on something
that wasn't attached to begin with.

As stated above; the problem is that the new task isn't yet attached
to the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straight forward as one might think,
since the task is hashed before we call wake_up_new_task() and thus
can be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

Cc: Yuyang Du <yuyang.du@xxxxxxxxx>
Reported-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched.h | 5 +-
kernel/sched/core.c | 98 +++++++++++++++++++++++++++++++++++++-------------
kernel/sched/fair.c | 26 +++++++++----
3 files changed, 94 insertions(+), 35 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -220,9 +220,10 @@ extern void proc_sched_set_task(struct t
#define TASK_WAKING 256
#define TASK_PARKED 512
#define TASK_NOLOAD 1024
-#define TASK_STATE_MAX 2048
+#define TASK_NEW 2048
+#define TASK_STATE_MAX 4096

-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"

extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2340,11 +2340,11 @@ int sched_fork(unsigned long clone_flags

__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;

/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2381,6 +2381,8 @@ int sched_fork(unsigned long clone_flags
p->sched_class = &fair_sched_class;
}

+ init_entity_runnable_average(&p->se);
+
/*
* The child is not yet in the pid-hash so no cgroup attach races,
* and the cgroup is pinned to this child due to cgroup_fork()
@@ -2389,6 +2391,10 @@ int sched_fork(unsigned long clone_flags
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
__set_task_cpu(p, cpu);
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
@@ -2523,16 +2529,18 @@ void wake_up_new_task(struct task_struct
struct rq_flags rf;
struct rq *rq;

- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
@@ -8219,6 +8254,19 @@ static int cpu_cgroup_can_attach(struct
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such
+ * that if its running, we're sure to observe
+ * its full state.
+ */
+ raw_spin_unlock_wait(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT. See
+ * XXX.
+ */
+ if (task->state == TASK_NEW)
+ return -EINVAL;
}
return 0;
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,10 @@ void init_entity_runnable_average(struct
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}

+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -733,18 +737,21 @@ void post_init_entity_util_avg(struct sc
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
+
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
-#endif
+#endif /* CONFIG_SMP */

/*
* Update the current task's runtime statistics.
@@ -2847,8 +2854,6 @@ void set_task_rq_fair(struct sched_entit
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */

-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -2958,6 +2963,8 @@ static void attach_entity_load_avg(struc
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3063,11 +3070,14 @@ void remove_entity_load_avg(struct sched
u64 last_update_time;

/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;

last_update_time = cfs_rq_last_update_time(cfs_rq);