[PATCH] sched/fair: fix fairness problems among the tasks in different cgroups

From: Joonwoo Park
Date: Tue Oct 18 2016 - 17:27:08 EST


When a new cgroup is created, the scheduler attaches the child cgroup
to its parent and also increases the parent's task_group load_avg to
account for the added load, via the following call path:

sched_create_group()
alloc_fair_sched_group()
sched_online_group()
online_fair_sched_group()
for_each_possible_cpu()
post_init_entity_util_avg()
update_tg_load_avg()

However, the parent's load_avg is shared by all cpus, so it ends up being
increased once per possible cpu. For example, when there are 8 possible
cpus (even if only 1 cpu remains online after hotplugging out the others),
creating the empty cgroups /grp1 and /grp1/grp11 leaves their task_group
load_avg at 8092 and 1024 respectively, whereas the desired load_avg for
both task_groups is 1024, which is what happens when booting with only
1 cpu present.

Such incorrect load_avg accounting causes severe unfairness between
tasks that are in different cgroups. Consider a scenario with
online cpus = 1 and possible cpus = 4, in which two cpu-bound tasks are
running, one in the parent (root) cgroup and the other in the child
cgroup:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > /sys/devices/system/cpu/cpu2/online
# echo 0 > /sys/devices/system/cpu/cpu3/online
# cat /sys/devices/system/cpu/online
0
# mkdir /sys/fs/cgroup/grp1
# dd if=/dev/zero of=/dev/null &
# echo $! > /sys/fs/cgroup/tasks
# dd if=/dev/zero of=/dev/null &
# echo $! > /sys/fs/cgroup/grp1/tasks

After 3 seconds, the task in the root cgroup has received about 4 times
as much execution time as the task in the child cgroup: because the
weight of the possible cpu mask is 4, the scheduler thinks the root
cgroup has 4 times more load than the child cgroup.

dd (2029, #threads: 1)
se.exec_start : 562900.460656
se.sum_exec_runtime : 2573.175002
dd (2032, #threads: 1)
se.exec_start : 562900.037152
se.sum_exec_runtime : 655.439360

In contrast, booting the same system with maxcpus=1 makes both tasks run
evenly:

dd (1952, #threads: 1)
se.exec_start : 75660.457449
se.sum_exec_runtime : 1754.045078
dd (1955, #threads: 1)
se.exec_start : 75680.029689
se.sum_exec_runtime : 1768.195390

Fix such fairness problems by updating the parent's task group load_avg
only once when a new child cgroup is created.

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Signed-off-by: Joonwoo Park <joonwoop@xxxxxxxxxxxxxx>
---
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 9 ++++++---
kernel/sched/sched.h | 3 ++-
3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1..2cf46aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2577,7 +2577,7 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
- post_init_entity_util_avg(&p->se);
+ post_init_entity_util_avg(&p->se, true);

activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a..71c08a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -730,7 +730,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct sched_entity *se, bool update_tg_load)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
@@ -770,7 +770,8 @@ void post_init_entity_util_avg(struct sched_entity *se)

update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ if (update_tg_load)
+ update_tg_load_avg(cfs_rq, false);
}

#else /* !CONFIG_SMP */
@@ -8872,15 +8873,17 @@ void online_fair_sched_group(struct task_group *tg)
struct sched_entity *se;
struct rq *rq;
int i;
+ bool update_tg_load = true;

for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];

raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
+ post_init_entity_util_avg(se, update_tg_load);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
+ update_tg_load = false;
}
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..6ab89af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1356,7 +1356,8 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se,
+ bool update_tg_load);

#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
--
2.9.3