[PATCH 3/3] sched/fair: Propagate runnable_load_avg independently from load_avg

From: Tejun Heo
Date: Thu May 04 2017 - 16:30:18 EST


We noticed that with cgroup CPU controller in use, the scheduling
latency gets wonky regardless of nesting level or weight
configuration. This is easily reproducible with Chris Mason's
schbench[1].

All tests are run on a single socket, 16 cores, 32 threads machine.
While the machine is mostly idle, it isn't completey. There's always
some variable management load going on. The command used is

schbench -m 2 -t 16 -s 10000 -c 15000 -r 30

which measures scheduling latency with 32 threads each of which
repeatedly runs for 15ms and then sleeps for 10ms. Here's a
representative result when running from the root cgroup.

# ~/schbench -m 2 -t 16 -s 10000 -c 15000 -r 30
Latency percentiles (usec)
50.0000th: 26
75.0000th: 62
90.0000th: 74
95.0000th: 86
*99.0000th: 887
99.5000th: 3692
99.9000th: 10832
min=0, max=13374

The following is inside a first level CPU cgroup with the maximum
weight.

# ~/schbench -m 2 -t 16 -s 10000 -c 15000 -r 30
Latency percentiles (usec)
50.0000th: 31
75.0000th: 65
90.0000th: 71
95.0000th: 91
*99.0000th: 7288
99.5000th: 10352
99.9000th: 12496
min=0, max=13023

Note the drastic increase in p99 scheduling latency. After
investigation, it turned out that the update_sd_lb_stats(), which is
used by load_balance() to pick the most loaded group, was often
picking the wrong group. A CPU which has one schbench running and
another queued wouldn't report the correspondingly higher
weighted_cpuload() and get looked over as the target of load
balancing.

weighted_cpuload() is the root cfs_rq's runnable_load_avg which is the
sum of the load_avg of all active sched_entities. Without cgroups or
at the root cgroup, each task's load_avg contributes directly to the
sum. When a task wakes up or goes to sleep, the change is immediately
reflected on runnable_load_avg which in turn affects load balancing.

However, when CPU cgroup is in use, a nested cfs_rq blocks this
immediate propagation. When a task wakes up inside a cgroup, the
nested cfs_rq's runnable_load_avg is updated but doesn't get
propagated to the parent. It only affects the matching sched_entity's
load_avg over time which then gets propagated to the runnable_load_avg
at that level. This makes the runnable_load_avg at the root queue
incorrectly include blocked load_avgs of tasks queued in nested
cfs_rqs causing the load balancer to make the wrong choices.

This patch fixes the bug by propagating nested cfs_rq's
runnable_load_avg independently from load_avg. Tasks still contribute
to its cfs_rq's runnable_load_avg the same way; however, a nested
cfs_rq directly propagates the scaled runnable_load_avg to the
matching group sched_entity's avg.runnable_load_avg and keeps the se's
and parent cfs_rq's runnable_load_avg in sync.

This ensures that, for any given cfs_rq, its runnable_load_avg is the
sum of the scaled load_avgs of all and only active tasks queued on it
and its descendants. This allows the load balancer to operate on the
same information whether there are nested cfs_rqs or not.

With the patch applied, the p99 latency from inside a cgroup is
equivalent to the root cgroup case.

# ~/schbench -m 2 -t 16 -s 10000 -c 15000 -r 30
Latency percentiles (usec)
50.0000th: 40
75.0000th: 71
90.0000th: 89
95.0000th: 108
*99.0000th: 679
99.5000th: 3500
99.9000th: 10960
min=0, max=13790

[1] git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
Cc: Paul Turner <pjt@xxxxxxxxxx>
Cc: Chris Mason <clm@xxxxxx>
---
kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 37 insertions(+), 18 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3098,6 +3098,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}

+static inline void
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long load, delta;
+
+ load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
+ shares_runnable));
+ delta = load - se->avg.runnable_load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.runnable_load_avg = load;
+ se->avg.runnable_load_sum = se->avg.runnable_load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.runnable_load_avg, delta);
+ cfs_rq->avg.runnable_load_sum =
+ cfs_rq->avg.runnable_load_avg * LOAD_AVG_MAX;
+}
+
/* Take into account change of load of a child task group */
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3120,17 +3144,6 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
/* Update parent cfs_rq load */
add_positive(&cfs_rq->avg.load_avg, delta);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
-
- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
- if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->avg.runnable_load_avg, delta);
- cfs_rq->avg.runnable_load_sum =
- cfs_rq->avg.runnable_load_avg * LOAD_AVG_MAX;
- }
}

static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
@@ -3152,16 +3165,16 @@ static inline int test_and_clear_tg_cfs_
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);

if (entity_is_task(se))
return 0;

+ update_tg_cfs_runnable(cfs_rq, se);
+
if (!test_and_clear_tg_cfs_propagate(se))
return 0;

- cfs_rq = cfs_rq_of(se);
-
set_tg_cfs_propagate(cfs_rq);

update_tg_cfs_util(cfs_rq, se);
@@ -3298,7 +3311,7 @@ static inline void update_load_avg(struc
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
__update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, false);
+ cfs_rq->curr == se, !entity_is_task(se));
}

decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
@@ -3354,8 +3367,10 @@ enqueue_entity_load_avg(struct cfs_rq *c
{
struct sched_avg *sa = &se->avg;

- cfs_rq->avg.runnable_load_avg += sa->load_avg;
- cfs_rq->avg.runnable_load_sum += sa->load_sum;
+ if (entity_is_task(se)) {
+ cfs_rq->avg.runnable_load_avg += sa->load_avg;
+ cfs_rq->avg.runnable_load_sum += sa->load_sum;
+ }

if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
@@ -3369,6 +3384,9 @@ dequeue_entity_load_avg(struct cfs_rq *c
{
struct sched_avg *sa = &se->avg;

+ if (!entity_is_task(se))
+ return;
+
cfs_rq->avg.runnable_load_avg =
max_t(long, cfs_rq->avg.runnable_load_avg - sa->load_avg, 0);
cfs_rq->avg.runnable_load_sum =
@@ -3406,7 +3424,8 @@ void sync_entity_load_avg(struct sched_e
u64 last_update_time;

last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, false);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, !entity_is_task(se));
}

/*