[RFC][PATCH 09/14] sched/fair: Propagate an effective runnable_load_avg

From: Peter Zijlstra
Date: Fri May 12 2017 - 13:21:15 EST


The load balancer uses runnable_load_avg as its load indicator. For
the !cgroup case this is:

runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq

That is, a direct sum over all runnable tasks on that runqueue, as
opposed to load_avg, which sums over all tasks on the runqueue and
therefore also includes a blocked component.
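
For illustration only (not part of the patch; toy_se is a made-up
stand-in for the real sched_entity), that sum amounts to:

	struct toy_se {
		unsigned long	load_avg;	/* PELT load average of the entity */
		int		on_rq;		/* non-zero while the entity is runnable */
	};

	/* Sum the load of runnable entities only; blocked ones contribute nothing. */
	static unsigned long toy_runnable_load_avg(const struct toy_se *se, int nr)
	{
		unsigned long sum = 0;
		int i;

		for (i = 0; i < nr; i++) {
			if (se[i].on_rq)
				sum += se[i].load_avg;
		}
		return sum;
	}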

However, in the cgroup case, this comes apart since the group entities
are always runnable, even if most of their constituent entities are
blocked.

Therefore introduce a runnable_weight, which for task entities equals
the regular weight, but for group entities is the fraction of the
entity weight that represents the runnable part of the group
runqueue.
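
Concretely, the group entity's runnable_weight is its shares scaled by
the runnable fraction of the group runqueue (restating the code in
update_cfs_group() below):

	/* grq is the group's own runqueue (gcfs_rq) */
	runnable_weight = shares * grq->avg.runnable_load_avg;
	if (runnable_weight)
		runnable_weight /= max(grq->avg.load_avg,
				       grq->avg.runnable_load_avg);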

Then propagate this load through the PELT hierarchy to arrive at an
effective runnable load average -- which we should not confuse with
the canonical runnable load average.
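
As a made-up numerical example of the effect: with shares = 1024,
grq->avg.load_avg = 2048 and grq->avg.runnable_load_avg = 512 (three
quarters of the group's load is blocked), the group entity ends up
with:

	runnable_weight = 1024 * 512 / 2048 = 256

so the parent runqueue sees an effective runnable load of about a
quarter of the group's shares, rather than the full 1024 that the
always-runnable group entity would otherwise contribute.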

Suggested-by: Tejun Heo <tj@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 include/linux/sched.h |    3
 kernel/sched/debug.c  |    8 +
 kernel/sched/fair.c   |  228 ++++++++++++++++++++++++++++++--------------------
 kernel/sched/sched.h  |    3
 4 files changed, 150 insertions(+), 92 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -314,9 +314,11 @@ struct load_weight {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
+ u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
+ unsigned long runnable_load_avg;
unsigned long util_avg;
};

@@ -359,6 +361,7 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
+ unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -396,9 +396,11 @@ static void print_cfs_group_stats(struct
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
+ P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
+ P(se->avg.runnable_load_avg);
#endif

#undef PN_SCHEDSTAT
@@ -513,10 +515,11 @@ void print_cfs_rq(struct seq_file *m, in
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->runnable_load_avg);
+ cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
@@ -947,10 +950,13 @@ void proc_sched_show_task(struct task_st
"nr_involuntary_switches", (long long)p->nivcsw);

P(se.load.weight);
+ P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
+ P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
+ P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -743,8 +743,9 @@ void init_entity_runnable_average(struct
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = LOAD_AVG_MAX;
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+ sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
+
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -2755,38 +2756,83 @@ static long calc_cfs_shares(struct cfs_r
} while (0)

/*
- * XXX we want to get rid of this helper and use the full load resolution.
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
*/
static inline long se_weight(struct sched_entity *se)
{
return scale_load_down(se->load.weight);
}

+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
+
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight += se->runnable_weight;
+
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight -= se->runnable_weight;
+
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
+ se_runnable(se) * se->avg.runnable_load_sum);
+}
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight)
+ unsigned long weight, unsigned long runnable)
{
unsigned long se_load_avg = se->avg.load_avg;
u64 se_load_sum = se_weight(se) * se->avg.load_sum;
u64 new_load_sum = scale_load_down(weight) * se->avg.load_sum;
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;

if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+
account_entity_dequeue(cfs_rq, se);
+ dequeue_entity_load_avg(cfs_rq, se);
}

- se->avg.load_avg = div_u64(new_load_sum,
- LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+ se->avg.load_avg = div_u64(new_load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(scale_load_down(runnable) * se->avg.runnable_load_sum, divider);

+ se->runnable_weight = runnable;
update_load_set(&se->load, weight);

if (se->on_rq) {
+ /* XXX delta accounting for these */
+
account_entity_enqueue(cfs_rq, se);
- add_positive(&cfs_rq->runnable_load_avg,
- (long)(se->avg.load_avg - se_load_avg));
- add_positive(&cfs_rq->runnable_load_sum,
- (s64)(new_load_sum - se_load_sum));
+ enqueue_entity_load_avg(cfs_rq, se);
}

add_positive(&cfs_rq->avg.load_avg,
@@ -2797,31 +2843,45 @@ static void reweight_entity(struct cfs_r

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- long shares;
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares, runnable;

- if (!cfs_rq)
+ if (!gcfs_rq)
return;

- if (throttled_hierarchy(cfs_rq))
+ if (throttled_hierarchy(gcfs_rq))
return;

#ifndef CONFIG_SMP
- shares = READ_ONCE(cfs_rq->tg->shares);
+ shares = READ_ONCE(gcfs_rq->tg->shares);

if (likely(se->load.weight == shares))
return;
#else
- shares = calc_cfs_shares(cfs_rq);
+ shares = calc_cfs_shares(gcfs_rq);
#endif
+ /*
+ * The hierarchical runnable load metric is the proportional part
+ * of this group's runnable_load_avg / load_avg.
+ *
+ * Note: we need to deal with very sporadic 'runnable > load' cases
+ * due to numerical instability.
+ */
+ runnable = shares * gcfs_rq->avg.runnable_load_avg;
+ if (runnable)
+ runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg);

- reweight_entity(cfs_rq_of(se), se, shares);
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
}

#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2905,7 +2965,7 @@ static u32 __accumulate_pelt_segments(u6
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2922,10 +2982,8 @@ accumulate_sum(u64 delta, int cpu, struc
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods);
- }
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);

/*
@@ -2938,11 +2996,10 @@ accumulate_sum(u64 delta, int cpu, struc
sa->period_contrib = delta;

contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;

@@ -2979,7 +3036,7 @@ accumulate_sum(u64 delta, int cpu, struc
*/
static __always_inline int
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long load, unsigned long runnable, int running)
{
u64 delta;

@@ -3010,45 +3067,60 @@ ___update_load_sum(u64 now, int cpu, str
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;

return 1;
}

static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;

/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(weight * sa->load_sum, divider);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, divider);
- }
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
sa->util_avg = sa->util_sum / divider;
}

/*
* sched_entity:
*
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
* load_sum := runnable_sum
* load_avg = se_weight(se) * runnable_avg
*
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
* cfs_rq:
*
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runnable_load_avg
*/

static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
- ___update_load_avg(&se->avg, se_weight(se), NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}

@@ -3058,10 +3130,13 @@ __update_load_avg_blocked_se(u64 now, in
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
- cfs_rq->curr == se, NULL)) {
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {

- ___update_load_avg(&se->avg, se_weight(se), NULL);
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}

@@ -3073,8 +3148,10 @@ __update_load_avg_cfs_rq(u64 now, int cp
{
if (___update_load_sum(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq)) {
- ___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
+
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
return 1;
}

@@ -3253,8 +3330,8 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long runnable_sum = gcfs_rq->prop_runnable_sum;
- long load_avg;
- s64 load_sum;
+ long runnable_load_avg, load_avg;
+ s64 runnable_load_sum, load_sum;

if (!runnable_sum)
return;
@@ -3270,9 +3347,15 @@ update_tg_cfs_runnable(struct cfs_rq *cf
add_positive(&cfs_rq->avg.load_avg, load_avg);
add_positive(&cfs_rq->avg.load_sum, load_sum);

+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+
if (se->on_rq) {
- add_positive(&cfs_rq->runnable_load_avg, load_avg);
- add_positive(&cfs_rq->runnable_load_sum, load_sum);
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
}
}

@@ -3372,23 +3455,6 @@ static inline void cfs_rq_util_change(st
}
}

-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
@@ -3529,22 +3595,6 @@ static inline void update_load_avg(struc
update_tg_load_avg(cfs_rq, 0);
}

-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_load_avg += se->avg.load_avg;
- cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
-}
-
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
-}
-
#ifndef CONFIG_64BIT
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
@@ -3614,7 +3664,7 @@ void remove_entity_load_avg(struct sched

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->runnable_load_avg;
+ return cfs_rq->avg.runnable_load_avg;
}

static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3790,8 +3840,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ update_cfs_group(se);
enqueue_entity_load_avg(cfs_rq, se);
- update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);

if (flags & ENQUEUE_WAKEUP)
@@ -3897,7 +3947,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);

- update_cfs_shares(se);
+ update_cfs_group(se);

/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -4080,7 +4130,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
- update_cfs_shares(curr);
+ update_cfs_group(curr);

#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4998,7 +5048,7 @@ enqueue_task_fair(struct rq *rq, struct
break;

update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_shares(se);
+ update_cfs_group(se);
}

if (!se)
@@ -5057,7 +5107,7 @@ static void dequeue_task_fair(struct rq
break;

update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_shares(se);
+ update_cfs_group(se);
}

if (!se)
@@ -7122,7 +7172,7 @@ static inline bool cfs_rq_is_decayed(str
if (cfs_rq->avg.util_sum)
return false;

- if (cfs_rq->runnable_load_sum)
+ if (cfs_rq->avg.runnable_load_sum)
return false;

return true;
@@ -9595,7 +9645,7 @@ int sched_group_set_shares(struct task_g
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
- update_cfs_shares(se);
+ update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -396,6 +396,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
+ unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;

u64 exec_clock;
@@ -422,8 +423,6 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
- u64 runnable_load_sum;
- unsigned long runnable_load_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif