[RFC][PATCH 08/14] sched/fair: Rewrite PELT migration propagation

From: Peter Zijlstra
Date: Fri May 12 2017 - 13:21:59 EST


When an entity migrates in (or out) of a runqueue, we need to add (or
remove) its contribution from the entire PELT hierarchy, because even
non-runnable entities are included in the load average sums.

In order to do this we have some propagation logic that updates the
PELT tree, however the way it 'propagates' the runnable (or load)
change is (more or less):

tg->weight * grq->avg.load_avg
ge->avg.load_avg = ------------------------------
tg->load_avg

But that is the expression for ge->weight, and per the definition of
load_avg:

ge->avg.load_avg := ge->weight * ge->avg.runnable_avg

That destroys the runnable_avg (by setting it to 1) we wanted to
propagate.

Instead directly propagate runnable_sum.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/debug.c | 2
kernel/sched/fair.c | 186 ++++++++++++++++++++++++++++-----------------------
kernel/sched/sched.h | 9 +-
3 files changed, 112 insertions(+), 85 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -523,6 +523,8 @@ void print_cfs_rq(struct seq_file *m, in
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3163,11 +3163,77 @@ void set_task_rq_fair(struct sched_entit
se->avg.last_update_time = n_last_update_time;
}

-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ * grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ * d(runnable_avg) += se->avg.load_avg / se->load.weight
+ * (5)
+ * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;

/* Nothing to update */
@@ -3183,102 +3249,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}

-/* Take into account change of load of a child task group */
static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
-
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
+ long load_avg;
+ s64 load_sum;

- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
-
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
+ if (!runnable_sum)
+ return;

- delta = load - se->avg.load_avg;
+ gcfs_rq->prop_runnable_sum = 0;

- /* Nothing to update */
- if (!delta)
- return;
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);

- /* Set new sched_entity's load */
- se->avg.load_avg = load;
- se->avg.load_sum = LOAD_AVG_MAX;
+ add_positive(&se->avg.load_sum, runnable_sum);
+ add_positive(&se->avg.load_avg, load_avg);

- /* Update parent cfs_rq load */
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
+ add_positive(&cfs_rq->avg.load_sum, load_sum);

- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->runnable_load_avg, load_avg);
+ add_positive(&cfs_rq->runnable_load_sum, load_sum);
}
}

-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
-{
- cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
- if (!cfs_rq->propagate_avg)
- return 0;
-
- cfs_rq->propagate_avg = 0;
- return 1;
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
}

/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *gcfs_rq;

if (entity_is_task(se))
return 0;

- if (!test_and_clear_tg_cfs_propagate(se))
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
return 0;

+ gcfs_rq->propagate = 0;
+
cfs_rq = cfs_rq_of(se);

- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);

- update_tg_cfs_util(cfs_rq, se);
- update_tg_cfs_load(cfs_rq, se);
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);

return 1;
}
@@ -3302,7 +3325,7 @@ static inline bool skip_blocked_update(s
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
- if (gcfs_rq->propagate_avg)
+ if (gcfs_rq->propagate)
return false;

/*
@@ -3322,7 +3345,7 @@ static inline int propagate_entity_load_
return 0;
}

-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}

#endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -3386,7 +3409,7 @@ static inline void cfs_rq_util_change(st
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
- unsigned long removed_load = 0, removed_util = 0;
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;

@@ -3396,6 +3419,7 @@ update_cfs_rq_load_avg(u64 now, struct c
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);

@@ -3411,7 +3435,7 @@ update_cfs_rq_load_avg(u64 now, struct c
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);

- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);

decayed = 1;
}
@@ -3444,7 +3468,8 @@ static void attach_entity_load_avg(struc
cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

cfs_rq_util_change(cfs_rq);
}
@@ -3464,7 +3489,8 @@ static void detach_entity_load_avg(struc
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

cfs_rq_util_change(cfs_rq);
}
@@ -3582,6 +3608,7 @@ void remove_entity_load_avg(struct sched
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

@@ -9369,9 +9396,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->propagate_avg = 0;
-#endif
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -427,18 +427,19 @@ struct cfs_rq {
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- unsigned long propagate_avg;
-#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
+ unsigned long runnable_sum;
} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
+
/*
* h_load = weight * f(tg)
*