[PATCH 4/6 v5] sched: propagate load during synchronous attach/detach

From: Vincent Guittot
Date: Mon Oct 17 2016 - 05:16:19 EST


When a task moves from/to a cfs_rq, we set a flag which is then used to
propagate the change at parent level (sched_entity and cfs_rq) during the
next update. If the cfs_rq is throttled, the flag stays pending until
the cfs_rq is unthrottled.
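
Schematically, the mechanism added below works like this (a simplified
sketch of the new helpers, not the exact call chain):

	/* when a task is attached to / detached from a group cfs_rq */
	set_tg_cfs_propagate(cfs_rq);		/* cfs_rq->propagate_avg = 1 */

	/* later, from update_load_avg() on the group sched_entity */
	if (test_and_clear_tg_cfs_propagate(se)) {
		cfs_rq = cfs_rq_of(se);
		set_tg_cfs_propagate(cfs_rq);	/* keep walking up the tree */
		update_tg_cfs_util(cfs_rq, se);
		update_tg_cfs_load(cfs_rq, se);
	}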

For propagating the utilization, we copy the utilization of the group cfs_rq
to the sched_entity.
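
In code terms this is a straight copy on the sched_entity plus the
corresponding delta on the parent cfs_rq (a condensed view of
update_tg_cfs_util() below):

	delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
	se->avg.util_avg = gcfs_rq->avg.util_avg;
	add_positive(&cfs_rq->avg.util_avg, delta);	/* clamped at 0 */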

For propagating the load, we have to take into account the load of the
whole task group in order to evaluate the load of the sched_entity.
Similarly to what was done before the rewrite of PELT, we add a correction
factor in case the task group's load is greater than its shares, so that it
contributes the same load as a task of equal weight.
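
Concretely, the correction applied in update_tg_cfs_load() below is:

	if (tg_load > scale_load_down(tg->shares))
		load = load * scale_load_down(tg->shares) / tg_load;

With purely illustrative numbers: for tg->shares = 1024, a group cfs_rq
load of 2048 and a task-group-wide load (tg_load) of 4096, the sched_entity
is assigned 2048 * 1024 / 4096 = 512 instead of the raw 2048, i.e. the
group cfs_rq's proportion (2048/4096) of scale_load_down(tg->shares).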

Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
---
kernel/sched/fair.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 1 +
2 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52cbc92..91fc949 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3031,6 +3031,162 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
}

/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing a negative
+ * value.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) res, val = (_val); \
+ typeof(*ptr) var = READ_ONCE(*ptr); \
+ res = var + val; \
+ if (res < 0) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of the group cfs_rq is zero, the load of the
+ * sched_entity will also be zero, so we can skip the formula.
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and that it reflects the current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equal to tg->shares will have
+ * a load less than or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level can't have a load higher than
+ * scale_load_down(tg->shares), and the sum of sched_entities'
+ * loads must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* Scale gcfs_rq's load into tg's shares */
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ /* set cfs_rq's flag */
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ /* Get my cfs_rq */
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ /* Nothing to propagate */
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ /* Clear my cfs_rq's flag */
+ cfs_rq->propagate_avg = 0;
+
+ return 1;
+}
+
+/* Propagate the changes of a group cfs_rq to its parent sched_entity and cfs_rq */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ /* Get parent cfs_rq */
+ cfs_rq = cfs_rq_of(se);
+
+ /* Propagate to parent */
+ set_tg_cfs_propagate(cfs_rq);
+
+ /* Update utilization */
+ update_tg_cfs_util(cfs_rq, se);
+
+ /* Update load */
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+#else
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+#endif
+
+/*
* Unsigned subtract and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
@@ -3111,6 +3267,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;

/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -3121,7 +3278,11 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);

- if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG))
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
}

@@ -3140,6 +3301,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);

cfs_rq_util_change(cfs_rq);
}
@@ -3159,6 +3321,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);

cfs_rq_util_change(cfs_rq);
}
@@ -8704,6 +8867,22 @@ static void detach_task_cfs_rq(struct task_struct *p)
update_load_avg(se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Propagate the detach across the tg tree to make it visible to the
+ * root
+ */
+ se = se->parent;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+#endif
}

static void attach_entity_cfs_rq(struct sched_entity *se)
@@ -8722,6 +8901,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Propagate the attach across the tg tree to make it visible to the
+ * root
+ */
+ se = se->parent;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+#endif
}

static void attach_task_cfs_rq(struct task_struct *p)
@@ -8783,6 +8978,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f1185c3..47385e2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
--
2.7.4