Re: [PATCH 4/6 v7] sched: propagate load during synchronous attach/detach

From: Peter Zijlstra
Date: Wed Nov 09 2016 - 10:03:56 EST


On Tue, Nov 08, 2016 at 10:53:45AM +0100, Vincent Guittot wrote:
> When a task moves from/to a cfs_rq, we set a flag which is then used to
> propagate the change at parent level (sched_entity and cfs_rq) during
> next update. If the cfs_rq is throttled, the flag will stay pending until
> the cfs_rq is unthrottled.
>
> For propagating the utilization, we copy the utilization of group cfs_rq to
> the sched_entity.
>
> For propagating the load, we have to take into account the load of the
> whole task group in order to evaluate the load of the sched_entity.
> Similarly to what was done before the rewrite of PELT, we add a correction
> factor in case the task group's load is greater than its share so it will
> contribute the same load of a task of equal weight.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> ---


I did the below on top, that basically moves code about a bit to reduce
some #ifdef and kills a few comments that I thought were of the:

i++; /* increment by one */

quality.

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2918,6 +2918,26 @@ __update_load_avg(u64 now, int cpu, stru
return decayed;
}

+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -2997,59 +3017,12 @@ void set_task_rq_fair(struct sched_entit
se->avg.last_update_time = n_last_update_time;
}
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */

-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- \
- res = var + val; \
- \
- if (val < 0 && res > var) \
- res = 0; \
- \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
/* Take into account change of utilization of a child task group */
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;

/* Nothing to update */
@@ -3130,22 +3103,17 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq

static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
{
- /* set cfs_rq's flag */
cfs_rq->propagate_avg = 1;
}

static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
{
- /* Get my cfs_rq */
struct cfs_rq *cfs_rq = group_cfs_rq(se);

- /* Nothing to propagate */
if (!cfs_rq->propagate_avg)
return 0;

- /* Clear my cfs_rq's flag */
cfs_rq->propagate_avg = 0;
-
return 1;
}

@@ -3160,28 +3128,51 @@ static inline int propagate_entity_load_
if (!test_and_clear_tg_cfs_propagate(se))
return 0;

- /* Get parent cfs_rq */
cfs_rq = cfs_rq_of(se);

- /* Propagate to parent */
set_tg_cfs_propagate(cfs_rq);

- /* Update utilization */
update_tg_cfs_util(cfs_rq, se);
-
- /* Update load */
update_tg_cfs_load(cfs_rq, se);

return 1;
}
-#else
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
return 0;
}

static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
-#endif
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}

/*
* Unsigned subtract and clamp on underflow.
@@ -3276,8 +3267,7 @@ static inline void update_load_avg(struc
cfs_rq->curr == se, NULL);
}

- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
-
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
decayed |= propagate_entity_load_avg(se);

if (decayed && (flags & UPDATE_TG))
@@ -8993,11 +8983,6 @@ static void detach_entity_cfs_rq(struct
update_load_avg(se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
-
- /*
- * Propagate the detach across the tg tree to make it visible to the
- * root
- */
propagate_entity_cfs_rq(se);
}

@@ -9017,11 +9002,6 @@ static void attach_entity_cfs_rq(struct
update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
-
- /*
- * Propagate the attach across the tg tree to make it visible to the
- * root
- */
propagate_entity_cfs_rq(se);
}