[PATCH 1/3] sched/fair: Peter's shares_type patch
From: Tejun Heo
Date: Thu May 04 2017 - 16:29:32 EST
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
This patch is a combination of
http://lkml.kernel.org/r/20170502081905.GA4626@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ http://lkml.kernel.org/r/20170502083009.GA3377@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ a build fix, and using shares_avg instead of shares_runnable for propagating load_avg
This fixes the propagation problem described in the thread below while
keeping the group se->avg.load_avg in line with the matching
cfs_rq->avg.load_avg.
http://lkml.kernel.org/r/20170424201415.GB14169@xxxxxxxxxxxxxxx
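For reference, the share calculation the patch converges on boils down to the
sketch below. This is a minimal standalone illustration, not the kernel code:
the helper names (calc_shares_sketch, clamp_long) are made up, the
scale_load_down()/READ_ONCE()/atomic accessors are dropped, and 'load' stands
for whichever input the shares_type selects (runnable_load_avg, avg.load_avg
or load.weight).

enum shares_type { shares_runnable, shares_avg, shares_weight };

#define MIN_SHARES	2L

static long clamp_long(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * shares ~= tg_shares * load / tg_weight, where tg_weight is the
 * tg-wide load sum with this CPU's stale contribution replaced by
 * the current 'load', and the result is clamped to
 * [MIN_SHARES, tg_shares].
 */
static long calc_shares_sketch(long tg_shares, long tg_load_sum,
			       long my_old_contrib, long load)
{
	long tg_weight = tg_load_sum - my_old_contrib + load;
	long shares = tg_shares * load;

	if (tg_weight)
		shares /= tg_weight;

	return clamp_long(shares, MIN_SHARES, tg_shares);
}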
---
kernel/sched/fair.c | 98 +++++++++++++++++++++++++---------------------------
1 file changed, 48 insertions(+), 50 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2636,26 +2636,57 @@ account_entity_dequeue(struct cfs_rq *cf
cfs_rq->nr_running--;
}
+enum shares_type {
+ shares_runnable,
+ shares_avg,
+ shares_weight,
+};
+
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ enum shares_type shares_type)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, shares;
- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ switch (shares_type) {
+ case shares_runnable:
+ /*
+ * Instead of the correct cfs_rq->avg.load_avg we use
+ * cfs_rq->runnable_load_avg, which does not include the
+ * blocked load.
+ */
+ load = cfs_rq->runnable_load_avg;
+ break;
+
+ case shares_avg:
+ load = cfs_rq->avg.load_avg;
+ break;
+
+ case shares_weight:
+ /*
+ * Instead of the correct cfs_rq->avg.load_avg we use
+ * cfs_rq->load.weight, which is its upper bound. This helps
+ * ramp up the shares for small weight interactive tasks.
+ */
+ load = scale_load_down(cfs_rq->load.weight);
+ break;
+ }
tg_weight = atomic_long_read(&tg->load_avg);
- /* Ensure tg_weight >= load */
+ /*
+ * This ensures the sum is up-to-date for this CPU; for the other
+ * two approximations it biases the sum towards their value, and
+ * in the (near) UP case it ensures the division ends up <= 1.
+ */
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
- shares = (tg->shares * load);
+ shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
@@ -2671,15 +2702,11 @@ static long calc_cfs_shares(struct cfs_r
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static inline long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, enum shares_type shares_type)
{
return tg->shares;
}
@@ -2721,7 +2748,7 @@ static void update_cfs_shares(struct sch
if (likely(se->load.weight == tg->shares))
return;
#endif
- shares = calc_cfs_shares(cfs_rq, tg);
+ shares = calc_cfs_shares(cfs_rq, tg, shares_weight);
reweight_entity(cfs_rq_of(se), se, shares);
}
@@ -3078,39 +3105,10 @@ static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
-
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
-
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
-
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
+ long load, delta;
+ load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
+ shares_avg));
delta = load - se->avg.load_avg;
/* Nothing to update */