Re: [PATCH 1/3] sched/fair: Peter's shares_type patch

From: Tejun Heo
Date: Fri May 05 2017 - 11:30:38 EST


Hello,

On Fri, May 05, 2017 at 12:40:27PM +0200, Vincent Guittot wrote:
> In fact, if TA is the only task in the cgroup, tg->load_avg ==
> cfs_rq->tg_load_avg_contrib.
> For shares_avg, tg_weight == cfs_rq->runnable_load_avg and shares =
> cfs_rq->runnable_load_avg * tg->shares / cfs_rq->runnable_load_avg =
> tg->shares = 1024.
> Then, because of rounding in the PELT computation, the child
> cfs_rq->runnable_load_avg can sometimes be non-zero (around 2 or 3)
> when idle, so the group entity and root cfs_rq stay around 1024.

I see. So the minor calculation errors and the "replace the local
term" adjustment combine to yield a wild result.

> For shares_runnable, it should be
>
> group_entity->runnable_load_avg = cfs_rq->runnable_load_avg *
> group_entity->avg.load_avg / cfs_rq->avg.load_avg

Yeah, that could be one way to calculate the value while avoiding the
artifacts. Hmmm... IIUC, replacing the local contribution with the
current one is to ensure that we at least calculate with the current
term on the local queue. This makes sense for weight and shares, but,
as you pointed out, it doesn't make sense to replace the local base
with runnable when the base is expected to be the sum of load_avgs.
How about something like the following?
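
To spell out the substitution as I understand it (so correct me if I'm
misreading the existing code):

	tg_weight  = atomic_long_read(&tg->load_avg);	/* sum over all CPUs, possibly stale   */
	tg_weight -= cfs_rq->tg_load_avg_contrib;	/* drop this CPU's stale contribution  */
	tg_weight += load_base;				/* add back a fresh local term         */

	shares = tg->shares * load / tg_weight;

In the patch below, @load and @load_base are picked per shares_type;
only shares_runnable uses a numerator that differs from the base term
folded into the sum.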

---
kernel/sched/fair.c | 101 +++++++++++++++++++++++++---------------------------
1 file changed, 50 insertions(+), 51 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2636,26 +2636,58 @@ account_entity_dequeue(struct cfs_rq *cf
cfs_rq->nr_running--;
}

+enum shares_type {
+ shares_runnable,
+ shares_avg,
+ shares_weight,
+};
+
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ enum shares_type shares_type)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, load_base, shares;

- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ switch (shares_type) {
+ case shares_runnable:
+ /*
+ * Instead of the correct cfs_rq->avg.load_avg we use
+ * cfs_rq->runnable_load_avg, which does not include the
+ * blocked load.
+ */
+ load = cfs_rq->runnable_load_avg;
+ load_base = cfs_rq->avg.load_avg;
+ break;
+
+ case shares_avg:
+ load = load_base = cfs_rq->avg.load_avg;
+ break;
+
+ case shares_weight:
+ /*
+ * Instead of the correct cfs_rq->avg.load_avg we use
+ * cfs_rq->load.weight, which is its upper bound. This helps
+ * ramp up the shares for small weight interactive tasks.
+ */
+ load = load_base = scale_load_down(cfs_rq->load.weight);
+ break;
+ }

tg_weight = atomic_long_read(&tg->load_avg);

- /* Ensure tg_weight >= load */
+ /*
+ * This ensures the sum is up-to-date for this CPU, in case of the other
+ * two approximations it biases the sum towards their value and in case
+ * of (near) UP ensures the division ends up <= 1.
+ */
tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += load;
+ tg_weight += load_base;

- shares = (tg->shares * load);
+ shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;

@@ -2671,15 +2703,11 @@ static long calc_cfs_shares(struct cfs_r
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static inline long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, enum shares_type shares_type)
{
return tg->shares;
}
@@ -2721,7 +2749,7 @@ static void update_cfs_shares(struct sch
if (likely(se->load.weight == tg->shares))
return;
#endif
- shares = calc_cfs_shares(cfs_rq, tg);
+ shares = calc_cfs_shares(cfs_rq, tg, shares_weight);

reweight_entity(cfs_rq_of(se), se, shares);
}
@@ -3078,39 +3106,10 @@ static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
-
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
-
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
-
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
+ long load, delta;

+ load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
+ shares_avg));
delta = load - se->avg.load_avg;

/* Nothing to update */