Re: [PATCH 1/3] sched/fair: Peter's shares_type patch
From: Vincent Guittot
Date: Fri May 05 2017 - 06:40:52 EST
On 4 May 2017 at 22:29, Tejun Heo <tj@xxxxxxxxxx> wrote:
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>
> This patch is combination of
>
> http://lkml.kernel.org/r/20170502081905.GA4626@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
> + http://lkml.kernel.org/r/20170502083009.GA3377@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
> + build fix & use shares_avg for propagating load_avg instead of runnable
>
> This fixes the propagation problem described in the following while
> keeping group se->load_avg.avg in line with the matching
> cfs_rq->load_avg.avg.
>
> http://lkml.kernel.org/r/20170424201415.GB14169@xxxxxxxxxxxxxxx
>
> ---
> kernel/sched/fair.c | 98 +++++++++++++++++++++++++---------------------------
> 1 file changed, 48 insertions(+), 50 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2636,26 +2636,57 @@ account_entity_dequeue(struct cfs_rq *cf
> cfs_rq->nr_running--;
> }
>
> +enum shares_type {
> + shares_runnable,
> + shares_avg,
> + shares_weight,
> +};
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> # ifdef CONFIG_SMP
> -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
> +static long
> +calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
> + enum shares_type shares_type)
> {
> - long tg_weight, load, shares;
> + long tg_weight, tg_shares, load, shares;
>
> - /*
> - * This really should be: cfs_rq->avg.load_avg, but instead we use
> - * cfs_rq->load.weight, which is its upper bound. This helps ramp up
> - * the shares for small weight interactive tasks.
> - */
> - load = scale_load_down(cfs_rq->load.weight);
> + tg_shares = READ_ONCE(tg->shares);
> +
> + switch (shares_type) {
> + case shares_runnable:
> + /*
> + * Instead of the correct cfs_rq->avg.load_avg we use
> + * cfs_rq->runnable_load_avg, which does not include the
> + * blocked load.
> + */
> + load = cfs_rq->runnable_load_avg;
> + break;
> +
> + case shares_avg:
> + load = cfs_rq->avg.load_avg;
> + break;
> +
> + case shares_weight:
> + /*
> + * Instead of the correct cfs_rq->avg.load_avg we use
> + * cfs_rq->load.weight, which is its upper bound. This helps
> + * ramp up the shares for small weight interactive tasks.
> + */
> + load = scale_load_down(cfs_rq->load.weight);
> + break;
> + }
>
> tg_weight = atomic_long_read(&tg->load_avg);
>
> - /* Ensure tg_weight >= load */
> + /*
> + * This ensures the sum is up-to-date for this CPU, in case of the other
> + * two approximations it biases the sum towards their value and in case
> + * of (near) UP ensures the division ends up <= 1.
> + */
> tg_weight -= cfs_rq->tg_load_avg_contrib;
> tg_weight += load;
The above is correct for shares_avg and shares_weight but it's not
correct for shares_runnable because runnable_load_avg is a subset of
load_avg.
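Spelling out what calc_cfs_shares() computes after that adjustment
(this is just the quoted code rewritten, nothing new):

  tg_weight = \Sum_{other CPUs} cfs_rq->tg_load_avg_contrib + load
  shares    = tg->shares * load / tg_weight

For shares_avg and shares_weight, "load" is (an upper bound of) the
local load_avg contribution, so the sum is simply refreshed for this
CPU. For shares_runnable, the local contribution, which includes the
blocked load, is replaced by the smaller runnable_load_avg, and that
is what goes wrong below.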
To highlight this, I have run a single periodic task TA which runs for
3ms with a period of 7.777ms. In case you wonder why 7.777ms, it is
just to make sure it is not synced with the tick (4ms on my platform)
and to cover more cases, but this has no impact on this example.
TA is the only task; nothing else runs on the CPU and the background
activity is close to nothing and doesn't impact the results below.
When TA runs in the root domain, {root
cfs_rq,TA}->avg.{load_avg,util_avg} are in the range [390..440] and
root cfs_rq->runnable_load_avg toggles between 0 when the cfs_rq is
idle and [390..440] when TA is running.
When TA runs in a dedicated cgroup, {root
cfs_rq,TA}->avg.{load_avg,util_avg} remain in the range [390..440]
but root cfs_rq->runnable_load_avg is in the range [1012..1036] and
often stays in this range even while TA is sleeping. I have checked
that I was correctly tracing all PELT metric updates and that root
cfs_rq->runnable_load_avg is really non-zero and not the side effect
of a missing trace.
In fact, if TA is the only task in the cgroup, tg->load_avg ==
cfs_rq->tg_load_avg_contrib.
For shares_runnable, tg_weight == cfs_rq->runnable_load_avg and shares =
cfs_rq->runnable_load_avg * tg->shares / cfs_rq->runnable_load_avg =
tg->shares = 1024.
Then, because of rounding in the PELT computation, the child
cfs_rq->runnable_load_avg can sometimes be non-zero (around 2 or 3)
when idle, so the group entity and the root cfs_rq stay around 1024.
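To make the arithmetic concrete, here is the shares_runnable path of
calc_cfs_shares() with the approximate numbers above, as a plain
userspace program (illustration only, not kernel code):

#include <stdio.h>

/*
 * TA is sleeping: the child cfs_rq keeps a residual runnable_load_avg
 * of ~3 while its blocked load_avg (== tg_load_avg_contrib ==
 * tg->load_avg, since TA is the only task in the cgroup) is ~400 and
 * tg->shares is 1024.
 */
int main(void)
{
	long tg_shares = 1024;	/* READ_ONCE(tg->shares) */
	long load = 3;		/* cfs_rq->runnable_load_avg (residual) */
	long tg_weight = 400;	/* atomic_long_read(&tg->load_avg) */

	tg_weight -= 400;	/* cfs_rq->tg_load_avg_contrib */
	tg_weight += load;	/* tg_weight == load == 3 */

	long shares = tg_shares * load;
	if (tg_weight)
		shares /= tg_weight;

	/* prints "shares = 1024": the residual cancels out and the group
	 * entity keeps contributing tg->shares to the root runnable load */
	printf("shares = %ld\n", shares);
	return 0;
}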
For shares_runnable, it should instead be:

  group_entity->runnable_load_avg = cfs_rq->runnable_load_avg *
    group_entity->avg.load_avg / cfs_rq->avg.load_avg

And I think that shares_avg is also wrong and should be something like:

  group_entity->avg.load_avg = cfs_rq->avg.load_avg *
    group_entity->load.weight / cfs_rq->load.weight
Then we have to take into account the fact that, when we propagate the
load, group_entity->load.weight and cfs_rq->load.weight of the prev and
the new CPU have not been updated yet. I still need to think a bit more
about that impact and how to compensate for it when propagating the
load.
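To show what I mean with the shares_runnable formula above, here is a
pure userspace sketch (the struct and helper are made up for the
example, only the field names mirror fair.c, and it ignores the
load.weight update problem just mentioned):

#include <stdio.h>

/* Stand-ins for the few fields used by the formula. */
struct grp_sketch {
	long cfs_runnable_load_avg;	/* child cfs_rq->runnable_load_avg */
	long cfs_load_avg;		/* child cfs_rq->avg.load_avg */
	long se_load_avg;		/* group_entity->avg.load_avg */
};

/*
 * Scale the child's runnable load by the ratio of the group entity
 * load to the child cfs_rq load, instead of letting the runnable
 * residue cancel out as in calc_cfs_shares().
 */
static long group_runnable(const struct grp_sketch *g)
{
	if (!g->cfs_load_avg)
		return 0;
	return g->cfs_runnable_load_avg * g->se_load_avg / g->cfs_load_avg;
}

int main(void)
{
	/*
	 * TA sleeping: residual runnable_load_avg ~3, blocked load_avg
	 * ~400, and the group entity load_avg kept in line with the
	 * child cfs_rq load_avg as the patch intends.
	 */
	struct grp_sketch g = { 3, 400, 400 };

	/* prints 3 instead of the ~1024 we currently end up with */
	printf("group runnable contribution = %ld\n", group_runnable(&g));
	return 0;
}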
>
> - shares = (tg->shares * load);
> + shares = (tg_shares * load);
> if (tg_weight)
> shares /= tg_weight;
>
> @@ -2671,15 +2702,11 @@ static long calc_cfs_shares(struct cfs_r
> * case no task is runnable on a CPU MIN_SHARES=2 should be returned
> * instead of 0.
> */
> - if (shares < MIN_SHARES)
> - shares = MIN_SHARES;
> - if (shares > tg->shares)
> - shares = tg->shares;
> -
> - return shares;
> + return clamp_t(long, shares, MIN_SHARES, tg_shares);
> }
> # else /* CONFIG_SMP */
> -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
> +static inline long
> +calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, enum shares_type shares_type)
> {
> return tg->shares;
> }
> @@ -2721,7 +2748,7 @@ static void update_cfs_shares(struct sch
> if (likely(se->load.weight == tg->shares))
> return;
> #endif
> - shares = calc_cfs_shares(cfs_rq, tg);
> + shares = calc_cfs_shares(cfs_rq, tg, shares_weight);
>
> reweight_entity(cfs_rq_of(se), se, shares);
> }
> @@ -3078,39 +3105,10 @@ static inline void
> update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> - long delta, load = gcfs_rq->avg.load_avg;
> -
> - /*
> - * If the load of group cfs_rq is null, the load of the
> - * sched_entity will also be null so we can skip the formula
> - */
> - if (load) {
> - long tg_load;
> -
> - /* Get tg's load and ensure tg_load > 0 */
> - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
> -
> - /* Ensure tg_load >= load and updated with current load*/
> - tg_load -= gcfs_rq->tg_load_avg_contrib;
> - tg_load += load;
> -
> - /*
> - * We need to compute a correction term in the case that the
> - * task group is consuming more CPU than a task of equal
> - * weight. A task with a weight equals to tg->shares will have
> - * a load less or equal to scale_load_down(tg->shares).
> - * Similarly, the sched_entities that represent the task group
> - * at parent level, can't have a load higher than
> - * scale_load_down(tg->shares). And the Sum of sched_entities'
> - * load must be <= scale_load_down(tg->shares).
> - */
> - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
> - /* scale gcfs_rq's load into tg's shares*/
> - load *= scale_load_down(gcfs_rq->tg->shares);
> - load /= tg_load;
> - }
> - }
> + long load, delta;
>
> + load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
> + shares_avg));
> delta = load - se->avg.load_avg;
>
> /* Nothing to update */