Re: [PATCH 1/2] sched: eliminate bandwidth race between throttling and distribution

From: Phil Auld
Date: Mon Apr 13 2020 - 10:45:17 EST


Hi Josh,

On Sat, Apr 11, 2020 at 07:00:17PM -0700 Josh Don wrote:
> +pauld@xxxxxxxxxx
>
> On Fri, Apr 10, 2020 at 3:52 PM Josh Don <joshdon@xxxxxxxxxx> wrote:
>
> From: Paul Turner <pjt@xxxxxxxxxx>
>
> There is a race window in which an entity begins throttling before quota
> is added to the pool, but does not finish throttling until after we have
> finished with distribute_cfs_runtime(). This entity is not observed by
> distribute_cfs_runtime() because it was not on the throttled list at the
> time that distribution was running. This race manifests as rare
> period-length stalls for such entities.
>
> Rather than heavy-weight the synchronization with the progress of
> distribution, we can fix this by aborting throttling if bandwidth has
> become available. Otherwise, we immediately add the entity to the
> throttled list so that it can be observed by a subsequent distribution.
>
> Additionally, we can remove the case of adding the throttled entity to
> the head of the throttled list, and simply always add to the tail.
> Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto
> its own pool of runtime. This means that if we do hit the !assign and
> distribute_running case, we know that distribution is about to end.
>
> Signed-off-by: Paul Turner <pjt@xxxxxxxxxx>
> Co-developed-by: Ben Segall <bsegall@xxxxxxxxxx>
> Signed-off-by: Ben Segall <bsegall@xxxxxxxxxx>
> Signed-off-by: Josh Don <joshdon@xxxxxxxxxx>

This looks good to me, thanks for the cc.
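
Just to restate my reading of the mechanism, the entry to the new
throttle_cfs_rq() boils down to roughly the following (paraphrasing the
hunks below, with the dequeue and accounting details elided):

        raw_spin_lock(&cfs_b->lock);
        /* Ask for a token 1ns; this also starts the period timer if needed. */
        if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
                /* Raced with bandwidth becoming available: don't throttle. */
                raw_spin_unlock(&cfs_b->lock);
                return false;
        }
        /* Visible to distribute_cfs_runtime() from this point on. */
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
        raw_spin_unlock(&cfs_b->lock);

        /* ... dequeue the hierarchy, then mark cfs_rq throttled under rq->lock ... */
        return true;

The 1ns request (rather than just peeking at the pool) is what keeps later
check_cfs_rq_runtime() calls consistent with the decision not to throttle,
as the comment in the patch explains.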

Reviewed-by: Phil Auld <pauld@xxxxxxxxxx>

> ---
>  kernel/sched/fair.c | 78 ++++++++++++++++++++++++++-------------------
>  1 file changed, 46 insertions(+), 32 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 02f323b85b6d..3fb986c52825 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4587,17 +4587,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
>         return &tg->cfs_bandwidth;
>  }
>
> -/* returns 0 on failure to allocate runtime */
> -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +/* returns 0 on failure to allocate runtime, called with cfs_b->lock held */
> +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
> +                                  struct cfs_rq *cfs_rq, u64 target_runtime)
>  {
> -       struct task_group *tg = cfs_rq->tg;
> -       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
>         u64 amount = 0, min_amount;
>
>         /* note: this is a positive sum as runtime_remaining <= 0 */
> -       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
> +       min_amount = target_runtime - cfs_rq->runtime_remaining;
>
> -       raw_spin_lock(&cfs_b->lock);
>         if (cfs_b->quota == RUNTIME_INF)
>                 amount = min_amount;
>         else {
> @@ -4609,13 +4607,26 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>                         cfs_b->idle = 0;
>                 }
>         }
> -       raw_spin_unlock(&cfs_b->lock);
>
>         cfs_rq->runtime_remaining += amount;
>
>         return cfs_rq->runtime_remaining > 0;
>  }
>
> +/* returns 0 on failure to allocate runtime */
> +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +{
> +       int ret;
> +       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> +
> +       raw_spin_lock(&cfs_b->lock);
> +       ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq,
> +                                     sched_cfs_bandwidth_slice());
> +       raw_spin_unlock(&cfs_b->lock);
> +
> +       return ret;
> +}
> +
>  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>  {
>         /* dock delta_exec before expiring quota (as it could span periods) */
> @@ -4704,13 +4715,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>         return 0;
>  }
>
> -static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>         struct rq *rq = rq_of(cfs_rq);
>         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>         struct sched_entity *se;
>         long task_delta, idle_task_delta, dequeue = 1;
> -       bool empty;
> +
> +       raw_spin_lock(&cfs_b->lock);
> +       /* This will start the period timer if necessary */
> +       if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
> +               /*
> +                * We have raced with bandwidth becoming available, and if we
> +                * actually throttled the timer might not unthrottle us for an
> +                * entire period. We additionally needed to make sure that any
> +                * subsequent check_cfs_rq_runtime calls agree not to throttle
> +                * us, as we may commit to do cfs put_prev+pick_next, so we ask
> +                * for 1ns of runtime rather than just check cfs_b.
> +                */
> +               dequeue = 0;
> +       } else {
> +               list_add_tail_rcu(&cfs_rq->throttled_list,
> +                                 &cfs_b->throttled_cfs_rq);
> +       }
> +       raw_spin_unlock(&cfs_b->lock);
> +
> +       if (!dequeue)
> +               return false;  /* Throttle no longer required. */
>
>         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>
> @@ -4744,29 +4775,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>         if (!se)
>                 sub_nr_running(rq, task_delta);
>
> -       cfs_rq->throttled = 1;
> -       cfs_rq->throttled_clock = rq_clock(rq);
> -       raw_spin_lock(&cfs_b->lock);
> -       empty = list_empty(&cfs_b->throttled_cfs_rq);
> -
>         /*
> -        * Add to the _head_ of the list, so that an already-started
> -        * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
> -        * not running add to the tail so that later runqueues don't get starved.
> +        * Note: distribution will already see us throttled via the
> +        * throttled-list.  rq->lock protects completion.
>          */
> -       if (cfs_b->distribute_running)
> -               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -       else
> -               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -
> -       /*
> -        * If we're the first throttled task, make sure the bandwidth
> -        * timer is running.
> -        */
> -       if (empty)
> -               start_cfs_bandwidth(cfs_b);
> -
> -       raw_spin_unlock(&cfs_b->lock);
> +       cfs_rq->throttled = 1;
> +       cfs_rq->throttled_clock = rq_clock(rq);
> +       return true;
>  }
>
>  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> @@ -5121,8 +5136,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>         if (cfs_rq_throttled(cfs_rq))
>                 return true;
>
> -       throttle_cfs_rq(cfs_rq);
> -       return true;
> +       return throttle_cfs_rq(cfs_rq);
>  }
>
>  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> --
> 2.26.0.110.g2183baf09c-goog
>
>
>

--