Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization

From: Mathieu Poirier
Date: Sun Mar 26 2017 - 13:35:18 EST


On 23 March 2017 at 21:52, luca abeni <luca.abeni@xxxxxxxxxxxxxxx> wrote:
> From: Luca Abeni <luca.abeni@xxxxxxxxxxxxxxx>
>
> This patch implements a more theoretically sound algorithm for
> tracking active utilization: instead of decreasing it when a
> task blocks, use a timer (the "inactive timer", named after the
> "Inactive" task state of the GRUB algorithm) to decrease the
> active utilization at the so-called "0-lag time".
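
A note for other reviewers, since the changelog assumes familiarity with
GRUB: if I read the algorithm right, the "0-lag time" of a blocking task
is the latest instant from which the task, if served at exactly its
reserved bandwidth dl_runtime/dl_period, would still consume its
remaining runtime by the deadline:

	t_0lag = deadline - runtime * dl_period / dl_runtime

which is precisely what task_non_contending() computes below. Until that
instant the task's bandwidth still has to be counted in the active
utilization; past it, the bandwidth can safely be reclaimed.
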
>
> Signed-off-by: Luca Abeni <luca.abeni@xxxxxxxxxxxxxxx>
> Tested-by: Claudio Scordino <claudio@xxxxxxxxxxxxxxx>
> Tested-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
> ---
> include/linux/sched.h | 17 ++++
> kernel/sched/core.c | 3 +
> kernel/sched/deadline.c | 208 ++++++++++++++++++++++++++++++++++++++++++++----
> kernel/sched/sched.h | 2 +
> 4 files changed, 215 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d67eee8..952cac8 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -445,16 +445,33 @@ struct sched_dl_entity {
> *
> * @dl_yielded tells if task gave up the CPU before consuming
> * all its available runtime during the last job.
> + *
> + * @dl_non_contending tells if task is inactive while still
> + * contributing to the active utilization. In other words, it
> + * indicates if the inactive timer has been armed and its handler
> + * has not been executed yet. This flag is useful to avoid race
> + * conditions between the inactive timer handler and the wakeup
> + * code.
> */
> int dl_throttled;
> int dl_boosted;
> int dl_yielded;
> + int dl_non_contending;
>
> /*
> * Bandwidth enforcement timer. Each -deadline task has its
> * own bandwidth to be enforced, thus we need one timer per task.
> */
> struct hrtimer dl_timer;
> +
> + /*
> + * Inactive timer, responsible for decreasing the active utilization
> + * at the "0-lag time". When a -deadline task blocks, it contributes
> + * to GRUB's active utilization until the "0-lag time", hence a
> + * timer is needed to decrease the active utilization at the correct
> + * time.
> + */
> + struct hrtimer inactive_timer;
> };
>
> union rcu_special {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6d6cad9..bf0b0b9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2165,6 +2165,7 @@ void __dl_clear_params(struct task_struct *p)
>
> dl_se->dl_throttled = 0;
> dl_se->dl_yielded = 0;
> + dl_se->dl_non_contending = 0;
> }
>
> /*
> @@ -2196,6 +2197,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>
> RB_CLEAR_NODE(&p->dl.rb_node);
> init_dl_task_timer(&p->dl);
> + init_inactive_task_timer(&p->dl);
> __dl_clear_params(p);
>
> INIT_LIST_HEAD(&p->rt.run_list);
> @@ -2518,6 +2520,7 @@ static int dl_overflow(struct task_struct *p, int policy,
> !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
> __dl_clear(dl_b, p->dl.dl_bw);
> __dl_add(dl_b, new_bw);
> + dl_change_utilization(p, new_bw);
> err = 0;
> } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
> __dl_clear(dl_b, p->dl.dl_bw);
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index cef9adb..86aed82 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -65,6 +65,107 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
> dl_rq->running_bw = 0;
> }
>
> +void dl_change_utilization(struct task_struct *p, u64 new_bw)
> +{
> + if (!task_on_rq_queued(p)) {
> + struct rq *rq = task_rq(p);
> +
> + if (p->dl.dl_non_contending) {
> + sub_running_bw(p->dl.dl_bw, &rq->dl);
> + p->dl.dl_non_contending = 0;
> + /*
> + * If the timer handler is currently running and the
> + * timer cannot be cancelled, inactive_task_timer()
> + * will see that dl_non_contending is not set, and
> + * will not touch the rq's active utilization,
> + * so we are still safe.
> + */
> + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> + put_task_struct(p);
> + }
> + }
> +}
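
A note on the hrtimer_try_to_cancel()/put_task_struct() dance that this
patch repeats in several places: hrtimer_try_to_cancel() returns 1 when
the timer was queued and got cancelled before its handler ran, 0 when it
was not active, and -1 when the handler is currently executing and can
no longer be stopped. Only in the "1" case does the canceller inherit
the task reference taken by task_non_contending(), hence the conditional
put_task_struct(). A minimal sketch of the pattern (names as in the
patch):

	/* arming side, as in task_non_contending() */
	get_task_struct(p);		/* reference held by the timer */
	hrtimer_start(&p->dl.inactive_timer, ns_to_ktime(zerolag_time),
		      HRTIMER_MODE_REL);

	/* any path that deactivates the timer before it fires */
	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
		put_task_struct(p);	/* we cancelled it: drop its reference */
	/* on -1 the running handler drops the reference itself */
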
> +
> +static void task_non_contending(struct task_struct *p)
> +{
> + struct sched_dl_entity *dl_se = &p->dl;
> + struct hrtimer *timer = &dl_se->inactive_timer;
> + struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> + struct rq *rq = rq_of_dl_rq(dl_rq);
> + s64 zerolag_time;
> +
> + /*
> + * If this is a non-deadline task that has been boosted,
> + * do nothing
> + */
> + if (dl_se->dl_runtime == 0)
> + return;
> +
> + WARN_ON(hrtimer_active(&dl_se->inactive_timer));
> + WARN_ON(dl_se->dl_non_contending);
> +
> + zerolag_time = dl_se->deadline -
> + div64_long((dl_se->runtime * dl_se->dl_period),
> + dl_se->dl_runtime);
> +
> + /*
> + * Using relative times instead of the absolute "0-lag time"
> + * allows us to simplify the code
> + */
> + zerolag_time -= rq_clock(rq);
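
To sanity-check the arithmetic here: a task with dl_runtime = 10ms and
dl_period = 100ms (bandwidth 0.1) that blocks with runtime = 2ms left
gets zerolag_time = deadline - 2ms * 100 / 10 = deadline - 20ms; from
that point, the reserved bandwidth alone (0.1 * 20ms = 2ms) exactly
covers the remaining runtime by the deadline.
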
> +
> + /*
> + * If the "0-lag time" already passed, decrease the active
> + * utilization now, instead of starting a timer
> + */
> + if (zerolag_time < 0) {
> + if (dl_task(p))
> + sub_running_bw(dl_se->dl_bw, dl_rq);
> + if (!dl_task(p) || p->state == TASK_DEAD)
> + __dl_clear_params(p);
> +
> + return;
> + }
> +
> + dl_se->dl_non_contending = 1;
> + get_task_struct(p);
> + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
> +}
> +
> +static void task_contending(struct sched_dl_entity *dl_se)
> +{
> + struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +
> + /*
> + * If this is a non-deadline task that has been boosted,
> + * do nothing
> + */
> + if (dl_se->dl_runtime == 0)
> + return;
> +
> + if (dl_se->dl_non_contending) {
> + /*
> + * If the timer handler is currently running and the
> + * timer cannot be cancelled, inactive_task_timer()
> + * will see that dl_non_contending is not set, and
> + * will not touch the rq's active utilization,
> + * so we are still safe.
> + */
> + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
> + put_task_struct(dl_task_of(dl_se));
> + dl_se->dl_non_contending = 0;
> + } else {
> + /*
> + * Since "dl_non_contending" is not set, the
> + * task's utilization has already been removed from
> + * active utilization (either when the task blocked,
> + * or when the "inactive timer" fired).
> + * So, add it back.
> + */
> + add_running_bw(dl_se->dl_bw, dl_rq);
> + }
> +}
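
For my own understanding, the state machine this pair of functions
implements (in GRUB terms) seems to be:

	ActiveContending	task is runnable; counted in running_bw
	ActiveNonContending	task blocked, 0-lag time not reached yet;
				still counted in running_bw, inactive
				timer armed (dl_non_contending == 1)
	Inactive		0-lag time passed; bandwidth removed from
				running_bw

task_non_contending() handles the transition out of ActiveContending on
a block, task_contending() the transition back on a wakeup, and
inactive_task_timer() the ActiveNonContending -> Inactive edge.
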
> +
> static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
> {
> struct sched_dl_entity *dl_se = &p->dl;
> @@ -615,10 +716,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
> * The task might have changed its scheduling policy to something
> * different than SCHED_DEADLINE (through switched_from_dl()).
> */
> - if (!dl_task(p)) {
> - __dl_clear_params(p);
> + if (!dl_task(p))
> goto unlock;
> - }
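
The removal of __dl_clear_params() here makes sense to me: with this
patch, a task that left SCHED_DEADLINE has its parameters reset by
inactive_task_timer() at the 0-lag time (see the switched_from_dl()
change below), so dl_task_timer() no longer needs to do it.
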
>
> /*
> * The task might have been boosted by someone else and might be in the
> @@ -837,6 +936,49 @@ static void update_curr_dl(struct rq *rq)
> }
> }
>
> +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
> +{
> + struct sched_dl_entity *dl_se = container_of(timer,
> + struct sched_dl_entity,
> + inactive_timer);
> + struct task_struct *p = dl_task_of(dl_se);
> + struct rq_flags rf;
> + struct rq *rq;
> +
> + rq = task_rq_lock(p, &rf);
> +
> + if (!dl_task(p) || p->state == TASK_DEAD) {
> + if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
> + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
> + dl_se->dl_non_contending = 0;
> + }
> + __dl_clear_params(p);
> +
> + goto unlock;
> + }
> + if (dl_se->dl_non_contending == 0)
> + goto unlock;
> +
> + sched_clock_tick();
> + update_rq_clock(rq);
> +
> + sub_running_bw(dl_se->dl_bw, &rq->dl);
> + dl_se->dl_non_contending = 0;
> +unlock:
> + task_rq_unlock(rq, p, &rf);
> + put_task_struct(p);
> +
> + return HRTIMER_NORESTART;
> +}
> +
> +void init_inactive_task_timer(struct sched_dl_entity *dl_se)

To be consistent with the other DL-related functions:

s/init_inactive_task_timer(...)/init_dl_inactive_task_timer(...)/


> +{
> + struct hrtimer *timer = &dl_se->inactive_timer;
> +
> + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + timer->function = inactive_task_timer;
> +}
> +
> #ifdef CONFIG_SMP
>
> static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
> @@ -969,9 +1111,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
> * we want a replenishment of its runtime.
> */
> if (flags & ENQUEUE_WAKEUP) {
> - struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> -
> - add_running_bw(dl_se->dl_bw, dl_rq);
> + task_contending(dl_se);
> update_dl_entity(dl_se, pi_se);
> }
> else if (flags & ENQUEUE_REPLENISH)
> @@ -1040,7 +1180,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> * add_running_bw().
> */
> if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
> - add_running_bw(p->dl.dl_bw, &rq->dl);
> + if (flags & ENQUEUE_WAKEUP)
> + task_contending(&p->dl);
> +
> return;
> }
>
> @@ -1065,7 +1207,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> sub_running_bw(p->dl.dl_bw, &rq->dl);
>
> /*
> - * This check allows to decrease the active utilization in two cases:
> + * This check allows us to start the inactive timer (or to immediately
> + * decrease the active utilization, if needed) in two cases:
> * when the task blocks and when it is terminating
> * (p->state == TASK_DEAD). We can handle the two cases in the same
> * way, because from GRUB's point of view the same thing is happening
> @@ -1073,7 +1216,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> * or "inactive")
> */
> if (flags & DEQUEUE_SLEEP)
> - sub_running_bw(p->dl.dl_bw, &rq->dl);
> + task_non_contending(p);
> }
>
> /*
> @@ -1151,6 +1294,28 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
> return cpu;
> }
>
> +static void migrate_task_rq_dl(struct task_struct *p)
> +{
> + if ((p->state == TASK_WAKING) && (p->dl.dl_non_contending)) {
> + struct rq *rq = task_rq(p);
> +
> + raw_spin_lock(&rq->lock);
> + sub_running_bw(p->dl.dl_bw, &rq->dl);
> + p->dl.dl_non_contending = 0;
> + /*
> + * If the timer handler is currently running and the
> + * timer cannot be cancelled, inactive_task_timer()
> + * will see that dl_not_contending is not set, and
> + * will not touch the rq's active utilization,
> + * so we are still safe.
> + */
> + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> + put_task_struct(p);
> +
> + raw_spin_unlock(&rq->lock);
> + }
> +}
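
If I follow the wakeup path correctly, for a cross-CPU wakeup the
sequence is:

	select_task_rq_dl()		picks the destination CPU
	migrate_task_rq_dl()		if the task was still "non
					contending", removes its bandwidth
					from the old rq and cancels the
					inactive timer
	enqueue_task_dl(ENQUEUE_WAKEUP)
	  -> task_contending()		adds the bandwidth back on the
					new rq

so the active utilization never stays accounted on a runqueue the task
has left.
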
> +
> static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
> {
> /*
> @@ -1792,13 +1957,23 @@ void __init init_sched_dl_class(void)
> static void switched_from_dl(struct rq *rq, struct task_struct *p)
> {
> /*
> - * Start the deadline timer; if we switch back to dl before this we'll
> - * continue consuming our current CBS slice. If we stay outside of
> - * SCHED_DEADLINE until the deadline passes, the timer will reset the
> - * task.
> + * task_non_contending() can start the "inactive timer" (if the 0-lag
> + * time is in the future). If the task switches back to dl before
> + * the "inactive timer" fires, it can continue to consume its current
> + * runtime using its current deadline. If it stays outside of
> + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
> + * will reset the task parameters.
> */
> - if (!start_dl_timer(p))
> - __dl_clear_params(p);
> + if (task_on_rq_queued(p) && p->dl.dl_runtime)
> + task_non_contending(p);
> +
> + /*
> + * We cannot use inactive_task_timer() to invoke sub_running_bw()
> + * at the 0-lag time, because the task could have been migrated
> + * while it was SCHED_OTHER in the meantime.
> + */
> + if (p->dl.dl_non_contending)
> + p->dl.dl_non_contending = 0;
>
> /*
> * Since this might be the only -deadline task on the rq,
> @@ -1817,6 +1992,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
> */
> static void switched_to_dl(struct rq *rq, struct task_struct *p)
> {
> + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> + put_task_struct(p);
>
> /* If p is not queued we will update its parameters at next wakeup. */
> if (!task_on_rq_queued(p))
> @@ -1891,6 +2068,7 @@ const struct sched_class dl_sched_class = {
>
> #ifdef CONFIG_SMP
> .select_task_rq = select_task_rq_dl,
> + .migrate_task_rq = migrate_task_rq_dl,
> .set_cpus_allowed = set_cpus_allowed_dl,
> .rq_online = rq_online_dl,
> .rq_offline = rq_offline_dl,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index caaa7d3..57bb79b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
> dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
> }
>
> +void dl_change_utilization(struct task_struct *p, u64 new_bw);
> extern void init_dl_bw(struct dl_bw *dl_b);
>
> #ifdef CONFIG_CGROUP_SCHED
> @@ -1490,6 +1491,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
> extern struct dl_bandwidth def_dl_bandwidth;
> extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
> extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
> +extern void init_inactive_task_timer(struct sched_dl_entity *dl_se);
>
> unsigned long to_ratio(u64 period, u64 runtime);
>
> --
> 2.7.4
>