Re: [PATCH] sched/deadline: runtime overrun and deadline miss signals

From: Mathieu Poirier
Date: Thu Nov 09 2017 - 13:31:06 EST


Good day Claudio,


On 31 October 2017 at 06:28, Claudio Scordino <claudio@xxxxxxxxxxxxxxx> wrote:
> From: Juri Lelli <juri.lelli@xxxxxxxxx>
>
> This patch adds the possibility to get the delivery of two signals
> whenever there is a runtime overrun or a deadline miss, respectively.
> The request is done through the sched_flags field within the sched_attr
> structure.
>
> Forward port of https://lkml.org/lkml/2009/10/16/170
>
> Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxx>
> Signed-off-by: Claudio Scordino <claudio@xxxxxxxxxxxxxxx>
> Signed-off-by: Luca Abeni <luca.abeni@xxxxxxxxxxxxxxx>
> Cc: Tommaso Cucinotta <tommaso.cucinotta@xxxxxxxx>
> CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> CC: Ingo Molnar <mingo@xxxxxxxxxx>
> CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Mathieu Poirier <mathieu.poirier@xxxxxxxxxx>
> ---
> include/linux/sched.h | 8 ++++++++
> include/uapi/linux/sched.h | 2 ++
> kernel/sched/core.c | 3 ++-
> kernel/sched/deadline.c | 13 +++++++++++++
> kernel/time/posix-cpu-timers.c | 26 ++++++++++++++++++++++++++
> 5 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0f897df..285d1b4 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -473,11 +473,19 @@ struct sched_dl_entity {
> * has not been executed yet. This flag is useful to avoid race
> * conditions between the inactive timer handler and the wakeup
> * code.
> + *
> + * @dl_overrun tells if the task asked to be informed about budget
> + * overruns.
> + *
> + * @dl_dmiss tells if the task asked to be informed about deadline
> + * misses.
> */
> int dl_throttled : 1;
> int dl_boosted : 1;
> int dl_yielded : 1;
> int dl_non_contending : 1;
> + int dl_overrun : 1;
> + int dl_dmiss : 1;
>
> /*
> * Bandwidth enforcement timer. Each -deadline task has its
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index e2a6c7b..544be0c 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -48,5 +48,7 @@
> */
> #define SCHED_FLAG_RESET_ON_FORK 0x01
> #define SCHED_FLAG_RECLAIM 0x02
> +#define SCHED_FLAG_DL_OVERRUN 0x04
> +#define SCHED_FLAG_DL_DMISS 0x08
>
> #endif /* _UAPI_LINUX_SCHED_H */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 97227df..d79df7a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4041,7 +4041,8 @@ static int __sched_setscheduler(struct task_struct *p,
> }
>
> if (attr->sched_flags &
> - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
> + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM |
> + SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_DL_DMISS))
> return -EINVAL;
>
> /*
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 8d1b946..8c1aa61 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1154,6 +1154,17 @@ static void update_curr_dl(struct rq *rq)
> throttle:
> if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
> dl_se->dl_throttled = 1;
> +
> + /*
> + * If requested, inform the user about deadline misses and/or
> + * runtime overruns.
> + */
> + if (unlikely(dl_se->flags & SCHED_FLAG_DL_DMISS &&
> + dl_time_before(dl_se->deadline, rq_clock(rq))))
> + dl_se->dl_dmiss = 1;
> + if (dl_se->flags & SCHED_FLAG_DL_OVERRUN)
> + dl_se->dl_overrun = 1;

Here we automatically imply that a task that is yielding to another
one has overruned its time budget. To me those are two different
things and may mislead applications looking out for the signal.

> +
> __dequeue_task_dl(rq, curr, 0);
> if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
> enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
> @@ -2565,6 +2576,8 @@ void __dl_clear_params(struct task_struct *p)
> dl_se->dl_throttled = 0;
> dl_se->dl_yielded = 0;
> dl_se->dl_non_contending = 0;
> + dl_se->dl_overrun = 0;
> + dl_se->dl_dmiss = 0;
> }
>
> bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index 8585ad6..f3616c5 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -13,6 +13,7 @@
> #include <linux/tick.h>
> #include <linux/workqueue.h>
> #include <linux/compat.h>
> +#include <linux/sched/deadline.h>
>
> #include "posix-timers.h"
>
> @@ -790,6 +791,22 @@ check_timers_list(struct list_head *timers,
> return 0;
> }
>
> +static inline void check_dl_overrun(struct task_struct *tsk)
> +{
> + if (tsk->dl.dl_overrun) {
> + tsk->dl.dl_overrun = 0;
> + pr_info("runtime overrun: %s[%d]\n",
> + tsk->comm, task_pid_nr(tsk));
> + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
> + }
> + if (tsk->dl.dl_dmiss) {
> + tsk->dl.dl_dmiss = 0;
> + pr_info("scheduling deadline miss: %s[%d]\n",
> + tsk->comm, task_pid_nr(tsk));
> + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);

This works well when either one of the dl_overrun or dl_dmiss is set,
but when both are set not so much. We can either leave things as they
are now and let user space figure it out, make them mutually exclusive
or use SIGUSR1 and SIGUSR2. This issue may already have been raised
when the first revision was sent out in 2009.

Tested-by: Mathieu Poirier <mathieu.poirier@xxxxxxxxxx>


> + }
> +}
> +
> /*
> * Check for any per-thread CPU timers that have fired and move them off
> * the tsk->cpu_timers[N] list onto the firing list. Here we update the
> @@ -803,6 +820,9 @@ static void check_thread_timers(struct task_struct *tsk,
> u64 expires;
> unsigned long soft;
>
> + if (dl_task(tsk))
> + check_dl_overrun(tsk);
> +
> /*
> * If cputime_expires is zero, then there are no active
> * per thread CPU timers.
> @@ -905,6 +925,9 @@ static void check_process_timers(struct task_struct *tsk,
> struct task_cputime cputime;
> unsigned long soft;
>
> + if (dl_task(tsk))
> + check_dl_overrun(tsk);
> +
> /*
> * If cputimer is not running, then there are no active
> * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
> @@ -1110,6 +1133,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
> return 1;
> }
>
> + if (dl_task(tsk) && (tsk->dl.dl_overrun || tsk->dl.dl_dmiss))
> + return 1;
> +
> return 0;
> }
>
> --
> 2.7.4
>