Re: [REGRESSION] Re: [PATCH 00/24] Complete EEVDF

From: Marcel Ziswiler
Date: Tue Dec 10 2024 - 11:26:47 EST


On Mon, 2024-12-09 at 10:49 +0100, Peter Zijlstra wrote:
>
> Sorry for the delay, I got laid low by snot monsters :/
>
> On Mon, Dec 02, 2024 at 07:46:21PM +0100, Marcel Ziswiler wrote:
>
> > Unfortunately, once I trigger the failure, the system is completely dead and won't allow me to dump the
> > trace buffer any longer. So I did the following instead on the serial console terminal:
> >
> > tail -f /sys/kernel/debug/tracing/trace
> >
> > Not sure whether there is any better way to go about this. Plus, even though we run the serial console at
> > 1.5 megabaud, I am not fully sure whether it was able to keep up with logging what you are looking for.
>
> Ah, that is unfortunate. There is an ftrace_dump_on_oops option that
> might be of help. And yes, dumping trace buffers over 1.5M baud serial lines
> is tedious -- been there, done that, got a t-shirt and all that.
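
For the record: if I read the docs right, ftrace_dump_on_oops can be armed either on the kernel command
line or at runtime via the sysctl -- a minimal sketch, not something I have verified on this board:

    # on the kernel command line, simply append:
    #   ftrace_dump_on_oops
    # or at runtime (1 should mean "dump all CPU trace buffers when an oops hits"):
    echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
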
>
> Still, let me see if perhaps making that WARN in enqueue_dl_entity()
> return makes the whole thing less fatal.
>
> I've included the traceoff_on_warning and ftrace_dump in the code, so
> all you still need to do is enable the stacktrace option.
>
>    echo 1 > /sys/kernel/debug/tracing/options/stacktrace
>
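Side note, mostly for reference: if I read the kernel-parameters documentation right, the same knobs should
also be available at boot time -- untested here:

    # appended to the kernel command line:
    #   trace_options=stacktrace traceoff_on_warning
    # which ought to mirror the runtime equivalents:
    echo 1 > /sys/kernel/debug/tracing/options/stacktrace
    echo 1 > /proc/sys/kernel/traceoff_on_warning
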
> > Yes, and do not hesitate to ask for any additional information etc.; we are happy to help. Thanks!
>
> Could I bother you to try again with the below patch?

Sure, here you go.

https://drive.codethink.co.uk/s/HniZCtccDBMHpAK

> There are two new hunks vs the previous one. The hunk in
> enqueue_dl_entity() (the very last bit) will stop tracing and dump the
> buffers when that condition is hit, in addition to aborting the
> double enqueue, hopefully leaving the system in a slightly better state.
>
> The other new hunk is the one for dl_server_stop() (the second hunk). While
> going over the code last week, I found that this might be a possible
> hole leading to the observed double enqueue, so fingers crossed.
>
> ---
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 33b4646f8b24..bd1df7612482 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1223,6 +1223,11 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
>  	scoped_guard (rq_lock, rq) {
>  		struct rq_flags *rf = &scope.rf;
>  
> +		if (dl_se == &rq->fair_server) {
> +			trace_printk("timer fair server %d throttled %d\n",
> +				     cpu_of(rq), dl_se->dl_throttled);
> +		}
> +
>  		if (!dl_se->dl_throttled || !dl_se->dl_runtime)
>  			return HRTIMER_NORESTART;
>  
> @@ -1674,6 +1679,12 @@ void dl_server_start(struct sched_dl_entity *dl_se)
>  
>  void dl_server_stop(struct sched_dl_entity *dl_se)
>  {
> +	if (current->dl_server == dl_se) {
> +		struct rq *rq = rq_of_dl_se(dl_se);
> +		trace_printk("stop fair server %d\n", cpu_of(rq));
> +		current->dl_server = NULL;
> +	}
> +
>  	if (!dl_se->dl_runtime)
>  		return;
>  
> @@ -1792,6 +1803,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
>  		rq_lock(rq, &rf);
>  	}
>  
> +	if (dl_se == &rq->fair_server)
> +		trace_printk("inactive fair server %d\n", cpu_of(rq));
> +
>  	sched_clock_tick();
>  	update_rq_clock(rq);
>  
> @@ -1987,6 +2001,12 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
>  static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
>  {
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +	struct rq *rq = rq_of_dl_se(dl_se);
> +
> +	if (dl_se == &rq->fair_server) {
> +		trace_printk("enqueue fair server %d h_nr_running %d\n",
> +			     cpu_of(rq), rq->cfs.h_nr_running);
> +	}
>  
>  	WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
>  
> @@ -1998,6 +2018,12 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
>  static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
>  {
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +	struct rq *rq = rq_of_dl_se(dl_se);
> +
> +	if (dl_se == &rq->fair_server) {
> +		trace_printk("dequeue fair server %d h_nr_running %d\n",
> +			     cpu_of(rq), rq->cfs.h_nr_running);
> +	}
>  
>  	if (RB_EMPTY_NODE(&dl_se->rb_node))
>  		return;
> @@ -2012,7 +2038,11 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
>  static void
>  enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
>  {
> -	WARN_ON_ONCE(on_dl_rq(dl_se));
> +	if (WARN_ON_ONCE(on_dl_rq(dl_se))) {
> +		tracing_off();
> +		ftrace_dump(DUMP_ALL);
> +		return;
> +	}
>  
>  	update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);