Re: [BUG almost bisected] Splat in dequeue_rt_stack() and build error

From: Paul E. McKenney
Date: Thu Oct 03 2024 - 15:12:19 EST


On Thu, Oct 03, 2024 at 08:50:37PM +0200, Peter Zijlstra wrote:
> On Thu, Oct 03, 2024 at 09:04:30AM -0700, Paul E. McKenney wrote:
> > On Thu, Oct 03, 2024 at 04:22:40PM +0200, Peter Zijlstra wrote:
> > > On Thu, Oct 03, 2024 at 05:45:47AM -0700, Paul E. McKenney wrote:
> > >
> > > > I ran 100*TREE03 for 18 hours each, and got 23 instances of *something*
> > > > happening (and I need to suppress stalls on the repeat). One of the
> > > > earlier bugs happened early, but sadly not this one.
> > >
> > > Damn, I don't have the number of CPU hours available you mention in your
> > > later email. I'll just bump the runs up to 20 minutes and see if
> > > something wants to go bang before I have to shut down the noise
> > > pollution for the day...
> >
> > Indeed, this was one reason I was soliciting debug patches. ;-)
>
> Sooo... I was contemplating if something like the below might perhaps
> help some. It's a bit of a mess (I'll try and clean up if/when it
> actually proves to work), but it compiles and survives a handful of 1m
> runs.

Thank you very much! I will give it a spin.

Unless you tell me otherwise, I will allow the current test to complete
(about 12 hours from now), collect any data from it, then start this one.

> I'll try and give it more runs tomorrow when I can power up the big
> machines again -- unless you've already told me it's crap by then :-)

18-hour runs here, so even if I immediately kill the old run and start the
new one, I won't know until 6AM Pacific Time on Friday at the earliest. ;-)

Thanx, Paul

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 43e453ab7e20..1fe850788195 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7010,20 +7010,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
> }
> EXPORT_SYMBOL(default_wake_function);
>
> -void __setscheduler_prio(struct task_struct *p, int prio)
> +const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
> {
> if (dl_prio(prio))
> - p->sched_class = &dl_sched_class;
> - else if (rt_prio(prio))
> - p->sched_class = &rt_sched_class;
> + return &dl_sched_class;
> +
> + if (rt_prio(prio))
> + return &rt_sched_class;
> +
> #ifdef CONFIG_SCHED_CLASS_EXT
> - else if (task_should_scx(p))
> - p->sched_class = &ext_sched_class;
> + if (task_should_scx(p))
> + return &ext_sched_class;
> #endif
> - else
> - p->sched_class = &fair_sched_class;
>
> - p->prio = prio;
> + return &fair_sched_class;
> }
>
> #ifdef CONFIG_RT_MUTEXES
> @@ -7069,7 +7069,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> {
> int prio, oldprio, queued, running, queue_flag =
> DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
> - const struct sched_class *prev_class;
> + const struct sched_class *prev_class, *next_class;
> struct rq_flags rf;
> struct rq *rq;
>
> @@ -7127,6 +7127,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> queue_flag &= ~DEQUEUE_MOVE;
>
> prev_class = p->sched_class;
> + next_class = __setscheduler_class(p, prio);
> +
> + if (prev_class != next_class && p->se.sched_delayed)
> + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +
> queued = task_on_rq_queued(p);
> running = task_current(rq, p);
> if (queued)
> @@ -7164,7 +7169,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> p->rt.timeout = 0;
> }
>
> - __setscheduler_prio(p, prio);
> + p->sched_class = next_class;
> + p->prio = prio;
> +
> check_class_changing(rq, p, prev_class);
>
> if (queued)
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ab497fafa7be..c157d4860a3b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13177,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> detach_task_cfs_rq(p);
> - /*
> - * Since this is called after changing class, this is a little weird
> - * and we cannot use DEQUEUE_DELAYED.
> - */
> - if (p->se.sched_delayed) {
> - /* First, dequeue it from its new class' structures */
> - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
> - /*
> - * Now, clean up the fair_sched_class side of things
> - * related to sched_delayed being true and that wasn't done
> - * due to the generic dequeue not using DEQUEUE_DELAYED.
> - */
> - finish_delayed_dequeue_entity(&p->se);
> - p->se.rel_deadline = 0;
> - __block_task(rq, p);
> - }
> }
>
> static void switched_to_fair(struct rq *rq, struct task_struct *p)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b1c3588a8f00..fba524c81c63 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3797,7 +3797,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
>
> extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
> extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
> -extern void __setscheduler_prio(struct task_struct *p, int prio);
> +extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
> extern void set_load_weight(struct task_struct *p, bool update_load);
> extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
> extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index aa70beee9895..0470bcc3d204 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p,
> {
> int oldpolicy = -1, policy = attr->sched_policy;
> int retval, oldprio, newprio, queued, running;
> - const struct sched_class *prev_class;
> + const struct sched_class *prev_class, *next_class;
> struct balance_callback *head;
> struct rq_flags rf;
> int reset_on_fork;
> @@ -706,6 +706,12 @@ int __sched_setscheduler(struct task_struct *p,
> queue_flags &= ~DEQUEUE_MOVE;
> }
>
> + prev_class = p->sched_class;
> + next_class = __setscheduler_class(p, newprio);
> +
> + if (prev_class != next_class && p->se.sched_delayed)
> + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +
> queued = task_on_rq_queued(p);
> running = task_current(rq, p);
> if (queued)
> @@ -713,11 +719,10 @@ int __sched_setscheduler(struct task_struct *p,
> if (running)
> put_prev_task(rq, p);
>
> - prev_class = p->sched_class;
> -
> if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
> __setscheduler_params(p, attr);
> - __setscheduler_prio(p, newprio);
> + p->sched_class = next_class;
> + p->prio = newprio;
> }
> __setscheduler_uclamp(p, attr);
> check_class_changing(rq, p, prev_class);