Re: ~90s shutdown delay with v6.19 and PREEMPT_RT

From: Bert Karwatzki

Date: Tue Feb 24 2026 - 07:59:20 EST


Am Dienstag, dem 24.02.2026 um 13:44 +0100 schrieb Bert Karwatzki:
> I've done some more monitoring with this debug patch which monitors
> cgroup_task_dead() and the function which calls it finish task switch.
> To avoid too many messages some printk()s are filtered by command name
> (a previous patch showed systemd to be the problematic process):
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 5f0d33b04910..7bb6931a4d86 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6990,25 +6990,32 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
> {
> struct css_set *cset;
> unsigned long flags;
> + printk(KERN_INFO "%s 0: task = %px\n", __func__, tsk);
>
> spin_lock_irqsave(&css_set_lock, flags);
>
> + printk(KERN_INFO "%s 1: task = %px\n", __func__, tsk);
> WARN_ON_ONCE(list_empty(&tsk->cg_list));
> cset = task_css_set(tsk);
> + printk(KERN_INFO "%s 2: task = %px\n", __func__, tsk);
> css_set_move_task(tsk, cset, NULL, false);
> + printk(KERN_INFO "%s 3: task = %px\n", __func__, tsk);
> cset->nr_tasks--;
> /* matches the signal->live check in css_task_iter_advance() */
> if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
> list_add_tail(&tsk->cg_list, &cset->dying_tasks);
>
> + printk(KERN_INFO "%s 4: task = %px\n", __func__, tsk);
> if (dl_task(tsk))
> dec_dl_tasks_cs(tsk);
>
> + printk(KERN_INFO "%s 5: task = %px\n", __func__, tsk);
> WARN_ON_ONCE(cgroup_task_frozen(tsk));
> if (unlikely(!(tsk->flags & PF_KTHREAD) &&
> test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
> cgroup_update_frozen(task_dfl_cgroup(tsk));
>
> + printk(KERN_INFO "%s 6: task = %px\n", __func__, tsk);
> spin_unlock_irqrestore(&css_set_lock, flags);
> }
>
> @@ -7029,9 +7036,11 @@ static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
> {
> struct llist_node *lnode;
> struct task_struct *task, *next;
> + printk(KERN_INFO "%s:\n", __func__);
>
> lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
> llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
> + printk(KERN_INFO "%s: %px %s", __func__, task, task->comm);
> do_cgroup_task_dead(task);
> put_task_struct(task);
> }
> @@ -7050,6 +7059,7 @@ static void __init cgroup_rt_init(void)
>
> void cgroup_task_dead(struct task_struct *task)
> {
> + printk(KERN_INFO "%s: task = %px (%s)\n", __func__, task, task->comm);
> get_task_struct(task);
> llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
> irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
> @@ -7059,6 +7069,7 @@ static void __init cgroup_rt_init(void) {}
>
> void cgroup_task_dead(struct task_struct *task)
> {
> + printk(KERN_INFO "%s: task = %px (%s)\n", __func__, task, task->comm);
> do_cgroup_task_dead(task);
> }
> #endif /* CONFIG_PREEMPT_RT */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 854984967fe2..73e477d8cf1a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5078,6 +5078,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
> struct rq *rq = this_rq();
> struct mm_struct *mm = rq->prev_mm;
> unsigned int prev_state;
> + if (!strcmp(prev->comm, "systemd"))
> + printk(KERN_INFO "%s 0: %px\n", __func__, prev);
>
> /*
> * The previous task will have left us with a preempt_count of 2
> @@ -5144,15 +5146,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
> }
>
> if (unlikely(prev_state == TASK_DEAD)) {
> + printk(KERN_INFO "%s 1: %px (%s)\n", __func__, prev, prev->comm);
> if (prev->sched_class->task_dead)
> prev->sched_class->task_dead(prev);
>
> + printk(KERN_INFO "%s 2: %px (%s)\n", __func__, prev, prev->comm);
> /*
> * sched_ext_dead() must come before cgroup_task_dead() to
> * prevent cgroups from being removed while its member tasks are
> * visible to SCX schedulers.
> */
> sched_ext_dead(prev);
> + printk(KERN_INFO "%s 3: %px (%s)\n", __func__, prev, prev->comm);
> cgroup_task_dead(prev);
>
> /* Task is done with its stack. */
> @@ -5202,6 +5207,8 @@ static __always_inline struct rq *
> context_switch(struct rq *rq, struct task_struct *prev,
> struct task_struct *next, struct rq_flags *rf)
> {
> + if (!strcmp(prev->comm, "systemd"))
> + printk(KERN_INFO "%s 0: %px\n", __func__, prev);
> prepare_task_switch(rq, prev, next);
>
> /*
>
>
> I also tried monitoring the schedule_tail(), which is one of the two function calling
> finish_task_switch(), but that did get rid of the delay.

This is incorrect. Monitoring schedule_tail() does not get rid of the delayed shutdown;
the delay probably just has only ~95% reproducibility.

>
> The result from this is:
> 2026-02-24T13:13:21.739185+01:00 localhost 12,32039,34364889,-,caller=T2955;reboot
> [...]
> Here the delay section begins (all message here have comm == "systemd"):
> 2026-02-24T13:13:38.124013+01:00 localhost 6,45456,50748843,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.124234+01:00 localhost 6,45457,50748851,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.271984+01:00 localhost 6,45458,50896237,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.272255+01:00 localhost 6,45459,50896244,-,caller=T52;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.272556+01:00 localhost 6,45460,50896287,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.272759+01:00 localhost 6,45461,50896289,-,caller=T51;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.273225+01:00 localhost 6,45462,50896303,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.273406+01:00 localhost 6,45463,50896305,-,caller=T51;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.273819+01:00 localhost 6,45464,50896481,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.274025+01:00 localhost 6,45465,50896484,-,caller=T0;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:13:38.274465+01:00 localhost 6,45466,50896801,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.274635+01:00 localhost 6,45467,50896804,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.275034+01:00 localhost 6,45468,50897328,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.275254+01:00 localhost 6,45469,50897331,-,caller=T2077;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.275680+01:00 localhost 6,45470,50897600,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.275886+01:00 localhost 6,45471,50897602,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.276300+01:00 localhost 6,45472,50897936,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.276487+01:00 localhost 6,45473,50897939,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.276907+01:00 localhost 6,45474,50898225,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:13:38.279144+01:00 localhost 6,45475,50898228,-,caller=T68;finish_task_switch 0: ffff97fd40884300
>
> [...] 1505 similar lines removed
>
> 2026-02-24T13:14:54.000427+01:00 localhost 6,46981,126614226,-,caller=T44;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.000967+01:00 localhost 6,46982,126614420,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.001060+01:00 localhost 6,46983,126614422,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.001562+01:00 localhost 6,46984,126614462,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.001666+01:00 localhost 6,46985,126614463,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.002182+01:00 localhost 6,46986,126614533,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.002296+01:00 localhost 6,46987,126614534,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.002811+01:00 localhost 6,46988,126615219,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.002910+01:00 localhost 6,46989,126615221,-,caller=T42;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.003402+01:00 localhost 6,46990,126616376,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.003510+01:00 localhost 6,46991,126616378,-,caller=T0;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.004034+01:00 localhost 6,46992,126621884,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.004125+01:00 localhost 6,46993,126621887,-,caller=T0;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.004645+01:00 localhost 6,46994,126625311,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.004734+01:00 localhost 6,46995,126625314,-,caller=T235;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.013405+01:00 localhost 6,46996,126637601,-,caller=T1;context_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.013614+01:00 localhost 6,46997,126637603,-,caller=T0;finish_task_switch 0: ffff97fd40884300
> 2026-02-24T13:14:54.014165+01:00 localhost 6,46998,126638229,-,caller=T1573;context_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.014266+01:00 localhost 6,46999,126638232,-,caller=T44;finish_task_switch 0: ffff97fd46050000
> 2026-02-24T13:14:54.014774+01:00 localhost 6,47000,126638469,-,caller=T1573;context_switch 0: ffff97fd46050000 END
>
> So there's something strange going on here with the scheduler.
>
>
> Bert Karwatzki