Re: ~90s shutdown delay with v6.19 and PREEMPT_RT
From: Bert Karwatzki
Date: Tue Feb 24 2026 - 07:47:27 EST
I've done some more monitoring with this debug patch which monitors
cgroup_task_dead() and the function which calls it, finish_task_switch().
To avoid too many messages some printk()s are filtered by command name
(a previous patch showed systemd to be the problematic process):
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b04910..7bb6931a4d86 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6990,25 +6990,32 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
{
struct css_set *cset;
unsigned long flags;
+ printk(KERN_INFO "%s 0: task = %px\n", __func__, tsk);
spin_lock_irqsave(&css_set_lock, flags);
+ printk(KERN_INFO "%s 1: task = %px\n", __func__, tsk);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
+ printk(KERN_INFO "%s 2: task = %px\n", __func__, tsk);
css_set_move_task(tsk, cset, NULL, false);
+ printk(KERN_INFO "%s 3: task = %px\n", __func__, tsk);
cset->nr_tasks--;
/* matches the signal->live check in css_task_iter_advance() */
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ printk(KERN_INFO "%s 4: task = %px\n", __func__, tsk);
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
+ printk(KERN_INFO "%s 5: task = %px\n", __func__, tsk);
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
+ printk(KERN_INFO "%s 6: task = %px\n", __func__, tsk);
spin_unlock_irqrestore(&css_set_lock, flags);
}
@@ -7029,9 +7036,11 @@ static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
{
struct llist_node *lnode;
struct task_struct *task, *next;
+ printk(KERN_INFO "%s:\n", __func__);
lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ printk(KERN_INFO "%s: %px %s", __func__, task, task->comm);
do_cgroup_task_dead(task);
put_task_struct(task);
}
@@ -7050,6 +7059,7 @@ static void __init cgroup_rt_init(void)
void cgroup_task_dead(struct task_struct *task)
{
+ printk(KERN_INFO "%s: task = %px (%s)\n", __func__, task, task->comm);
get_task_struct(task);
llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
@@ -7059,6 +7069,7 @@ static void __init cgroup_rt_init(void) {}
void cgroup_task_dead(struct task_struct *task)
{
+ printk(KERN_INFO "%s: task = %px (%s)\n", __func__, task, task->comm);
do_cgroup_task_dead(task);
}
#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 854984967fe2..73e477d8cf1a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5078,6 +5078,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned int prev_state;
+ if (!strcmp(prev->comm, "systemd"))
+ printk(KERN_INFO "%s 0: %px\n", __func__, prev);
/*
* The previous task will have left us with a preempt_count of 2
@@ -5144,15 +5146,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
}
if (unlikely(prev_state == TASK_DEAD)) {
+ printk(KERN_INFO "%s 1: %px (%s)\n", __func__, prev, prev->comm);
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ printk(KERN_INFO "%s 2: %px (%s)\n", __func__, prev, prev->comm);
/*
* sched_ext_dead() must come before cgroup_task_dead() to
* prevent cgroups from being removed while its member tasks are
* visible to SCX schedulers.
*/
sched_ext_dead(prev);
+ printk(KERN_INFO "%s 3: %px (%s)\n", __func__, prev, prev->comm);
cgroup_task_dead(prev);
/* Task is done with its stack. */
@@ -5202,6 +5207,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
+ if (!strcmp(prev->comm, "systemd"))
+ printk(KERN_INFO "%s 0: %px\n", __func__, prev);
prepare_task_switch(rq, prev, next);
/*
I also tried monitoring schedule_tail(), which is one of the two functions calling
finish_task_switch(), but that did get rid of the delay.
The result from this is:
2026-02-24T13:13:21.739185+01:00 localhost 12,32039,34364889,-,caller=T2955;reboot
[...]
Here the delay section begins (all messages here have comm == "systemd"):
2026-02-24T13:13:38.124013+01:00 localhost 6,45456,50748843,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.124234+01:00 localhost 6,45457,50748851,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:13:38.271984+01:00 localhost 6,45458,50896237,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:13:38.272255+01:00 localhost 6,45459,50896244,-,caller=T52;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:13:38.272556+01:00 localhost 6,45460,50896287,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:13:38.272759+01:00 localhost 6,45461,50896289,-,caller=T51;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:13:38.273225+01:00 localhost 6,45462,50896303,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:13:38.273406+01:00 localhost 6,45463,50896305,-,caller=T51;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:13:38.273819+01:00 localhost 6,45464,50896481,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:13:38.274025+01:00 localhost 6,45465,50896484,-,caller=T0;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:13:38.274465+01:00 localhost 6,45466,50896801,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.274635+01:00 localhost 6,45467,50896804,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:13:38.275034+01:00 localhost 6,45468,50897328,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.275254+01:00 localhost 6,45469,50897331,-,caller=T2077;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:13:38.275680+01:00 localhost 6,45470,50897600,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.275886+01:00 localhost 6,45471,50897602,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:13:38.276300+01:00 localhost 6,45472,50897936,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.276487+01:00 localhost 6,45473,50897939,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:13:38.276907+01:00 localhost 6,45474,50898225,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:13:38.279144+01:00 localhost 6,45475,50898228,-,caller=T68;finish_task_switch 0: ffff97fd40884300
[...] 1505 similar lines removed
2026-02-24T13:14:54.000427+01:00 localhost 6,46981,126614226,-,caller=T44;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.000967+01:00 localhost 6,46982,126614420,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:14:54.001060+01:00 localhost 6,46983,126614422,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:14:54.001562+01:00 localhost 6,46984,126614462,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:14:54.001666+01:00 localhost 6,46985,126614463,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:14:54.002182+01:00 localhost 6,46986,126614533,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:14:54.002296+01:00 localhost 6,46987,126614534,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:14:54.002811+01:00 localhost 6,46988,126615219,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:14:54.002910+01:00 localhost 6,46989,126615221,-,caller=T42;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.003402+01:00 localhost 6,46990,126616376,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:14:54.003510+01:00 localhost 6,46991,126616378,-,caller=T0;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.004034+01:00 localhost 6,46992,126621884,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:14:54.004125+01:00 localhost 6,46993,126621887,-,caller=T0;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.004645+01:00 localhost 6,46994,126625311,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:14:54.004734+01:00 localhost 6,46995,126625314,-,caller=T235;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.013405+01:00 localhost 6,46996,126637601,-,caller=T1;context_switch 0: ffff97fd40884300
2026-02-24T13:14:54.013614+01:00 localhost 6,46997,126637603,-,caller=T0;finish_task_switch 0: ffff97fd40884300
2026-02-24T13:14:54.014165+01:00 localhost 6,46998,126638229,-,caller=T1573;context_switch 0: ffff97fd46050000
2026-02-24T13:14:54.014266+01:00 localhost 6,46999,126638232,-,caller=T44;finish_task_switch 0: ffff97fd46050000
2026-02-24T13:14:54.014774+01:00 localhost 6,47000,126638469,-,caller=T1573;context_switch 0: ffff97fd46050000 END
So there's something strange going on here with the scheduler.
Bert Karwatzki