[PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup

From: K Prateek Nayak
Date: Thu Oct 10 2024 - 04:47:26 EST


Since sched_delayed tasks remain on "rq->cfs_tasks" list even after
blocking, they can be migrated from one runqueue to another in a delayed
state by the load balancer. When they are eventually requeued or woken
up on the same CPU via the try_to_wake_up() path, the eventual
activation is clueless about the migration. This trips the PSI
accounting since, in terms of enqueue flags, PSI only considers the
following as a wakeup for PSI accounting:

(flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)

This can lead inconsistent PSI task state splat similar to:

psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4 # Without partial fixup from this patch
psi: inconsistent task state! task=... cpu=... psi_flags=0 clear=4 set=1 # With partial fixup from this patch

Tracking the PSI changes along with task's state revealed the following
series of events:

psi_task_switch: psi_flags=4 clear=4 set=1 # sched_delayed is set to 1
psi_dequeue: psi_flags=1 clear=1 set=0 # dequeued for migration
psi_enqueue: psi_flags=0 clear=0 set=4 # enqueued after migration
psi_enqueue: psi_flags=4 clear=1 set=4 # wakeup after migration
psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=1 set=4

Moving psi_enqueue() to after "p->sched_class->enqueue_task()" and
skipping enqueue until the delayed task is actually woken up (referred
to partial fixup previously) changes the above scenario to the
following:

psi_task_switch: psi_flags=4 clear=4 set=1 # sched_delayed is set to 1
psi_dequeue: psi_flags=1 clear=1 set=0 # dequeued for migration
psi_enqueue: psi_flags=0 clear=0 set=0 # enqueued after migration, sched delayed
psi_enqueue: psi_flags=0 clear=1 set=4 # wakeup of delayed task
psi: inconsistent task state! task=... cpu=... psi_flags=0 clear=1 set=4

psi_enqueue() tries to clear the TSK_IOWAIT since it believes the task
has not migrated due to the lack of ENQUEUE_MIGRATED flag in case of a
requeue or a full wakeup on "p->wake_cpu", but in-fact TSK_IOWAIT was
cleared during dequeue for migration and was never set again.

Define "DELAYED_MIGRATED" and set it in "p->migration_flags" when a
delayed task is migrated. This flag is consumed when the delayed entity
is finally woken up, and psi_enqueue() is notified of the migration.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@xxxxxx/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@xxxxxxxxxxxxx/
Link: https://lore.kernel.org/lkml/f82def74-a64a-4a05-c8d4-4eeb3e03d0c0@xxxxxxx/
Tested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/core.c | 19 ++++++++++++++++++-
kernel/sched/sched.h | 1 +
kernel/sched/stats.h | 10 ++++++++++
3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52be38021ebb..1a353fa69a54 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2009,12 +2009,19 @@ unsigned long get_wchan(struct task_struct *p)

void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED);
+
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);

if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
- psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
+
+ /* Notify PSI that the task was migrated in a delayed state before wakeup. */
+ if ((p->migration_flags & DELAYED_MIGRATED) && !task_on_rq_migrating(p)) {
+ wakee_not_migrated = false;
+ p->migration_flags &= ~DELAYED_MIGRATED;
+ }
}

p->sched_class->enqueue_task(rq, p, flags);
@@ -2023,6 +2030,8 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
* ->sched_delayed.
*/
uclamp_rq_inc(rq, p);
+ if (!(flags & ENQUEUE_RESTORE))
+ psi_enqueue(p, wakee_not_migrated);

if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2042,6 +2051,14 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
+
+ /*
+ * Indicate that a sched_delayed task was migrated.
+ * enqueue_task() needs this for correct accounting
+ * when the delayed task eventually wakes up.
+ */
+ if (p->se.sched_delayed && task_on_rq_migrating(p))
+ p->migration_flags |= DELAYED_MIGRATED;
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..2dc2c4cb4f5f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1326,6 +1326,7 @@ static inline int cpu_of(struct rq *rq)
}

#define MDF_PUSH 0x01
+#define DELAYED_MIGRATED 0x02 /* Task was migrated when in DELAYED_DEQUEUE state */

static inline bool is_migration_disabled(struct task_struct *p)
{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..06a2c6d3ec1e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;

+ /*
+ * Delayed task is not ready to run yet!
+ * Wait for a requeue before accounting.
+ */
+ if (p->se.sched_delayed)
+ return;
+
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;

@@ -148,6 +155,9 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
if (static_branch_likely(&psi_disabled))
return;

+ /* Delayed task can only be dequeued for migration. */
+ WARN_ON_ONCE(p->se.sched_delayed && sleep);
+
/*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
--
2.34.1