[PATCH v27 01/10] sched: Rework pick_next_task() and prev_balance() to avoid stale prev references

From: John Stultz

Date: Sat Apr 04 2026 - 01:37:15 EST


Historically, the prev value in __schedule() was rq->curr. This
prev value is passed down through numerous functions and used by
the sched class implementations. Because prev remained on_cpu
until the end of __schedule(), it was stable across the rq lock
drops that the class ->pick_next_task() and ->balance()
implementations often do.
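
For illustration, a minimal sketch (balance_xyz is a placeholder
name, not actual kernel code) of what a class ->balance()
implementation could previously rely on:

  static int balance_xyz(struct rq *rq, struct task_struct *prev,
			 struct rq_flags *rf)
  {
	/* prev == rq->curr, which stays on_cpu until __schedule() ends */
	rq_unlock(rq, rf);	/* e.g. to take two rq locks in order */
	rq_lock(rq, rf);
	/* prev is still valid and unchanged here */
	return 0;
  }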

However, with proxy-exec, the prev passed to functions called
by __schedule() is rq->donor, which may not be the same as
rq->curr and may not be on_cpu. This makes the prev value
potentially unstable across rq lock drops.

A recently found issue with proxy-exec is that when we begin
doing return migration from try_to_wake_up(), it's possible we
may be waking up the rq->donor. When we do this, we call
proxy_resched_idle(), which uses put_prev_set_next() to set
rq->donor to rq->idle, allowing the old donor task to be return
migrated and run.

This, however, runs into trouble, as on another CPU we might be
in the middle of calling __schedule(). Conceptually the rq lock
is held for the majority of that time, but in calling
pick_next_task() it's possible the class ->pick_next_task()
handler or the ->balance() call may briefly drop the rq lock.
This opens a window for try_to_wake_up() to wake and return
migrate the rq->donor before the class logic reacquires the rq
lock.
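
To make the window concrete, an illustrative interleaving (call
paths simplified):

  CPU0: __schedule()                 CPU1: try_to_wake_up()
  ------------------                 ----------------------
  rq_lock(rq)
  prev = rq->donor
  ->balance() drops rq lock
                                     rq_lock(rq)
                                     proxy_resched_idle()
                                       /* rq->donor = rq->idle */
                                     return migrate old donor
                                     rq_unlock(rq)
  ->balance() retakes rq lock
  /* prev != rq->donor: stale */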

Unfortunately, pick_next_task() and prev_balance() take a prev
argument, to which we pass rq->donor. This prev value can now
become stale and incorrect across an rq lock drop.

So, to correct this, rework the pick_next_task() and
prev_balance() calls so that they do not take a "prev" argument.

Also rework the class ->pick_next_task() and ->balance()
implementations to drop the prev argument. In the cases where it
was used, have the class functions reference rq->donor directly,
without saving the value across rq lock drops, so that we don't
end up with stale references.
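
The resulting pattern in the class implementations (see the
balance_dl()/balance_rt() hunks below; balance_xyz is again just
a placeholder) is, roughly sketched:

  static int balance_xyz(struct rq *rq, struct rq_flags *rf)
  {
	/*
	 * rq->donor may change during rq lock drops, so take a
	 * local snapshot and don't re-use it across lock drops.
	 */
	struct task_struct *p = rq->donor;

	/* ... class-specific balancing using p ... */
	return 0;
  }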

Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
---
Cc: Joel Fernandes <joelagnelf@xxxxxxxxxx>
Cc: Qais Yousef <qyousef@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Zimuzo Ezeozue <zezeozue@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Will Deacon <will@xxxxxxxxxx>
Cc: Waiman Long <longman@xxxxxxxxxx>
Cc: Boqun Feng <boqun.feng@xxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
Cc: Metin Kaya <Metin.Kaya@xxxxxxx>
Cc: Xuewen Yan <xuewen.yan94@xxxxxxxxx>
Cc: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
Cc: Suleiman Souhlal <suleiman@xxxxxxxxxx>
Cc: kuyo chang <kuyo.chang@xxxxxxxxxxxx>
Cc: hupu <hupu.gm@xxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
---
kernel/sched/core.c | 37 ++++++++++++++++++-------------------
kernel/sched/deadline.c | 8 +++++++-
kernel/sched/fair.c | 9 +++++++--
kernel/sched/idle.c | 2 +-
kernel/sched/rt.c | 8 +++++++-
kernel/sched/sched.h | 10 ++++------
kernel/sched/stop_task.c | 2 +-
7 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c15c9865299e7..9c8a769a6d109 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5907,10 +5907,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}

-static void prev_balance(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
+static void prev_balance(struct rq *rq, struct rq_flags *rf)
{
- const struct sched_class *start_class = prev->sched_class;
+ const struct sched_class *start_class = rq->donor->sched_class;
const struct sched_class *class;

/*
@@ -5922,7 +5921,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
* a runnable task of @class priority or higher.
*/
for_active_class_range(class, start_class, &idle_sched_class) {
- if (class->balance && class->balance(rq, prev, rf))
+ if (class->balance && class->balance(rq, rf))
break;
}
}
@@ -5931,7 +5930,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
const struct sched_class *class;
@@ -5948,28 +5947,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
+ if (likely(!sched_class_above(rq->donor->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_queued)) {

- p = pick_next_task_fair(rq, prev, rf);
+ p = pick_next_task_fair(rq, rf);
if (unlikely(p == RETRY_TASK))
goto restart;

/* Assume the next prioritized class is idle_sched_class */
if (!p) {
p = pick_task_idle(rq, rf);
- put_prev_set_next_task(rq, prev, p);
+ put_prev_set_next_task(rq, rq->donor, p);
}

return p;
}

restart:
- prev_balance(rq, prev, rf);
+ prev_balance(rq, rf);

for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev, rf);
+ p = class->pick_next_task(rq, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
if (p)
@@ -5979,7 +5978,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (unlikely(p == RETRY_TASK))
goto restart;
if (p) {
- put_prev_set_next_task(rq, prev, p);
+ put_prev_set_next_task(rq, rq->donor, p);
return p;
}
}
@@ -6032,7 +6031,7 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f
static void queue_core_balance(struct rq *rq);

static struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
struct task_struct *next, *p, *max;
@@ -6045,7 +6044,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
bool need_sync;

if (!sched_core_enabled(rq))
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);

cpu = cpu_of(rq);

@@ -6058,7 +6057,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
rq->core_pick = NULL;
rq->core_dl_server = NULL;
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);
}

/*
@@ -6082,7 +6081,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
goto out_set_next;
}

- prev_balance(rq, prev, rf);
+ prev_balance(rq, rf);

smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
@@ -6264,7 +6263,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}

out_set_next:
- put_prev_set_next_task(rq, prev, next);
+ put_prev_set_next_task(rq, rq->donor, next);
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);

@@ -6487,10 +6486,10 @@ static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
static inline void sched_core_cpu_dying(unsigned int cpu) {}

static struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_next_task(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
- return __pick_next_task(rq, prev, rf);
+ return __pick_next_task(rq, rf);
}

#endif /* !CONFIG_SCHED_CORE */
@@ -7038,7 +7037,7 @@ static void __sched notrace __schedule(int sched_mode)

pick_again:
assert_balance_callbacks_empty(rq);
- next = pick_next_task(rq, rq->donor, &rf);
+ next = pick_next_task(rq, &rf);
rq->next_class = next->sched_class;
if (sched_proxy_exec()) {
struct task_struct *prev_donor = rq->donor;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27359a1e995f9..7352506208287 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2509,8 +2509,14 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}

-static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+static int balance_dl(struct rq *rq, struct rq_flags *rf)
{
+ /*
+ * Note, rq->donor may change during rq lock drops,
+ * so don't re-use p across lock drops
+ */
+ struct task_struct *p = rq->donor;
+
if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
/*
* This is OK, because current is on_cpu, which avoids it being
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 597ce5b718d26..4a6669c517dae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9153,14 +9153,19 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);

struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_next_task_fair(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
struct sched_entity *se;
- struct task_struct *p;
+ struct task_struct *p, *prev;
int new_tasks;

again:
+ /*
+ * Re-read rq->donor at the top as it may have
+ * changed across a rq lock drop
+ */
+ prev = rq->donor;
p = pick_task_fair(rq, rf);
if (!p)
goto idle;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a83be0c834ddb..ff39120d723a9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -462,7 +462,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int flags)
}

static int
-balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+balance_idle(struct rq *rq, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4e5f1957b91b1..3fd03a836731e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1596,8 +1596,14 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}

-static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+static int balance_rt(struct rq *rq, struct rq_flags *rf)
{
+ /*
+ * Note, rq->donor may change during rq lock drops,
+ * so don't re-use p across lock drops
+ */
+ struct task_struct *p = rq->donor;
+
if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
/*
* This is OK, because current is on_cpu, which avoids it being
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9594355a36811..8ee82b03a8a10 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2550,7 +2550,7 @@ struct sched_class {
/*
* schedule/pick_next_task/prev_balance: rq->lock
*/
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ int (*balance)(struct rq *rq, struct rq_flags *rf);

/*
* schedule/pick_next_task: rq->lock
@@ -2561,12 +2561,11 @@ struct sched_class {
*
* next = pick_task();
* if (next) {
- * put_prev_task(prev);
+ * put_prev_task(rq->donor);
* set_next_task_first(next);
* }
*/
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct rq_flags *rf);

/*
* sched_change:
@@ -2790,8 +2789,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_queued > 0;
}

-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);

#define SCA_CHECK 0x01
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index f95798baddebb..c909ca0d8c87c 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -16,7 +16,7 @@ select_task_rq_stop(struct task_struct *p, int cpu, int flags)
}

static int
-balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+balance_stop(struct rq *rq, struct rq_flags *rf)
{
return sched_stop_runnable(rq);
}
--
2.53.0.1213.gd9a14994de-goog