[RFC][PATCH 2/2] sched: proxy-exec: Add allow/prevent_migration hooks in the sched classes for proxy_tag_curr

From: John Stultz

Date: Wed Mar 04 2026 - 01:40:25 EST


Currently proxy_tag_curr() calls task dequeue and task enqueue
functions (on lock owners we want to run on behalf of a waiting
donor) in order to force the owner task we are going to run
to be removed from any sched_class pushable lists. This avoids
crashes that could happen if the task being run ended up being
migrated to another cpu.

The dequeue/enqueue pair is sort of an ugly hack though, so
replace it with more focused prevent_migration and
allow_migration function pointers: the former prevents the
owner task from being migrated, and the latter makes it
migratable again once the blocked donor has finished running
it on its behalf.

This patch was inspired by discussion around a similar RFC
patch by zhidao su <suzhidao@xxxxxxxxxx>, which highlighted
the inefficiency of the dequeue/enqueue pair:
https://lore.kernel.org/lkml/20260303115718.278608-1-soolaugust@xxxxxxxxx/

Reported-by: zhidao su <suzhidao@xxxxxxxxxx>
Closes: https://lore.kernel.org/lkml/20260303115718.278608-1-soolaugust@xxxxxxxxx/
Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
---
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Joel Fernandes <joelagnelf@xxxxxxxxxx>
Cc: Qais Yousef <qyousef@xxxxxxxxxxx>
Cc: Xuewen Yan <xuewen.yan94@xxxxxxxxx>
Cc: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Cc: Suleiman Souhlal <suleiman@xxxxxxxxxx>
Cc: kuyo chang <kuyo.chang@xxxxxxxxxxxx>
Cc: hupu <hupu.gm@xxxxxxxxx>
Cc: zhidao su <suzhidao@xxxxxxxxxx>
Cc: soolaugust@xxxxxxxxx
Cc: kernel-team@xxxxxxxxxxx
---
kernel/sched/core.c | 18 +++++++++++++-----
kernel/sched/deadline.c | 34 ++++++++++++++++++++++++++--------
kernel/sched/rt.c | 28 +++++++++++++++++++++++-----
kernel/sched/sched.h | 3 +++
4 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55bafb1585eca..174a3177a3a6b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6712,11 +6712,19 @@ static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
* However, the chosen/donor task *and* the mutex owner form an
* atomic pair wrt push/pull.
*
- * Make sure owner we run is not pushable. Unfortunately we can
- * only deal with that by means of a dequeue/enqueue cycle. :-/
+ * Make sure owner we run is not pushable.
*/
- dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
- enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+ if (owner->sched_class->prevent_migration)
+ owner->sched_class->prevent_migration(rq, owner);
+}
+
+static inline void proxy_untag_prev(struct rq *rq, struct task_struct *prev)
+{
+ if (!sched_proxy_exec())
+ return;
+
+ if (prev->sched_class->allow_migration)
+ prev->sched_class->allow_migration(rq, prev);
}

/*
@@ -6874,7 +6882,7 @@ static void __sched notrace __schedule(int sched_mode)
if (!task_current_donor(rq, next))
proxy_tag_curr(rq, next);
if (!(!preempt && prev_state) && prev != prev_donor)
- proxy_tag_curr(rq, prev);
+ proxy_untag_prev(rq, prev);

/*
* The membarrier system call requires each architecture
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d08b004293234..6bd6f0682e6c6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2289,6 +2289,28 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
task_non_contending(dl_se, true);
}

+static inline void __allow_migration_dl(struct rq *rq, struct task_struct *p)
+{
+ if (dl_server(&p->dl))
+ return;
+
+ if (task_is_blocked(p))
+ return;
+
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void allow_migration_dl(struct rq *rq, struct task_struct *p)
+{
+ __allow_migration_dl(rq, p);
+}
+
+static void prevent_migration_dl(struct rq *rq, struct task_struct *p)
+{
+ dequeue_pushable_dl_task(rq, p);
+}
+
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
if (is_dl_boosted(&p->dl)) {
@@ -2339,14 +2361,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)

enqueue_dl_entity(&p->dl, flags);

- if (dl_server(&p->dl))
- return;
-
- if (task_is_blocked(p))
- return;
-
- if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
- enqueue_pushable_dl_task(rq, p);
+ __allow_migration_dl(rq, p);
}

static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -3408,6 +3423,9 @@ DEFINE_SCHED_CLASS(dl) = {
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,

+ .allow_migration = allow_migration_dl,
+ .prevent_migration = prevent_migration_dl,
+
.wakeup_preempt = wakeup_preempt_dl,

.pick_task = pick_task_dl,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d9238..90f1c62e1f827 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1424,6 +1424,25 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
enqueue_top_rt_rq(&rq->rt);
}

+static void __allow_migration_rt(struct rq *rq, struct task_struct *p)
+{
+ if (task_is_blocked(p))
+ return;
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+}
+
+static void allow_migration_rt(struct rq *rq, struct task_struct *p)
+{
+ __allow_migration_rt(rq, p);
+}
+
+static void prevent_migration_rt(struct rq *rq, struct task_struct *p)
+{
+ dequeue_pushable_task(rq, p);
+}
+
/*
* Adding/removing a task to/from a priority array:
*/
@@ -1440,11 +1459,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)

enqueue_rt_entity(rt_se, flags);

- if (task_is_blocked(p))
- return;
-
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_task(rq, p);
+ __allow_migration_rt(rq, p);
}

static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -2583,6 +2598,9 @@ DEFINE_SCHED_CLASS(rt) = {
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,

+ .allow_migration = allow_migration_rt,
+ .prevent_migration = prevent_migration_rt,
+
.wakeup_preempt = wakeup_preempt_rt,

.pick_task = pick_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca4..5c3eb8b28ebd3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2575,6 +2575,9 @@ struct sched_class {
*/
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

+ void (*allow_migration)(struct rq *rq, struct task_struct *p);
+ void (*prevent_migration)(struct rq *rq, struct task_struct *p);
+
/*
* ttwu_do_activate: rq->lock
* wake_up_new_task: task_rq_lock
--
2.53.0.473.g4a7958ca14-goog