Re: [RFC][PATCH 2/2] sched: proxy-exec: Add allow/prevent_migration hooks in the sched classes for proxy_tag_curr

From: Peter Zijlstra

Date: Wed Mar 04 2026 - 08:21:28 EST


On Wed, Mar 04, 2026 at 06:38:10AM +0000, John Stultz wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 55bafb1585eca..174a3177a3a6b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6712,11 +6712,19 @@ static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
> * However, the chosen/donor task *and* the mutex owner form an
> * atomic pair wrt push/pull.
> *
> - * Make sure owner we run is not pushable. Unfortunately we can
> - * only deal with that by means of a dequeue/enqueue cycle. :-/
> + * Make sure owner we run is not pushable.
> */
> - dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
> - enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
> + if (owner->sched_class->prevent_migration)
> + owner->sched_class->prevent_migration(rq, owner);
> +}
> +
> +static inline void proxy_untag_prev(struct rq *rq, struct task_struct *prev)
> +{
> + if (!sched_proxy_exec())
> + return;
> +
> + if (prev->sched_class->allow_migration)
> + prev->sched_class->allow_migration(rq, prev);
> }
>
> /*
> @@ -6874,7 +6882,7 @@ static void __sched notrace __schedule(int sched_mode)
> if (!task_current_donor(rq, next))
> proxy_tag_curr(rq, next);
> if (!(!preempt && prev_state) && prev != prev_donor)
> - proxy_tag_curr(rq, prev);
> + proxy_untag_prev(rq, prev);
>
> /*
> * The membarrier system call requires each architecture

Yeah, not a fan in this form.

I really don't think we need new class callbacks for this. Especially not
ones named like this, which is quite terrible.

Note how migrate_disable() and migrate_enable() use ->set_cpus_allowed()
and are both very much about preventing and allowing migration.

Also note how set_next_task() / put_prev_task() already very much do
what you want; except they only work for the donor.

Further note that the only reason this proxy_tag_curr() thing lives
where it does is that it depends on the value of current. However, if
you do this, you no longer have that constraint, and then there is a much
saner place for all this.


So I think I prefer (ab)using the migrate_disable() infrastructure,
simply because it would avoid having to do an (indirect) class call
entirely -- but looking at how RT/DL handle this, I think there's bugs
there.

Specifically, something like pick_next_pushable_task() should never
return something that has ->migration_disabled set, it should continue
iterating the list until it finds one that hasn't.


Anyway, without having tested anything at all, how crazy would something
like this be?


---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b571e640372..79b606e5d7cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2413,6 +2413,10 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (likely(!p->migration_disabled))
return;

+ if ((p->migration_flags & MDF_PROXY) &&
+ p->migration_disabled == 1)
+ return;
+
if (p->cpus_ptr != &p->cpus_mask)
return;

@@ -6651,11 +6655,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
struct mutex *mutex;

/* Follow blocked_on chain. */
- for (p = donor; task_is_blocked(p); p = owner) {
- mutex = p->blocked_on;
- /* Something changed in the chain, so pick again */
- if (!mutex)
- return NULL;
+ for (p = donor; (mutex = p->blocked_on); p = owner) {
/*
* By taking mutex->wait_lock we hold off concurrent mutex_unlock()
* and ensure @owner sticks around.
@@ -6756,21 +6756,18 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
}
#endif /* SCHED_PROXY_EXEC */

-static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+static inline void set_proxy_task(struct task_struct *p)
{
- if (!sched_proxy_exec())
- return;
- /*
- * pick_next_task() calls set_next_task() on the chosen task
- * at some point, which ensures it is not push/pullable.
- * However, the chosen/donor task *and* the mutex owner form an
- * atomic pair wrt push/pull.
- *
- * Make sure owner we run is not pushable. Unfortunately we can
- * only deal with that by means of a dequeue/enqueue cycle. :-/
- */
- dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
- enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+ WARN_ON_ONCE(p->migration_flags & MDF_PROXY);
+ p->migration_flags |= MDF_PROXY;
+ p->migration_disabled++;
+}
+
+static inline void put_proxy_task(struct task_struct *p)
+{
+ WARN_ON_ONCE(!(p->migration_flags & MDF_PROXY));
+ p->migration_flags &= ~MDF_PROXY;
+ p->migration_disabled--;
}

/*
@@ -6900,14 +6897,22 @@ static void __sched notrace __schedule(int sched_mode)

pick_again:
next = pick_next_task(rq, rq->donor, &rf);
- rq_set_donor(rq, next);
rq->next_class = next->sched_class;
- if (unlikely(task_is_blocked(next))) {
- next = find_proxy_task(rq, next, &rf);
- if (!next)
- goto pick_again;
- if (next == rq->idle)
- goto keep_resched;
+ if (sched_proxy_exec()) {
+ if (prev != rq->donor)
+ put_proxy_task(prev);
+
+ rq_set_donor(rq, next);
+ if (next->blocked_on) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
+
+ if (next != rq->donor)
+ set_proxy_task(next);
}
picked:
clear_tsk_need_resched(prev);
@@ -6924,9 +6929,6 @@ static void __sched notrace __schedule(int sched_mode)
*/
RCU_INIT_POINTER(rq->curr, next);

- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6960,10 +6962,6 @@ static void __sched notrace __schedule(int sched_mode)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- /* In case next was already curr but just got blocked_donor */
- if (!task_current_donor(rq, next))
- proxy_tag_curr(rq, next);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq, NULL);
hrtick_schedule_exit(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fd36ae390520..8222e108be73 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1368,6 +1368,7 @@ static inline int cpu_of(struct rq *rq)
}

#define MDF_PUSH 0x01
+#define MDF_PROXY 0x02

static inline bool is_migration_disabled(struct task_struct *p)
{