Re: [PATCH 1/2] sched: proxy-exec: Close race causing workqueue work being delayed
From: K Prateek Nayak
Date: Wed Apr 29 2026 - 05:00:38 EST
Hello John,
On 4/29/2026 7:57 AM, John Stultz wrote:
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 8ec3b6d7d718b..6ea74aecc5fbd 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -586,6 +586,7 @@ struct sched_entity {
>> unsigned char sched_delayed;
>> unsigned char rel_deadline;
>> unsigned char custom_slice;
>> + unsigned char sched_proxy;
>> /* hole */
>
> I feel like this is so tied to the blocked_on value, I suspect it
> makes the most sense to have this flag be the low bit of that pointer?
>
> Sort of a blocked_on latch, to signal its really in effect?
>
> Plus it gets cleared automatically on set and clear, so it looks a
> little cleaner.
I had actually started looking into that last night but sleep eventually
got to me! I agree that it is much cleaner to do it alongside
p->blocked_on.
>
>
>> @@ -6535,8 +6536,13 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
>> * blocked on a mutex, and we want to keep it on the runqueue
>> * to be selectable for proxy-execution.
>> */
>> - if (!should_block)
>> + if (!should_block) {
>> + guard(raw_spinlock)(&p->blocked_lock);
>> + /* Stable against race */
>> + if (task_is_blocked(p))
>> + WRITE_ONCE(p->se.sched_proxy, 1);
>> return false;
>> + }
>
> So if we double check and find the task isn't blocked anymore, we
> probably shouldn't return early here, no?
>
> Let me take a stab at the bit flag approach and see how it goes.
In case you want to peek at my homework ;-)
(Lightly tested with test-ww_mutex and sched-messaging)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8ec3b6d7d718b..e28f5c05a1689 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1237,7 +1237,7 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif
- struct mutex *blocked_on; /* lock we're blocked on */
+ unsigned long blocked_on; /* lock we're blocked on */
raw_spinlock_t blocked_lock;
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
@@ -2188,12 +2188,20 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
* evaluated for return-migration before it is run. So if the task is
* blocked_on PROXY_WAKING, return migrate it before running it.
*/
-#define PROXY_WAKING ((struct mutex *)(-1L))
+#define PROXY_WAKING ((unsigned long)(-1L))
+
+#define BO_FLAG_PROXY BIT(0)
+#define BO_FLAGS GENMASK(0, 0)
+
+static inline struct mutex *__get_blocked_on_mutex(unsigned long m)
+{
+ return (struct mutex *)(m & ~BO_FLAGS);
+}
static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
{
lockdep_assert_held_once(&p->blocked_lock);
- return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
+ return p->blocked_on == PROXY_WAKING ? NULL : __get_blocked_on_mutex(p->blocked_on);
}
static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2208,8 +2216,11 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
* with a different mutex. Note, setting it to the same
* lock repeatedly is ok.
*/
- WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
- p->blocked_on = m;
+ WARN_ON_ONCE(p->blocked_on &&
+ __get_blocked_on_mutex(p->blocked_on) != m);
+
+ BUG_ON((unsigned long)m & BO_FLAGS);
+ p->blocked_on = (unsigned long)m & ~BO_FLAGS;
}
static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2221,8 +2232,10 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
* blocked_on relationships, but make sure we are not
* clearing the relationship with a different lock.
*/
- WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
- p->blocked_on = NULL;
+ WARN_ON_ONCE(m && p->blocked_on &&
+ __get_blocked_on_mutex(p->blocked_on) != m &&
+ p->blocked_on != PROXY_WAKING);
+ p->blocked_on = 0;
}
static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2249,7 +2262,12 @@ static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mu
* already set to waking, but make sure we are not changing
* the relationship with a different lock.
*/
- WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
+ WARN_ON_ONCE(m &&
+ __get_blocked_on_mutex(p->blocked_on) != m &&
+ p->blocked_on != PROXY_WAKING);
+
+ /* Always ensure PROXY_WAKING implies BO_FLAG_PROXY. */
+ BUILD_BUG_ON(!(PROXY_WAKING & BO_FLAG_PROXY));
p->blocked_on = PROXY_WAKING;
}
@@ -2259,6 +2277,20 @@ static inline void set_task_blocked_on_waking(struct task_struct *p, struct mute
__set_task_blocked_on_waking(p, m);
}
+static inline void set_task_blocked_on_proxy(struct task_struct *p)
+{
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+
+ if (!sched_proxy_exec())
+ return;
+
+ /* Don't add BO_FLAG_PROXY if blocked_on was already cleared */
+ if (!p->blocked_on)
+ return;
+
+ p->blocked_on |= BO_FLAG_PROXY;
+}
+
#else
static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 079802cb61002..936e8d6ee7a0b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2177,7 +2177,7 @@ __latent_entropy struct task_struct *copy_process(
lockdep_init_task(p);
- p->blocked_on = NULL; /* not blocked yet */
+ p->blocked_on = 0; /* not blocked yet */
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 49cd5d2171613..a2d61ec40a421 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6535,8 +6535,10 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
* blocked on a mutex, and we want to keep it on the runqueue
* to be selectable for proxy-execution.
*/
- if (!should_block)
+ if (!should_block) {
+ set_task_blocked_on_proxy(p);
return false;
+ }
p->sched_contributes_to_load =
(task_state & TASK_UNINTERRUPTIBLE) &&
@@ -6765,18 +6767,24 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
bool curr_in_chain = false;
int this_cpu = cpu_of(rq);
struct task_struct *p;
- struct mutex *mutex;
int owner_cpu;
/* Follow blocked_on chain. */
- for (p = donor; (mutex = p->blocked_on); p = owner) {
- /* if its PROXY_WAKING, do return migration or run if current */
- if (mutex == PROXY_WAKING) {
- if (task_current(rq, p)) {
- clear_task_blocked_on(p, PROXY_WAKING);
- return p;
+ for (p = donor; task_is_blocked(p); p = owner) {
+ struct mutex *mutex = __get_task_blocked_on(p);
+
+ if (!mutex) {
+ /* if its PROXY_WAKING, do return migration or run if current */
+ if (p->blocked_on == PROXY_WAKING) {
+ if (task_current(rq, p)) {
+ clear_task_blocked_on(p, (struct mutex *)PROXY_WAKING);
+ return p;
+ }
+ goto force_return;
}
- goto force_return;
+
+ /* Something changed in the blocked_on chain; Try again. */
+ return NULL;
}
/*
@@ -7026,13 +7034,15 @@ static void __sched notrace __schedule(int sched_mode)
}
} else if (!preempt && prev_state) {
/*
- * We pass task_is_blocked() as the should_block arg
+ * We pass prev->blocked_on as the should_block arg
* in order to keep mutex-blocked tasks on the runqueue
* for slection with proxy-exec (without proxy-exec
* task_is_blocked() will always be false).
+ *
+ * task_is_blocked() cannot be used directly since the
+ * task has not set BO_FLAG_PROXY yet.
*/
- try_to_block_task(rq, prev, &prev_state,
- !task_is_blocked(prev));
+ try_to_block_task(rq, prev, &prev_state, !prev->blocked_on);
switch_count = &prev->nvcsw;
}
@@ -7043,8 +7053,16 @@ static void __sched notrace __schedule(int sched_mode)
if (sched_proxy_exec()) {
struct task_struct *prev_donor = rq->donor;
+ /*
+ * A wakeup raced with block_task();
+ * Clear blocked_on before running the task
+ * again.
+ */
+ if (unlikely(!prev_state && prev->blocked_on))
+ clear_task_blocked_on(prev, NULL);
+
rq_set_donor(rq, next);
- if (unlikely(next->blocked_on)) {
+ if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next) {
zap_balance_callbacks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c95584191d58f..e4a0aeee9bfc0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2390,7 +2390,7 @@ static inline bool task_is_blocked(struct task_struct *p)
if (!sched_proxy_exec())
return false;
- return !!p->blocked_on;
+ return !!(p->blocked_on & BO_FLAG_PROXY);
}
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
---
There is probably a better way to do it by inspecting the flags only in
find_proxy_task() and keeping task_is_blocked() as is, but I wanted to see
if it can be done as a part of the task_is_blocked() helper since that
seemed cleaner in my head.
I'll let you fiddle with it.
--
Thanks and Regards,
Prateek