Re: [PATCH] sched,workqueue: Use READ_ONCE()/WRITE_ONCE() for wake_cpu accesses

From: Tejun Heo

Date: Mon Mar 30 2026 - 16:26:56 EST


On Mon, Mar 30, 2026 at 10:09:46AM -1000, Tejun Heo wrote:
> Hello, Peter.
>
> On Mon, Mar 30, 2026 at 04:36:29PM +0200, Peter Zijlstra wrote:
> > > Hmmm... it would trigger KCSAN. Maybe a better way to do it is exposing
> > > sched interface that does WRITE_ONCE wrapping? How do you want it resolved?
> >
> > Perhaps a try_to_wake_up() variant that takes a cpumask (see the
> > completely untested code below). But I'm not entirely sure. The current
> > thing very much relies on a bunch of implementation details that aren't
> > guaranteed.
> >
> > The below will 'fake' the task cpumask for a while; and note that the
> > caller has to be careful to provide a mask that only includes tasks that
> > were set in the current task, otherwise 'funny' things will happen.
> > Also, it will only use this mask if it ends up doing CPU selection at
> > all.
> >
> > Ooh, I suppose I should make sure wake_cpu is inside the provided map
> > too...
>
> Does it need to be? I think all select_task_rq() implementations ignore it
> if it's outside allowed. I think all we'd need is cpumask_subset() test to
> reject (and trigger WARN_ON_ONCE()) if the preferred mask is not within the
> allowed.

So, something like the following. Build tested only. If this looks okay,
I'll test and write up proper patches. Thanks.

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5d3dbc9cdf..79ec09c31920 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1988,6 +1988,8 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
+extern int wake_up_process_prefer(struct task_struct *tsk,
+ const struct cpumask *mask);
extern void wake_up_new_task(struct task_struct *tsk);

extern void kick_process(struct task_struct *tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 496dff740dca..28cf5be768c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4057,6 +4057,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
+ * @wake_mask: preferred cpumask for CPU selection, or NULL
*
* Conceptually does:
*
@@ -4086,10 +4087,16 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* As a consequence we race really badly with just about everything. See the
* many memory barriers and their comments for details.
*
+ * If @wake_mask is non-NULL, it temporarily overrides p->cpus_ptr during CPU
+ * selection so that the scheduler picks a CPU within @wake_mask. Must be a
+ * subset of p->cpus_ptr. Ignored if the task has migration disabled, or if
+ * @wake_mask is not a subset of p->cpus_ptr (the latter also triggers a
+ * one-time warning).
+ *
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ const struct cpumask *wake_mask)
{
guard(preempt)();
int cpu, success = 0;
@@ -4128,6 +4135,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* in set_current_state() that the waiting thread does.
*/
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ const struct cpumask *prev_mask = NULL;
+
smp_mb__after_spinlock();
if (!ttwu_state_match(p, state, &success))
break;
@@ -4227,7 +4236,15 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);

+ if (wake_mask && !is_migration_disabled(p) &&
+ !WARN_ON_ONCE(!cpumask_subset(wake_mask, p->cpus_ptr))) {
+ prev_mask = p->cpus_ptr;
+ p->cpus_ptr = wake_mask;
+ }
cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (prev_mask)
+ p->cpus_ptr = prev_mask;
+
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
@@ -4371,13 +4388,31 @@ struct task_struct *cpu_curr_snapshot(int cpu)
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_NORMAL, 0);
+ return try_to_wake_up(p, TASK_NORMAL, 0, NULL);
}
EXPORT_SYMBOL(wake_up_process);

+/**
+ * wake_up_process_prefer - Wake up a process with a preferred CPU mask
+ * @p: The process to be woken up.
+ * @mask: Preferred cpumask for CPU selection. Must be a subset of the
+ * task's allowed cpumask.
+ *
+ * Like wake_up_process(), but temporarily constrains CPU selection to @mask.
+ * This is a best-effort hint - the scheduler may still select a CPU outside
+ * @mask if the task has migration disabled or @mask is not a subset of the
+ * task's allowed cpumask.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ */
+int wake_up_process_prefer(struct task_struct *p, const struct cpumask *mask)
+{
+ return try_to_wake_up(p, TASK_NORMAL, 0, mask);
+}
+EXPORT_SYMBOL(wake_up_process_prefer);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, 0, NULL);
}

/*
@@ -7247,7 +7282,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
void *key)
{
WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
- return try_to_wake_up(curr->private, mode, wake_flags);
+ return try_to_wake_up(curr->private, mode, wake_flags, NULL);
}
EXPORT_SYMBOL(default_wake_function);

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..4c9d475da072 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3760,7 +3760,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);

-extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags,
+ const struct cpumask *wake_mask);

#ifdef CONFIG_PREEMPT_DYNAMIC
extern int preempt_dynamic_mode;
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 0fef6496c4c8..3eb2002fc847 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -27,7 +27,7 @@ void swake_up_locked(struct swait_queue_head *q, int wake_flags)
return;

curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags, NULL);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b77119d71641..f8e60e4fe432 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1269,30 +1269,21 @@ static bool kick_pool(struct worker_pool *pool)
#ifdef CONFIG_SMP
/*
* Idle @worker is about to execute @work and waking up provides an
- * opportunity to migrate @worker at a lower cost by setting the task's
- * wake_cpu field. Let's see if we want to move @worker to improve
- * execution locality.
+ * opportunity to migrate @worker at a lower cost by providing a
+ * preferred cpumask for the wakeup. Let's see if we want to move
+ * @worker to improve execution locality.
*
- * We're waking the worker that went idle the latest and there's some
- * chance that @worker is marked idle but hasn't gone off CPU yet. If
- * so, setting the wake_cpu won't do anything. As this is a best-effort
- * optimization and the race window is narrow, let's leave as-is for
- * now. If this becomes pronounced, we can skip over workers which are
- * still on cpu when picking an idle worker.
- *
- * If @pool has non-strict affinity, @worker might have ended up outside
- * its affinity scope. Repatriate.
+ * If @pool has non-strict affinity, @worker might have ended up
+ * outside its affinity scope. Repatriate by waking it within the
+ * pool's pod cpumask.
*/
if (!pool->attrs->affn_strict &&
!cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
struct work_struct *work = list_first_entry(&pool->worklist,
struct work_struct, entry);
- int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
- cpu_online_mask);
- if (wake_cpu < nr_cpu_ids) {
- p->wake_cpu = wake_cpu;
- get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
- }
+ wake_up_process_prefer(p, pool->attrs->__pod_cpumask);
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ return true;
}
#endif
wake_up_process(p);