[PATCH 2/2] workqueue: defer wake_up_process() outside pool->lock on hot paths

From: Breno Leitao

Date: Tue May 26 2026 - 14:12:48 EST

Both __queue_work() (enqueue) and process_one_work() (per-work chain
kick on unbound/CPU_INTENSIVE pools) call kick_pool() while holding
pool->lock. kick_pool() ends in wake_up_process(), which takes the
target task's rq->lock. Holding pool->lock across that runqueue lock
acquisition lengthens the locked region on the two hottest paths of a
contended unbound workqueue.

Use the kick_pool_pick() helper added in the previous patch to select
the worker to wake while holding pool->lock, then call
wake_up_process() after pool->lock is released.

All state that requires pool->lock (worker selection, wake_cpu
adjustment, BH-pool fast path) is still done under the lock; only the
unrelated rq->lock acquisition is moved out.

Measured on a CONFIG_SMP arm64 VM (8 vCPUs) with the test_workqueue
benchmark (lib/test_workqueue.c) using a batched-submit mode (8
producer kthreads, 200000 work items each, WQ_UNBOUND). Averages of
five runs per scope:

  affinity_scope  baseline (items/s)  patched (items/s)  gain
  --------------  ------------------  -----------------  ----
  cpu                      1,419,973          1,413,896   -0.4 % (no contention)
  smt                      1,442,921          1,437,164   -0.4 % (no contention)
  cache_shard              1,184,058          1,279,184   +8.0 %
  cache                    1,167,603          1,271,341   +8.9 %
  numa                     1,163,617          1,285,427  +10.5 %
  system                   1,175,933          1,255,227   +6.7 %

Enqueue latency on the contended scopes also drops (p50 ~2875 -> ~2625
ns, p99 ~5000 -> ~4200 ns). The cpu/smt scopes use per-CPU pools with
no producer/consumer contention, so, as expected, they are essentially
unchanged (-0.4 %).

Signed-off-by: Breno Leitao <leitao@xxxxxxxxxx>
---
kernel/workqueue.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b788d7c44ac0..1403a4b195a3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2301,6 +2301,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
{
struct pool_workqueue *pwq;
struct worker_pool *last_pool, *pool;
+ struct task_struct *wake_p = NULL;
unsigned int work_flags;
unsigned int req_cpu = cpu;

@@ -2415,7 +2416,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,

trace_workqueue_activate_work(work);
insert_work(pwq, work, &pool->worklist, work_flags);
- kick_pool(pool);
+ wake_p = kick_pool_pick(pool);
} else {
work_flags |= WORK_STRUCT_INACTIVE;
insert_work(pwq, work, &pwq->inactive_works, work_flags);
@@ -2423,6 +2424,15 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,

out:
raw_spin_unlock(&pool->lock);
+ /*
+ * Issue the wakeup after dropping pool->lock to shorten the
+ * locked region on this hot enqueue path. kick_pool_pick() did all
+ * of the work that required the lock (worker selection and
+ * wake_cpu setup); the wake_up_process() itself only needs to
+ * take the target rq->lock.
+ */
+ if (wake_p)
+ wake_up_process(wake_p);
rcu_read_unlock();
}

@@ -3243,6 +3253,7 @@ __acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
+ struct task_struct *wake_p;
unsigned long work_data;
int lockdep_start_depth, rcu_start_depth;
bool bh_draining = pool->flags & POOL_BH_DRAINING;
@@ -3296,8 +3307,11 @@ __acquires(&pool->lock)
* since nr_running would always be >= 1 at this point. This is used to
* chain execution of the pending work items for WORKER_NOT_RUNNING
* workers such as the UNBOUND and CPU_INTENSIVE ones.
+ *
+ * Select the worker to wake while holding pool->lock, but defer the
+ * actual wake_up_process() until after the lock is dropped below.
*/
- kick_pool(pool);
+ wake_p = kick_pool_pick(pool);

/*
* Record the last pool and clear PENDING which should be the last
@@ -3310,6 +3324,9 @@ __acquires(&pool->lock)
pwq->stats[PWQ_STAT_STARTED]++;
raw_spin_unlock_irq(&pool->lock);

+ if (wake_p)
+ wake_up_process(wake_p);
+
rcu_start_depth = rcu_preempt_depth();
lockdep_start_depth = lockdep_depth(current);
/* see drain_dead_softirq_workfn() */

--
2.51.0