[PATCH sched_ext/for-7.0-fixes] sched_ext: Drop rq lock before calling ops.exit_task()

From: Andrea Righi

Date: Wed Mar 11 2026 - 19:15:05 EST


sched_ext_dead() calls scx_exit_task() — which invokes ops.exit_task() —
while holding the rq lock. If the BPF program calls helpers that acquire
non-raw locks (e.g., bpf_task_storage_delete()), this can trigger the
following BUG:

=============================
[ BUG: Invalid wait context ]
7.0.0-rc1-virtme #1 Not tainted
-----------------------------
(udev-worker)/115 is trying to lock:
ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
other info that might help us debug this:
context-{5:5}
3 locks held by (udev-worker)/115:
#0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
#1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
#2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
stack backtrace:
...
Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
Call Trace:
<TASK>
__lock_acquire+0xf86/0x1de0
lock_acquire+0xcf/0x310
_raw_spin_lock_irqsave+0x39/0x60
spin_lock_irqsave_ssp_contention+0x54/0x90
srcu_gp_start_if_needed+0x2a7/0x490
bpf_selem_unlink+0x24b/0x590
bpf_task_storage_delete+0x3a/0x90
bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
bpf__sched_ext_ops_exit_task+0x4b/0xa7

Fix this by extending scx_exit_task() to take optional rq and rq_flags
pointers. When they are provided, temporarily drop the rq lock before
invoking ops.exit_task() and re-acquire it afterwards. When they are
NULL, call ops.exit_task() with the rq lock held as before.

After dropping the rq lock around ops.exit_task(), interrupts are
enabled, so an interrupt can potentially run and call
enqueue_task_scx(), which uses SCX_KF_ENQUEUE; scx_kf_allow() would
treat this as invalid nesting because the interrupted context still has
SCX_KF_REST set (from ops.exit_task()). This nesting should be
legitimate when the inner call is from an interrupt handler, so skip the
nesting check when in_interrupt() is true.

Fixes: 7900aa699c34 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()")
Cc: stable@xxxxxxxxxxxxxxx # v6.19+
Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
kernel/sched/ext.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1594987d637b0..37415713b7c0b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -275,9 +275,10 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
static __always_inline void scx_kf_allow(u32 mask)
{
/* nesting is allowed only in increasing scx_kf_mask order */
- WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
- "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
- current->scx.kf_mask, mask);
+ if (!in_interrupt())
+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
+ current->scx.kf_mask, mask);
current->scx.kf_mask |= mask;
barrier();
}
@@ -2968,7 +2969,8 @@ static void scx_disable_task(struct task_struct *p)
scx_set_task_state(p, SCX_TASK_READY);
}

-static void scx_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p, struct rq **rq,
+ struct rq_flags *rf)
{
struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
@@ -2993,9 +2995,17 @@ static void scx_exit_task(struct task_struct *p)
return;
}

- if (SCX_HAS_OP(sch, exit_task))
- SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
- p, &args);
+ if (SCX_HAS_OP(sch, exit_task)) {
+ if (rq && rf) {
+ task_rq_unlock(*rq, p, rf);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, NULL, p, &args);
+ *rq = task_rq_lock(p, rf);
+ } else {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
+ }
+ }
+
scx_set_task_state(p, SCX_TASK_NONE);
}

@@ -3068,7 +3078,7 @@ void scx_cancel_fork(struct task_struct *p)

rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_exit_task(p);
+ scx_exit_task(p, &rq, &rf);
task_rq_unlock(rq, p, &rf);
}

@@ -3127,7 +3137,7 @@ void sched_ext_dead(struct task_struct *p)
struct rq *rq;

rq = task_rq_lock(p, &rf);
- scx_exit_task(p);
+ scx_exit_task(p, &rq, &rf);
task_rq_unlock(rq, p, &rf);
}
}
@@ -4359,7 +4369,7 @@ static void scx_disable_workfn(struct kthread_work *work)
p->sched_class = new_class;
}

- scx_exit_task(p);
+ scx_exit_task(p, NULL, NULL);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
--
2.53.0