[PATCH 07/10] sched_ext: Save/restore kf_tasks[] when task ops nest
From: Andrea Righi
Date: Wed May 06 2026 - 13:48:51 EST
SCX_CALL_OP_TASK*() stored the subject task in current->scx.kf_tasks[]
and assumed ops would not nest. A BPF ops.running() callback can call
kfuncs (e.g. scx_bpf_dsq_insert) that enqueue work and trigger
enqueue_task_scx() -> ops.runnable(), which used SCX_CALL_OP_TASK again,
overwriting kf_tasks[0] and then clearing it on return. This left the
outer running context wrong and led to NULL function dispatches from BPF
helpers.
Save and restore both kf_tasks[] slots in every variant around each
invocation so nested task-based ops preserve the outer op's context.
Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
kernel/sched/ext.c | 41 +++++++++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b6d29087ec0e8..1ac885eadfa8e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -567,37 +567,50 @@ static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid)
* pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
* So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
*
- * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
- * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
- * while a previous one is still in progress.
+ * Task-based SCX ops may nest (e.g. ops.running() calling a kfunc that ends up
+ * in enqueue_task_scx() -> ops.runnable()). Save and restore kf_tasks[] around
+ * each invocation so the outer op's context is restored for kfuncs and for
+ * further nested calls. Single-task ops save/restore both slots and clear
+ * kf_tasks[1] while active so a nested call under SCX_CALL_OP_2TASKS_RET does
+ * not leave the outer pair's second task authenticated for kfuncs.
*/
#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \
do { \
- WARN_ON_ONCE(current->scx.kf_tasks[0]); \
+ struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
+ struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
+ \
current->scx.kf_tasks[0] = task; \
+ current->scx.kf_tasks[1] = NULL; \
SCX_CALL_OP((sch), op, locked_rq, task, ##args); \
- current->scx.kf_tasks[0] = NULL; \
+ current->scx.kf_tasks[0] = __scx_kf0_sv; \
+ current->scx.kf_tasks[1] = __scx_kf1_sv; \
} while (0)
#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \
({ \
__typeof__((sch)->ops.op(task, ##args)) __ret; \
- WARN_ON_ONCE(current->scx.kf_tasks[0]); \
+ struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
+ struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
+ \
current->scx.kf_tasks[0] = task; \
+ current->scx.kf_tasks[1] = NULL; \
__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \
- current->scx.kf_tasks[0] = NULL; \
+ current->scx.kf_tasks[0] = __scx_kf0_sv; \
+ current->scx.kf_tasks[1] = __scx_kf1_sv; \
__ret; \
})
#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \
({ \
__typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
- WARN_ON_ONCE(current->scx.kf_tasks[0]); \
+ struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
+ struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
+ \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
- current->scx.kf_tasks[0] = NULL; \
- current->scx.kf_tasks[1] = NULL; \
+ current->scx.kf_tasks[0] = __scx_kf0_sv; \
+ current->scx.kf_tasks[1] = __scx_kf1_sv; \
__ret; \
})
@@ -616,8 +629,11 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
struct task_struct *task,
const struct cpumask *cpumask)
{
- WARN_ON_ONCE(current->scx.kf_tasks[0]);
+ struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0];
+ struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1];
+
current->scx.kf_tasks[0] = task;
+ current->scx.kf_tasks[1] = NULL;
if (rq)
update_locked_rq(rq);
@@ -633,7 +649,8 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
if (rq)
update_locked_rq(NULL);
- current->scx.kf_tasks[0] = NULL;
+ current->scx.kf_tasks[0] = __scx_kf0_sv;
+ current->scx.kf_tasks[1] = __scx_kf1_sv;
}
/* see SCX_CALL_OP_TASK() */
--
2.54.0