[PATCH 08/10] sched_ext: Skip ops.runnable() when nested in SCX_CALL_OP_TASK
From: Andrea Righi
Date: Wed May 06 2026 - 13:49:48 EST
ops.running() can re-enter the enqueue path (enqueue_task_scx() ->
ops.runnable()) on the same current task, and the kf_tasks[] save/restore
alone is not sufficient for every BPF/kfunc combination, leading to NULL
dispatches and stack corruption.

Track SCX_CALL_OP_TASK() nesting in current->scx.kf_nest, which is
incremented by all SCX_CALL_OP_TASK*() macros, and skip the ops.runnable()
callback when the nesting depth is non-zero. The full enqueue path,
including ops.enqueue(), still runs; only the ops.runnable() hook is
skipped in this case.

Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
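To illustrate the re-entrance guard in isolation, here is a minimal
userspace sketch of the same pattern. Everything in it (nest_depth,
op_running(), op_runnable(), enqueue(), call_task_op()) is a hypothetical
stand-in, not the actual sched_ext code; the real mechanism uses
current->scx.kf_nest and the SCX_CALL_OP_TASK*() macros shown in the diff
below.

#include <stdio.h>

static unsigned int nest_depth;		/* stands in for current->scx.kf_nest */

static void op_runnable(void)
{
	printf("ops.runnable() invoked\n");
}

/* stands in for enqueue_task_scx(): skip the hook when nested */
static void enqueue(void)
{
	if (!nest_depth)
		op_runnable();
	else
		printf("ops.runnable() skipped, depth=%u\n", nest_depth);
}

static void op_running(void)
{
	/* a kfunc called from ops.running() may re-enter the enqueue path */
	enqueue();
}

/* stands in for SCX_CALL_OP_TASK(): bump the depth around the callback */
static void call_task_op(void (*op)(void))
{
	nest_depth++;
	op();
	nest_depth--;
}

int main(void)
{
	enqueue();			/* top level: ops.runnable() runs */
	call_task_op(op_running);	/* nested: ops.runnable() is skipped */
	return 0;
}

With the counter in place, the nested invocation above is suppressed while
the top-level one still runs, which is the behavior the hunk in
enqueue_task_scx() below implements.
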
include/linux/sched/ext.h | 7 +++++++
kernel/sched/ext.c | 9 ++++++++-
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 5096c05d7a978..8c04edf1bc91a 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -197,6 +197,13 @@ struct sched_ext_entity {
s32 holding_cpu;
s32 selected_cpu;
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
+ /*
+ * Nesting depth of SCX_CALL_OP_TASK() on this task as %current (e.g.
+ * during schedule() %current is still the previous task). Used to skip
+ * ops.runnable() when invoked from inside another task op such as
+ * ops.running() to avoid breaking BPF re-entrance guarantees.
+ */
+ u32 kf_nest;
struct list_head runnable_node; /* rq->scx.runnable_list */
unsigned long runnable_at;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1ac885eadfa8e..af9b10cd82c4a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -579,11 +579,13 @@ do { \
struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
\
+ current->scx.kf_nest++; \
current->scx.kf_tasks[0] = task; \
current->scx.kf_tasks[1] = NULL; \
SCX_CALL_OP((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = __scx_kf0_sv; \
current->scx.kf_tasks[1] = __scx_kf1_sv; \
+ current->scx.kf_nest--; \
} while (0)
#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \
@@ -592,11 +594,13 @@ do { \
struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
\
+ current->scx.kf_nest++; \
current->scx.kf_tasks[0] = task; \
current->scx.kf_tasks[1] = NULL; \
__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \
current->scx.kf_tasks[0] = __scx_kf0_sv; \
current->scx.kf_tasks[1] = __scx_kf1_sv; \
+ current->scx.kf_nest--; \
__ret; \
})
@@ -606,11 +610,13 @@ do { \
struct task_struct *__scx_kf0_sv = current->scx.kf_tasks[0]; \
struct task_struct *__scx_kf1_sv = current->scx.kf_tasks[1]; \
\
+ current->scx.kf_nest++; \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = __scx_kf0_sv; \
current->scx.kf_tasks[1] = __scx_kf1_sv; \
+ current->scx.kf_nest--; \
__ret; \
})
@@ -2067,7 +2073,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p) &&
+ !READ_ONCE(current->scx.kf_nest))
SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
--
2.54.0