Re: [PATCH 06/15] sched_ext: Convert deferred_reenq_locals from llist to regular list
From: Emil Tsalapatis
Date: Mon Mar 09 2026 - 13:21:44 EST
On Mon Mar 9, 2026 at 1:12 PM EDT, Emil Tsalapatis wrote:
> On Fri Mar 6, 2026 at 2:06 PM EST, Tejun Heo wrote:
>> The deferred reenqueue local mechanism uses an llist (lockless list) for
>> collecting schedulers that need their local DSQs re-enqueued. Convert to a
>> regular list protected by a raw_spinlock.
>>
>> The llist was used for its lockless properties, but the upcoming changes to
>> support remote reenqueue require more complex list operations that are
>> difficult to implement correctly with lockless data structures. A spinlock-
>> protected regular list provides the necessary flexibility.
>>
>> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
>
>
> Reviewed-by: Emil Tsalapatis <emil@xxxxxxxxxxxxxxx>
>
>> ---
>> kernel/sched/ext.c | 57 ++++++++++++++++++++++++-------------
>> kernel/sched/ext_internal.h | 2 +-
>> kernel/sched/sched.h | 3 +-
>> 3 files changed, 41 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
>> index 1b6cd1e4f8b9..ffccaf04e34d 100644
>> --- a/kernel/sched/ext.c
>> +++ b/kernel/sched/ext.c
>> @@ -3640,23 +3640,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
>> return nr_enqueued;
>> }
>>
>> -static void run_deferred(struct rq *rq)
>> +static void process_deferred_reenq_locals(struct rq *rq)
>> {
>> - process_ddsp_deferred_locals(rq);
>> -
>> - if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
>> - struct llist_node *llist =
>> - llist_del_all(&rq->scx.deferred_reenq_locals);
>> - struct scx_sched_pcpu *pos, *next;
>> + lockdep_assert_rq_held(rq);
>>
>> - llist_for_each_entry_safe(pos, next, llist,
>> - deferred_reenq_locals_node) {
>> - init_llist_node(&pos->deferred_reenq_locals_node);
>> - reenq_local(pos->sch, rq);
>> + while (true) {
>> + struct scx_sched *sch;
>> +
>> + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
>> + struct scx_sched_pcpu *sch_pcpu =
>> + list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
>> + struct scx_sched_pcpu,
>> + deferred_reenq_local_node);
>> + if (!sch_pcpu)
>> + return;
>> +
>> + sch = sch_pcpu->sch;
>
> While neither sch nor sch_pcpu is used in this patch, they are useful
> for subsequent patches.
>
This comment was meant for the next patch in the series, sorry about
that. The review tag still applies.
>> + list_del_init(&sch_pcpu->deferred_reenq_local_node);
>> }
>> +
>> + reenq_local(sch, rq);
>> }
>> }
>>
>> +static void run_deferred(struct rq *rq)
>> +{
>> + process_ddsp_deferred_locals(rq);
>> +
>> + if (!list_empty(&rq->scx.deferred_reenq_locals))
>> + process_deferred_reenq_locals(rq);
>> +}
>> +
>> #ifdef CONFIG_NO_HZ_FULL
>> bool scx_can_stop_tick(struct rq *rq)
>> {
>> @@ -4180,13 +4194,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>>
>> /*
>> * $sch would have entered bypass mode before the RCU grace period. As
>> - * that blocks new deferrals, all deferred_reenq_locals_node's must be
>> + * that blocks new deferrals, all deferred_reenq_local_node's must be
>> * off-list by now.
>> */
>> for_each_possible_cpu(cpu) {
>> struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
>>
>> - WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
>> + WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
>> }
>>
>> free_percpu(sch->pcpu);
>> @@ -5799,7 +5813,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
>> struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
>>
>> pcpu->sch = sch;
>> - init_llist_node(&pcpu->deferred_reenq_locals_node);
>> + INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
>> }
>>
>> sch->helper = kthread_run_worker(0, "sched_ext_helper");
>> @@ -7126,7 +7140,8 @@ void __init init_sched_ext_class(void)
>> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
>> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
>> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
>> - init_llist_head(&rq->scx.deferred_reenq_locals);
>> + raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
>> + INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
>> rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
>> rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
>>
>> @@ -8358,7 +8373,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
>> unsigned long flags;
>> struct scx_sched *sch;
>> struct rq *rq;
>> - struct llist_node *lnode;
>>
>> raw_local_irq_save(flags);
>>
>> @@ -8374,9 +8388,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
>> goto out_irq_restore;
>>
>> rq = this_rq();
>> - lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
>> - if (!llist_on_list(lnode))
>> - llist_add(lnode, &rq->scx.deferred_reenq_locals);
>> + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
>> + struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
>> +
>> + if (list_empty(&pcpu->deferred_reenq_local_node))
>> + list_move_tail(&pcpu->deferred_reenq_local_node,
>> + &rq->scx.deferred_reenq_locals);
>> + }
>> +
>> schedule_deferred(rq);
>> out_irq_restore:
>> raw_local_irq_restore(flags);
>> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
>> index 9e5ebd00ea0c..80d40a9c5ad9 100644
>> --- a/kernel/sched/ext_internal.h
>> +++ b/kernel/sched/ext_internal.h
>> @@ -965,7 +965,7 @@ struct scx_sched_pcpu {
>> */
>> struct scx_event_stats event_stats;
>>
>> - struct llist_node deferred_reenq_locals_node;
>> + struct list_head deferred_reenq_local_node;
>> struct scx_dispatch_q bypass_dsq;
>> #ifdef CONFIG_EXT_SUB_SCHED
>> u32 bypass_host_seq;
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index ebe971d12cb8..0794852524e7 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -808,7 +808,8 @@ struct scx_rq {
>>
>> struct task_struct *sub_dispatch_prev;
>>
>> - struct llist_head deferred_reenq_locals;
>> + raw_spinlock_t deferred_reenq_lock;
>> + struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
>> struct balance_callback deferred_bal_cb;
>> struct irq_work deferred_irq_work;
>> struct irq_work kick_cpus_irq_work;