linux-next: manual merge of the sched-ext tree with the tip tree

From: Mark Brown

Date: Mon Mar 09 2026 - 14:55:29 EST

Hi all,

Today's linux-next merge of the sched-ext tree got a conflict in:

kernel/sched/ext.c

between commit:

c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree and commit:

cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")

from the sched-ext tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non-trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging. You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

I do note there's another system_unbound_wq usage there which for some
reason wasn't updated...

diff --cc kernel/sched/ext.c
index 7278d57496478,d6d8073370130..0000000000000
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -2766,8 -3205,11 +3205,11 @@@ static void scx_watchdog_workfn(struct

cond_resched();
}
- queue_delayed_work(system_dfl_wq, to_delayed_work(work),
- READ_ONCE(scx_watchdog_timeout) / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
++ queue_delayed_work(system_dfl_wq, to_delayed_work(work),
+ intv);
}

void scx_tick(struct rq *rq)
@@@ -4282,9 -5218,247 +5218,247 @@@ static void free_kick_syncs(void
}
}

- static void scx_disable_workfn(struct kthread_work *work)
+ static void refresh_watchdog(void)
+ {
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /* take the shortest timeout and use its half for watchdog interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
- mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+ }
+
+ static s32 scx_link_sched(struct scx_sched *sch)
+ {
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ #ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *parent = scx_parent(sch);
+ s32 ret;
+
+ if (parent) {
+ ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+ &sch->hash_node, scx_sched_hash_params);
+ if (ret) {
+ scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+ return ret;
+ }
+
+ list_add_tail(&sch->sibling, &parent->children);
+ }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ }
+
+ refresh_watchdog();
+ return 0;
+ }
+
+ static void scx_unlink_sched(struct scx_sched *sch)
+ {
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ #ifdef CONFIG_EXT_SUB_SCHED
+ if (scx_parent(sch)) {
+ rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+ scx_sched_hash_params);
+ list_del_init(&sch->sibling);
+ }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+ list_del_rcu(&sch->all);
+ }
+
+ refresh_watchdog();
+ }
+
+ #ifdef CONFIG_EXT_SUB_SCHED
+ static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+ static void drain_descendants(struct scx_sched *sch)
+ {
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+ }
+
+ static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+ {
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ rcu_assign_pointer(p->scx.sched, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+ }
+
+ static void scx_sub_disable(struct scx_sched *sch)
+ {
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
+ disable_bypass_dsp(sch);
+
+ scx_unlink_sched(sch);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ if (sch->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+ kobject_del(&sch->kobj);
+ }
+ #else /* CONFIG_EXT_SUB_SCHED */
+ static void drain_descendants(struct scx_sched *sch) { }
+ static void scx_sub_disable(struct scx_sched *sch) { }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+
+ static void scx_root_disable(struct scx_sched *sch)
{
- struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;

Attachment: signature.asc
Description: PGP signature