[tip: sched/core] sched_ext: Auto-register/unregister dl_server reservations

From: tip-bot2 for Andrea Righi

Date: Fri May 29 2026 - 06:47:02 EST

The following commit has been merged into the sched/core branch of tip:

Commit-ID: e7b63427fdb4977621d69085a97272c8856644fe
Gitweb: https://git.kernel.org/tip/e7b63427fdb4977621d69085a97272c8856644fe
Author: Andrea Righi <arighi@xxxxxxxxxx>
AuthorDate: Tue, 26 May 2026 18:42:48 +02:00
Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Fri, 29 May 2026 12:43:15 +02:00

sched_ext: Auto-register/unregister dl_server reservations

Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks")
introduced an ext_server deadline server to protect sched_ext tasks from
fair/RT starvation, mirroring the existing fair_server.

Currently, both servers reserve their 50ms/1000ms bandwidth at boot,
regardless of whether a BPF scheduler is loaded. Unused bandwidth is
still reclaimed at runtime by other classes, but the static reservation
prevents the RT class from implicitly using that headroom when one of
the two classes is guaranteed to be empty.

A sysadmin can work around this by writing
/sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that
requires manual action and not all systems expose debugfs.

A better approach is to make server bandwidth reservations dynamic: only
the scheduling policy that is currently active should register its
reservation, while the inactive one should not artificially hold
capacity (keeping both reservations only when the BPF scheduler is
running in partial mode):

+---------------------------------------------+-------------+------------+
| BPF scheduler state                         | fair server | ext server |
+---------------------------------------------+-------------+------------+
| not loaded (default boot)                   | reserved    | none       |
| loaded full mode (!SCX_OPS_SWITCH_PARTIAL)  | none        | reserved   |
| loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved    | reserved   |
+---------------------------------------------+-------------+------------+

To achieve this, introduce an "attached/detached" state for each
deadline server, so the kernel can decide whether a server's bandwidth
should be accounted in global bandwidth tracking.

At boot, the system starts with only the fair server contributing to
bandwidth accounting. When a BPF scheduler is enabled, the ext server is
attached and may replace or complement the fair server depending on
whether full or partial mode is used. When sched_ext is disabled, the
system restores the previous deadline bandwidth values and behavior.

The transition logic ensures that switching between scheduling modes is
consistent and reversible, without losing runtime configuration or
requiring manual intervention.

Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Acked-by: Juri Lelli <juri.lelli@xxxxxxxxxx>
Link: https://patch.msgid.link/20260526164420.638711-2-arighi@xxxxxxxxxx
---
include/linux/sched.h | 6 +-
kernel/sched/deadline.c | 204 +++++++++++++++++++++++++++++++++++++--
kernel/sched/ext.c | 71 ++++++++++++++-
kernel/sched/sched.h | 4 +-
4 files changed, 278 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index da6a090..8130d13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -702,6 +702,11 @@ struct sched_dl_entity {
* running, skipping the defer phase.
*
* @dl_defer_idle tracks idle state
+ *
+ * @dl_bw_attached tells if this server's bandwidth currently
+ * contributes to the root domain's total_bw. Only meaningful for server
+ * entities (@dl_server == 1). Allows toggling the reservation on/off
+ * without losing the configured @dl_runtime/@dl_period.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
@@ -713,6 +718,7 @@ struct sched_dl_entity {
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
unsigned int dl_defer_idle : 1;
+ unsigned int dl_bw_attached : 1;

/*
* Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b60e2df..f9e62ed 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1797,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se)
struct rq *rq = dl_se->rq;

dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime ||
+ !dl_se->dl_bw_attached)
return;

/*
@@ -1872,6 +1873,13 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+ /*
+ * No BPF scheduler is loaded at boot, so the ext_server has no
+ * tasks to protect. Detach its bandwidth reservation, it will
+ * be attached when a BPF scheduler is loaded.
+ */
+ dl_server_detach_bw(dl_se);
#endif
}
}
@@ -1882,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
int cpu = cpu_of(rq);
struct dl_bw *dl_b;

+ if (!dl_se->dl_bw_attached)
+ return;
+
dl_b = dl_bw_of(cpu_of(rq));
guard(raw_spinlock)(&dl_b->lock);

@@ -1893,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)

int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
{
- u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 old_bw = (init || !dl_se->dl_bw_attached) ? 0 :
+ to_ratio(dl_se->dl_period, dl_se->dl_runtime);
u64 new_bw = to_ratio(period, runtime);
struct rq *rq = dl_se->rq;
int cpu = cpu_of(rq);
@@ -1913,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
if (init) {
__add_rq_bw(new_bw, &rq->dl);
__dl_add(dl_b, new_bw, cpus);
- } else {
+ dl_se->dl_bw_attached = 1;
+ } else if (dl_se->dl_bw_attached) {
__dl_sub(dl_b, dl_se->dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);

@@ -1934,6 +1947,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
}

/*
+ * Add @dl_se's bw to the root-domain accounting.
+ *
+ * Return -EBUSY if attaching would overflow root domain capacity.
+ */
+static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se,
+ struct dl_bw *dl_b, int cpus)
+{
+ struct rq *rq = dl_se->rq;
+ unsigned long cap;
+
+ /*
+ * On success, always update @rq->dl.this_bw, but only update
+ * @dl_b->total_bw (after the overflow check it gates) while this CPU
+ * is active, mirroring dl_server_add_bw() during root-domain rebuilds,
+ * which only publishes bandwidth from active CPUs into @dl_b.
+ * On -EBUSY nothing is modified; @dl_se->dl_bw_attached is untouched.
+ */
+ if (cpu_active(cpu_of(rq))) {
+ cap = dl_bw_capacity(cpu_of(rq));
+ if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw))
+ return -EBUSY;
+ __dl_add(dl_b, dl_se->dl_bw, cpus);
+ }
+ __add_rq_bw(dl_se->dl_bw, &rq->dl);
+ dl_se->dl_bw_attached = 1;
+
+ return 0;
+}
+
+/*
+ * Stop @dl_se if active; drop its bw from the rq and, if CPU active, @dl_b.
+ */
+static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se,
+ struct dl_bw *dl_b, int cpus)
+{
+ struct rq *rq = dl_se->rq;
+
+ /*
+ * If the server is still active (on_rq), dequeue it via
+ * dl_server_stop(); task_non_contending() will either subtract
+ * @dl_bw from running_bw immediately (0-lag passed) or set
+ * dl_non_contending and arm the inactive_timer.
+ */
+ if (dl_se->dl_server_active)
+ dl_server_stop(dl_se);
+
+ /*
+ * Drop @dl_se's contribution from this rq's bandwidth accounting,
+ * mirroring the __add_rq_bw() done at attach time.
+ */
+ dl_rq_change_utilization(rq, dl_se, 0);
+
+ /*
+ * Update @dl_b only while this CPU is active, matching
+ * dl_server_add_bw() during root-domain rebuilds.
+ *
+ * If this CPU is inactive, its bandwidth is not currently accounted in
+ * @dl_b->total_bw: either attach skipped adding it, or a rebuild
+ * already dropped it while re-publishing active CPUs only.
+ *
+ * In that case there is nothing to subtract from @dl_b. Just clear
+ * @dl_se->dl_bw_attached; if the CPU becomes active again, the next
+ * rebuild will re-publish its bandwidth.
+ */
+ if (cpu_active(cpu_of(rq)))
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ dl_se->dl_bw_attached = 0;
+}
+
+/*
+ * Attach @dl_se's bandwidth to the root domain's total_bw accounting.
+ *
+ * Use to dynamically register a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. No-op (returns 0) if
+ * @dl_se is already attached; on success also kicks dl_server_start() when
+ * the CPU is online.
+ * Returns -EBUSY if attaching would overflow the root domain capacity.
+ */
+int dl_server_attach_bw(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ int cpus, ret;
+
+ if (dl_se->dl_bw_attached)
+ return 0;
+
+ scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) {
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+ ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus);
+ }
+ if (ret)
+ return ret;
+
+ /*
+ * The natural 0->nr_running transition that triggers dl_server_start()
+ * may have happened while @dl_se was still detached (e.g., between
+ * scx_bypass(false) and the scx_enable() re-balance loop), so kick a
+ * start here.
+ *
+ * dl_server_start() bails out cleanly if there's nothing to schedule or
+ * it's already active. Skip if @cpu is offline; the server will be
+ * started naturally on the first enqueue once @cpu comes back.
+ */
+ if (cpu_online(cpu))
+ dl_server_start(dl_se);
+
+ return 0;
+}
+
+/*
+ * Detach @dl_se's bandwidth from the root domain's total_bw accounting.
+ *
+ * Use to dynamically unregister a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. Stops @dl_se first if
+ * it is active. No-op if @dl_se is not currently attached.
+ */
+void dl_server_detach_bw(struct sched_dl_entity *dl_se)
+{
+ int cpu = cpu_of(dl_se->rq);
+ struct dl_bw *dl_b;
+ int cpus;
+
+ if (!dl_se->dl_bw_attached)
+ return;
+
+ dl_b = dl_bw_of(cpu);
+ guard(raw_spinlock)(&dl_b->lock);
+ cpus = dl_bw_cpus(cpu);
+ __dl_server_detach_bw_locked(dl_se, dl_b, cpus);
+}
+
+/*
+ * Atomically detach @detach_se and attach @attach_se on the same rq, holding
+ * @dl_b->lock across both operations so a concurrent sched_setattr() cannot
+ * steal the bandwidth freed by the detach before the attach can claim it.
+ * Each half is skipped if its entity is already in the target state.
+ *
+ * Both entities must live on the same rq (same root domain). Returns 0, or
+ * -EBUSY if attaching would overflow capacity (both end up detached).
+ */
+int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+ struct sched_dl_entity *attach_se)
+{
+ struct rq *rq = detach_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ int cpus, ret;
+
+ WARN_ON_ONCE(attach_se->rq != rq);
+
+ scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) {
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+
+ if (detach_se->dl_bw_attached)
+ __dl_server_detach_bw_locked(detach_se, dl_b, cpus);
+
+ if (attach_se->dl_bw_attached)
+ ret = 0;
+ else
+ ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus);
+ }
+ if (ret)
+ return ret;
+
+ if (cpu_online(cpu))
+ dl_server_start(attach_se);
+
+ return 0;
+}
+
+/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
*/
@@ -3233,12 +3421,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu)
struct sched_dl_entity *dl_se;

dl_se = &cpu_rq(cpu)->fair_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));

#ifdef CONFIG_SCHED_CLASS_EXT
dl_se = &cpu_rq(cpu)->ext_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#endif
}
@@ -3247,11 +3435,13 @@ static u64 dl_server_read_bw(int cpu)
{
u64 dl_bw = 0;

- if (cpu_rq(cpu)->fair_server.dl_server)
+ if (cpu_rq(cpu)->fair_server.dl_server &&
+ cpu_rq(cpu)->fair_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->fair_server.dl_bw;

#ifdef CONFIG_SCHED_CLASS_EXT
- if (cpu_rq(cpu)->ext_server.dl_server)
+ if (cpu_rq(cpu)->ext_server.dl_server &&
+ cpu_rq(cpu)->ext_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
#endif

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 345aa11..f412c4b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5814,6 +5814,7 @@ static void scx_root_disable(struct scx_sched *sch)
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
+ bool was_switched_all;
int cpu;

/* guarantee forward progress and wait for descendants to be disabled */
@@ -5840,6 +5841,8 @@ static void scx_root_disable(struct scx_sched *sch)
*/
mutex_lock(&scx_enable_mutex);

+ was_switched_all = scx_switched_all();
+
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);

@@ -5889,10 +5892,34 @@ static void scx_root_disable(struct scx_sched *sch)
/*
* Invalidate all the rq clocks to prevent getting outdated
* rq clocks from a previous scx scheduler.
+ *
+ * Also re-balance the dl_server bandwidth reservations: detach
+ * ext_server (no more sched_ext tasks) and reinstate fair_server if it
+ * was previously detached because we were running in full mode.
+ *
+ * Unlike the enable path, this runs on a recovery path that cannot
+ * fail, so we use dl_server_swap_bw() to atomically free ext_server's
+ * bandwidth and reclaim it for fair_server under the same dl_b lock.
+ *
+ * The swap can still fail with -EBUSY if someone bumped ext_server's
+ * runtime via debugfs between enable and disable; in that narrow case
+ * both servers end up detached and we just WARN.
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+
scx_rq_clock_invalidate(rq);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ if (was_switched_all) {
+ if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server,
+ &rq->fair_server)))
+ pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
+ } else {
+ dl_server_detach_bw(&rq->ext_server);
+ }
+ }
}

/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -6811,6 +6838,31 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_disable;

/*
+ * Attach the ext_server bandwidth reservation before anything is
+ * committed so that we can fail the enable if the root domain cannot
+ * accommodate it. The matching fair_server detach is deferred to the
+ * tail of this function, after the switch is fully committed and can no
+ * longer fail.
+ *
+ * On failure, err_disable funnels into scx_root_disable() which
+ * detaches ext_server, so partially-attached state is cleaned up
+ * automatically.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ update_rq_clock(rq);
+ ret = dl_server_attach_bw(&rq->ext_server);
+ }
+ if (ret) {
+ pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n",
+ cpu, ret);
+ goto err_disable;
+ }
+ }
+
+ /*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
* scheduling) may not function correctly before all tasks are switched.
@@ -6926,6 +6978,25 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
static_branch_enable(&__scx_switched_all);

+ /*
+ * Detach the fair_server bandwidth reservation now that the switch
+ * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no
+ * task will ever run in the fair class, so give that bandwidth
+ * back to the RT class. The matching ext_server attach already
+ * happened earlier; this only releases bandwidth and cannot fail.
+ *
+ * In partial mode keep fair_server attached.
+ */
+ if (scx_switched_all()) {
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ dl_server_detach_bw(&rq->fair_server);
+ }
+ }
+
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
sch->ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(&sch->kobj, KOBJ_ADD);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6b48bb3..332ecf8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
+extern int dl_server_attach_bw(struct sched_dl_entity *dl_se);
+extern void dl_server_detach_bw(struct sched_dl_entity *dl_se);
+extern int dl_server_swap_bw(struct sched_dl_entity *detach_se,
+ struct sched_dl_entity *attach_se);

static inline bool dl_server_active(struct sched_dl_entity *dl_se)
{