[PATCH 1/2] sched_ext: Auto-register/unregister dl_server reservations

From: Andrea Righi

Date: Thu May 21 2026 - 14:38:37 EST


Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks")
introduced an ext_server deadline server to protect sched_ext tasks from
fair/RT starvation, mirroring the existing fair_server.

Currently, both servers reserve their 50ms/1000ms bandwidth at boot,
regardless of whether a BPF scheduler is loaded. Unused bandwidth is
still reclaimed at runtime by other classes, but the static reservation
prevents the RT class from implicitly using that headroom when one of
the two classes is guaranteed to be empty.

A sysadmin can work around this by writing
/sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that
requires manual action and not all systems expose debugfs.

A better approach is to make server bandwidth reservations dynamic: only
the scheduling policy that is currently active should register its
reservation, while the inactive one should not artificially hold
capacity (keeping both reservations only when the BPF scheduler is
running in partial mode):

+---------------------------------------------+-------------+------------+
| BPF scheduler state                         | fair server | ext server |
+---------------------------------------------+-------------+------------+
| not loaded (default boot)                   | reserved    | none       |
| loaded full mode (!SCX_OPS_SWITCH_PARTIAL)  | none        | reserved   |
| loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved    | reserved   |
+---------------------------------------------+-------------+------------+

To achieve this, introduce an "attached/detached" state for each
deadline server, so the kernel can decide whether a server's bandwidth
should be accounted in global bandwidth tracking.

At boot, the system starts with only the fair server contributing to
bandwidth accounting. When a BPF scheduler is enabled, the ext server is
attached and may replace or complement the fair server depending on
whether full or partial mode is used. When sched_ext is disabled, the
ext server is detached again and the fair server reservation is
reinstated if it had been detached because of full mode.

The transition logic ensures that switching between scheduling modes is
consistent and reversible, without losing runtime configuration or
requiring manual intervention.

Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
include/linux/sched.h | 6 +++
kernel/sched/deadline.c | 109 +++++++++++++++++++++++++++++++++++++---
kernel/sched/ext.c | 43 ++++++++++++++++
kernel/sched/sched.h | 2 +
4 files changed, 153 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 004e6d56a499a..6d55b2ed9651a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -702,6 +702,11 @@ struct sched_dl_entity {
* running, skipping the defer phase.
*
* @dl_defer_idle tracks idle state
+ *
+ * @dl_bw_attached tells if this server's bandwidth currently
+ * contributes to the root domain's total_bw. Only meaningful for server
+ * entities (@dl_server == 1). Allows toggling the reservation on/off
+ * without losing the configured @dl_runtime/@dl_period.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
@@ -713,6 +718,7 @@ struct sched_dl_entity {
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
unsigned int dl_defer_idle : 1;
+ unsigned int dl_bw_attached : 1;

/*
* Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index edca7849b165d..08b8a8c23e80b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1868,6 +1868,13 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+ /*
+ * No BPF scheduler is loaded at boot, so the ext_server has no
+ * tasks to protect. Detach its bandwidth reservation, it will
+ * be re-attached when a BPF scheduler is loaded.
+ */
+ dl_server_detach_bw(dl_se);
#endif
}
}
@@ -1878,6 +1885,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
int cpu = cpu_of(rq);
struct dl_bw *dl_b;

+ if (!dl_se->dl_bw_attached)
+ return;
+
dl_b = dl_bw_of(cpu_of(rq));
guard(raw_spinlock)(&dl_b->lock);

@@ -1903,16 +1913,21 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
cpus = dl_bw_cpus(cpu);
cap = dl_bw_capacity(cpu);

- if (__dl_overflow(dl_b, cap, old_bw, new_bw))
- return -EBUSY;
+ if (init || dl_se->dl_bw_attached) {
+ if (__dl_overflow(dl_b, cap, old_bw, new_bw))
+ return -EBUSY;
+ }

if (init) {
__add_rq_bw(new_bw, &rq->dl);
__dl_add(dl_b, new_bw, cpus);
- } else {
+ dl_se->dl_bw_attached = 1;
+ } else if (dl_se->dl_bw_attached) {
__dl_sub(dl_b, dl_se->dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);

+ dl_rq_change_utilization(rq, dl_se, new_bw);
+ } else {
dl_rq_change_utilization(rq, dl_se, new_bw);
}

@@ -1929,6 +1944,84 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
return 0;
}

+/*
+ * Attach @dl_se's bandwidth to the root domain's total_bw accounting.
+ *
+ * Use to dynamically register a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is
+ * already attached.
+ *
+ * Returns 0 on success, -EBUSY (leaving @dl_se detached) on rd overflow.
+ */
+int dl_server_attach_bw(struct sched_dl_entity *dl_se)
+{
+	struct rq *rq = dl_se->rq;
+	int cpu = cpu_of(rq);
+	struct dl_bw *dl_b;
+	unsigned long cap;
+	int cpus;
+
+	if (dl_se->dl_bw_attached)
+		return 0;
+
+	dl_b = dl_bw_of(cpu);
+	guard(raw_spinlock)(&dl_b->lock);
+
+	cpus = dl_bw_cpus(cpu);
+
+	/*
+	 * dl_server_add_bw() only accounts servers of active CPUs: if @cpu is
+	 * inactive, or its root domain has no active CPU, just record the
+	 * intent so that the next root-domain rebuild honors it.
+	 */
+	if (!cpus || !cpu_active(cpu)) {
+		dl_se->dl_bw_attached = 1;
+		return 0;
+	}
+
+	cap = dl_bw_capacity(cpu);
+
+	if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw))
+		return -EBUSY;
+
+	__dl_add(dl_b, dl_se->dl_bw, cpus);
+	dl_se->dl_bw_attached = 1;
+
+	return 0;
+}
+
+/*
+ * Detach @dl_se's bandwidth from the root domain's total_bw accounting.
+ *
+ * Use to dynamically unregister a dl_server's bandwidth reservation while
+ * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is
+ * not currently attached.
+ */
+void dl_server_detach_bw(struct sched_dl_entity *dl_se)
+{
+	struct rq *rq = dl_se->rq;
+	int cpu = cpu_of(rq);
+	struct dl_bw *dl_b;
+	int cpus;
+
+	if (!dl_se->dl_bw_attached)
+		return;
+
+	dl_b = dl_bw_of(cpu);
+	guard(raw_spinlock)(&dl_b->lock);
+
+	cpus = dl_bw_cpus(cpu);
+
+	/*
+	 * The bandwidth of an inactive CPU's server isn't in @dl_b (see
+	 * dl_server_add_bw()): only subtract it if @cpu is active, otherwise
+	 * just clear the flag so the next root-domain rebuild skips it.
+	 */
+	if (cpus && cpu_active(cpu))
+		__dl_sub(dl_b, dl_se->dl_bw, cpus);
+	dl_se->dl_bw_attached = 0;
+}
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -3229,12 +3322,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu)
struct sched_dl_entity *dl_se;

dl_se = &cpu_rq(cpu)->fair_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));

#ifdef CONFIG_SCHED_CLASS_EXT
dl_se = &cpu_rq(cpu)->ext_server;
- if (dl_server(dl_se) && cpu_active(cpu))
+ if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu))
__dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
#endif
}
@@ -3243,11 +3336,13 @@ static u64 dl_server_read_bw(int cpu)
{
u64 dl_bw = 0;

- if (cpu_rq(cpu)->fair_server.dl_server)
+ if (cpu_rq(cpu)->fair_server.dl_server &&
+ cpu_rq(cpu)->fair_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->fair_server.dl_bw;

#ifdef CONFIG_SCHED_CLASS_EXT
- if (cpu_rq(cpu)->ext_server.dl_server)
+ if (cpu_rq(cpu)->ext_server.dl_server &&
+ cpu_rq(cpu)->ext_server.dl_bw_attached)
dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
#endif

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c458552d14ff..15ba49fcba9af 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6061,6 +6061,7 @@ static void scx_root_disable(struct scx_sched *sch)
{
struct scx_task_iter sti;
struct task_struct *p;
+ bool was_switched_all;
int cpu;

/* guarantee forward progress and wait for descendants to be disabled */
@@ -6087,6 +6088,13 @@ static void scx_root_disable(struct scx_sched *sch)
*/
mutex_lock(&scx_enable_mutex);

+ /*
+ * Snapshot the full vs partial mode before clearing the static
+ * branch, so the dl_server re-balance below knows whether the
+ * fair_server reservation needs to be reinstated.
+ */
+ was_switched_all = scx_switched_all();
+
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);

@@ -6136,10 +6144,24 @@ static void scx_root_disable(struct scx_sched *sch)
/*
* Invalidate all the rq clocks to prevent getting outdated
* rq clocks from a previous scx scheduler.
+ *
+ * Also re-balance the dl_server bandwidth reservations: detach
+ * ext_server (no more sched_ext tasks) and reinstate fair_server
+ * if it was previously detached because we were running in full
+ * mode. Detach before attach to avoid a transient overflow of the
+ * root domain's bandwidth capacity.
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+
scx_rq_clock_invalidate(rq);
+
+ scoped_guard(rq_lock_irqsave, rq) {
+ dl_server_detach_bw(&rq->ext_server);
+ if (was_switched_all &&
+ WARN_ON_ONCE(dl_server_attach_bw(&rq->fair_server)))
+ pr_warn("failed to re-attach fair_server on CPU %d\n", cpu);
+ }
}

/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -7314,6 +7336,27 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
static_branch_enable(&__scx_switched_all);

+ /*
+ * Re-balance the dl_server bandwidth reservations.
+ *
+ * In full mode (!SCX_OPS_SWITCH_PARTIAL) no task will ever run in
+ * the fair class, so detach the fair_server reservation and give
+ * that bandwidth back to the RT class. Always attach the
+ * ext_server reservation since sched_ext tasks are now possible.
+ *
+ * Detach before attach to avoid a transient overflow of the root
+ * domain's bandwidth capacity.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ if (scx_switched_all())
+ dl_server_detach_bw(&rq->fair_server);
+ if (WARN_ON_ONCE(dl_server_attach_bw(&rq->ext_server)))
+ pr_warn("failed to attach ext_server on CPU %d\n", cpu);
+ }
+
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
sch->ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(&sch->kobj, KOBJ_ADD);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f63b15d309d1..1e233618257ac 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -421,6 +421,8 @@ extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
+extern int dl_server_attach_bw(struct sched_dl_entity *dl_se);
+extern void dl_server_detach_bw(struct sched_dl_entity *dl_se);

static inline bool dl_server_active(struct sched_dl_entity *dl_se)
{
--
2.54.0