[RFC PATCH v6 14/25] sched/rt: Implement dl-server operations for rt-cgroups.

From: Yuri Andriaccio

Date: Mon Jun 08 2026 - 08:24:32 EST

Implement rt_server_pick, the callback that deadline servers use to
pick a task to schedule.
rt_server_pick(): pick the next runnable rt task and tell the
scheduler that it is going to be scheduled next.

Let the enqueue_task_rt function start the attached deadline server when the
first task is enqueued on a specific rq/server.
The server is not symmetrically stopped in dequeue_task_rt as it is
stopped when server_pick_task returns NULL (see deadline.c).

Change update_curr_rt to perform a deadline server update if the
updated task is served by a non-root group.

Update {enqueue/dequeue}_pushable_task and rt_{set/clear}_overload to
set the CPU-wise overload flag only if the root runqueues are
overloaded, but not for HCBS runqueues.

Update inc/dec_dl_tasks to account the number of active tasks in the
local runqueue for rt-cgroups servers, as their local runqueue is
different from the global runqueue, and thus when a rt-group server is
activated/deactivated, the number of served tasks must be
added/removed. This uses nr_running to be compatible with future
dl-server interfaces. Account also the deadline server so that it is
picked for shutdown when its runqueue is empty (future patches will
try to pull tasks before stopping).

Update inc/dec_rt_prio_smp to change a rq's cpupri only if the rt_rq
is the global runqueue, since cgroups are scheduled via their
dl-server priority.

Update inc/dec_rt_tasks to account for waking/sleeping tasks on the
global runqueue, when the task runs on the root cgroup, or its local
dl server is active. The accounting is not done when servers are
throttled, as they will add/sub the number of tasks running when they
get enqueued/dequeued. For rt cgroups, account for the number of active
tasks in the nr_running field of the local runqueue (add/sub_nr_running),
as this number is used when a dl server is enqueued/dequeued.

Update set_task_rq to record the rt_rq of the cgroup's active_context,
tracking where to schedule the given task.

Update set_task_rq to record the dl_rq, tracking which deadline
server manages a task.

Update set_task_rq to not use the parent field anymore, as it is
unused by this patchset's code. Remove the unused parent field from
sched_rt_entity.

Co-developed-by: Alessio Balsini <a.balsini@xxxxxxxx>
Signed-off-by: Alessio Balsini <a.balsini@xxxxxxxx>
Co-developed-by: Andrea Parri <parri.andrea@xxxxxxxxx>
Signed-off-by: Andrea Parri <parri.andrea@xxxxxxxxx>
Co-developed-by: luca abeni <luca.abeni@xxxxxxxxxxxxxxx>
Signed-off-by: luca abeni <luca.abeni@xxxxxxxxxxxxxxx>
Signed-off-by: Yuri Andriaccio <yurand2000@xxxxxxxxx>
---
include/linux/sched.h | 1 -
kernel/sched/deadline.c | 8 +++++
kernel/sched/rt.c | 70 ++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 11 +++++--
4 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 411ffe9b34b3..b20451fcda55 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,7 +630,6 @@ struct sched_rt_entity {

struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 166d23f45cab..a63253ec6441 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2096,6 +2096,10 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)

if (!dl_server(dl_se))
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ else if (rq_of_dl_se(dl_se) != dl_se->my_q) {
+ WARN_ON(dl_se->my_q->rt.rt_nr_running != dl_se->my_q->nr_running);
+ add_nr_running(rq_of_dl_rq(dl_rq), dl_se->my_q->nr_running + 1);
+ }

inc_dl_deadline(dl_rq, deadline);
}
@@ -2108,6 +2112,10 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)

if (!dl_server(dl_se))
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ else if (rq_of_dl_se(dl_se) != dl_se->my_q) {
+ WARN_ON(dl_se->my_q->rt.rt_nr_running != dl_se->my_q->nr_running);
+ sub_nr_running(rq_of_dl_rq(dl_rq), dl_se->my_q->nr_running + 1);
+ }

dec_dl_deadline(dl_rq, dl_se->deadline);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a6adf21772a6..61e9dab894d1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -284,9 +284,19 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}

+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq);
+
static struct task_struct *rt_server_pick(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
- return NULL;
+ struct rt_rq *rt_rq = &dl_se->my_q->rt;
+ struct task_struct *p;
+
+ if (!sched_rt_runnable(dl_se->my_q))
+ return NULL;
+
+ p = rt_task_of(pick_next_rt_entity(rt_rq));
+
+ return p;
}

#else /* !CONFIG_RT_GROUP_SCHED */
@@ -314,6 +324,9 @@ static inline int rt_overloaded(struct rq *rq)

static inline void rt_set_overload(struct rq *rq)
{
+ if (is_dl_group(&rq->rt))
+ return;
+
if (!rq->online)
return;

@@ -333,6 +346,9 @@ static inline void rt_set_overload(struct rq *rq)

static inline void rt_clear_overload(struct rq *rq)
{
+ if (is_dl_group(&rq->rt))
+ return;
+
if (!rq->online)
return;

@@ -392,7 +408,7 @@ static void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
rt_rq->highest_prio.next = p->prio;

if (!rt_rq->overloaded) {
- rt_set_overload(global_rq_of_rt_rq(rt_rq));
+ rt_set_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 1;
}
}
@@ -410,7 +426,7 @@ static void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
rt_rq->highest_prio.next = MAX_RT_PRIO-1;

if (rt_rq->overloaded) {
- rt_clear_overload(global_rq_of_rt_rq(rt_rq));
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 0;
}
}
@@ -511,6 +527,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *donor = rq->donor;
+ struct rt_rq *rt_rq;
s64 delta_exec;

if (donor->sched_class != &rt_sched_class)
@@ -520,21 +537,32 @@ static void update_curr_rt(struct rq *rq)
if (unlikely(delta_exec <= 0))
return;

- if (!rt_bandwidth_enabled())
+ if (!rt_group_sched_enabled())
+ return;
+
+ if (!dl_bandwidth_enabled())
return;
+
+ rt_rq = rt_rq_of_se(&donor->rt);
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ dl_server_update(dl_se, delta_exec);
+ }
}

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq *rq;

/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
+ if (is_dl_group(rt_rq))
return;

+ rq = rq_of_rt_rq(rt_rq);
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -542,14 +570,15 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq *rq;

/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
+ if (is_dl_group(rt_rq))
return;

+ rq = rq_of_rt_rq(rt_rq);
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
@@ -610,6 +639,15 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += is_rr_task(rt_se);

inc_rt_prio(rt_rq, rt_se_prio(rt_se));
+
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (!dl_se->dl_throttled)
+ add_nr_running(global_rq_of_rt_rq(rt_rq), 1);
+ }
+
+ add_nr_running(rq_of_rt_rq(rt_rq), 1);
}

static inline
@@ -620,6 +658,15 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= is_rr_task(rt_se);

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (!dl_se->dl_throttled)
+ sub_nr_running(global_rq_of_rt_rq(rt_rq), 1);
+ }
+
+ sub_nr_running(rq_of_rt_rq(rt_rq), 1);
}

/*
@@ -806,6 +853,13 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
check_schedstat_required();
update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);

+ /* Task arriving in an idle group of tasks. */
+ if (is_dl_group(rt_rq) && rt_rq->rt_nr_running == 0) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ dl_server_start(dl_se);
+ }
+
enqueue_rt_entity(rt_se, flags);

if (task_is_blocked(p))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58f67093145e..66d5bd1aa4f1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2310,10 +2310,11 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
* root_task_group's rt_rq than switching in rt_rq_of_se()
* Clobbers tg(!)
*/
+ guard(raw_spinlock_irqsave)(&tg->dl_bandwidth.dl_runtime_lock);
if (!rt_group_sched_enabled())
tg = &root_task_group;
- p->rt.rt_rq = tg->rt_rq[cpu];
- p->rt.parent = tg->rt_se[cpu];
+ p->rt.rt_rq = tg->dl_bandwidth.active_context->rt_rq[cpu];
+ p->dl.dl_rq = &cpu_rq(cpu)->dl;
#endif /* CONFIG_RT_GROUP_SCHED */
}

@@ -2976,6 +2977,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
unsigned prev_nr = rq->nr_running;

rq->nr_running = prev_nr + count;
+ if (rq != cpu_rq(rq->cpu))
+ return;
+
if (trace_sched_update_nr_running_tp_enabled()) {
call_trace_sched_update_nr_running(rq, count);
}
@@ -2989,6 +2993,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (rq != cpu_rq(rq->cpu))
+ return;
+
if (trace_sched_update_nr_running_tp_enabled()) {
call_trace_sched_update_nr_running(rq, -count);
}
--
2.54.0