[PATCH] cgroup/cpuset: make DL attach bandwidth reservation root-domain aware

From: Guopeng Zhang

Date: Tue Apr 21 2026 - 04:35:44 EST


cpuset_can_attach() currently sums the bandwidth of all migrating
SCHED_DEADLINE tasks and reserves destination bandwidth whenever the
old and new cpuset effective CPU masks do not overlap.

That condition is broader than the one the scheduler applies when
migrating a deadline task: set_cpus_allowed_dl() only subtracts bandwidth
from the source side when moving the task requires a DL bandwidth move
between root domains.

As a result, moving a deadline task between disjoint member cpusets that
still belong to the same root domain can reserve destination bandwidth
even though no matching source-side subtraction happens. Successful
back-and-forth migrations between such cpusets can monotonically
increase dl_bw->total_bw.

Fix this by extracting the source root-domain test already used by
set_cpus_allowed_dl() into a shared helper and making cpuset DL bandwidth
preallocation use that same condition. Count all migrating deadline
tasks for cpuset task accounting, but only accumulate sum_migrate_dl_bw
for tasks that actually need a DL bandwidth move. Reserve and rollback
bandwidth only for that subset.

This keeps successful attach accounting aligned with
set_cpus_allowed_dl() and avoids double-accounting within a single
root domain.

Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails")
Signed-off-by: Guopeng Zhang <zhangguopeng@xxxxxxxxxx>
---
include/linux/sched/deadline.h | 9 +++++++++
kernel/cgroup/cpuset-internal.h | 1 +
kernel/cgroup/cpuset.c | 34 ++++++++++++++++-----------------
kernel/sched/deadline.c | 14 +++++++++++---
4 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 1198138cb839..273538200a44 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -33,6 +33,15 @@ struct root_domain;
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
extern void dl_clear_root_domain_cpu(int cpu);
+/*
+ * Return whether moving DL task @p to @new_mask requires moving DL
+ * bandwidth accounting between root domains. This helper is specific to
+ * DL bandwidth move accounting semantics and is shared by
+ * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the
+ * same source root-domain test.
+ */
+extern bool dl_task_needs_bw_move(struct task_struct *p,
+ const struct cpumask *new_mask);

extern u64 dl_cookie;
extern bool dl_bw_visited(int cpu, u64 cookie);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index bb4e692bea30..f7aaf01f7cd5 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -167,6 +167,7 @@ struct cpuset {
*/
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
+ /* DL bandwidth that needs destination reservation for this attach. */
u64 sum_migrate_dl_bw;
/*
* CPU used for temporary DL bandwidth allocation during attach;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e3a081a07c6d..761098b45f23 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2993,7 +2993,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cpuset *cs, *oldcs;
struct task_struct *task;
bool setsched_check;
- int ret;
+ int cpu, ret;

/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
@@ -3039,31 +3039,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)

if (dl_task(task)) {
cs->nr_migrate_dl_tasks++;
- cs->sum_migrate_dl_bw += task->dl.dl_bw;
+
+ if (dl_task_needs_bw_move(task, cs->effective_cpus))
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}

- if (!cs->nr_migrate_dl_tasks)
+ if (!cs->sum_migrate_dl_bw)
goto out_success;

- if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+ cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);

- if (unlikely(cpu >= nr_cpu_ids)) {
- reset_migrate_dl_data(cs);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret) {
- reset_migrate_dl_data(cs);
- goto out_unlock;
- }
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
+ goto out_unlock;
+ }

- cs->dl_bw_cpu = cpu;
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
+ goto out_unlock;
}

+ cs->dl_bw_cpu = cpu;
+
out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index edca7849b165..5ddfa0d30bf6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
struct affinity_context *ctx)
{
- struct root_domain *src_rd;
struct rq *rq;

WARN_ON_ONCE(!dl_task(p));

rq = task_rq(p);
- src_rd = rq->rd;
/*
* Migrating a SCHED_DEADLINE task between exclusive
* cpusets (different root_domains) entails a bandwidth
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
+ if (dl_task_needs_bw_move(p, ctx->new_mask)) {
struct dl_bw *src_dl_b;

src_dl_b = dl_bw_of(cpu_of(rq));
@@ -3137,6 +3135,16 @@ static void set_cpus_allowed_dl(struct task_struct *p,
set_cpus_allowed_common(p, ctx);
}

+bool dl_task_needs_bw_move(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ if (!dl_task(p))
+ return false;
+
+ guard(rcu)();
+ return !cpumask_intersects(task_rq(p)->rd->span, new_mask);
+}
+
/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{
--
2.43.0