[PATCH v2 2/2] cgroup/cpuset: align DL bandwidth reservation with attach target mask

From: Guopeng Zhang

Date: Thu May 07 2026 - 06:34:55 EST


cpuset_can_attach() preallocates destination SCHED_DEADLINE bandwidth
before the attach commit point, while set_cpus_allowed_dl() later
subtracts bandwidth from the source root domain when the task's
affinity is actually updated.
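
Schematically, the two sides of that accounting pair up as follows
(simplified; surrounding context and locking details elided, all names
taken from the current code):

    /* cpuset_can_attach(): reserve on a destination root-domain CPU */
    ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);

    /* set_cpus_allowed_dl(): release from the source root domain */
    if (!cpumask_intersects(task_rq(p)->rd->span, ctx->new_mask)) {
            struct dl_bw *src_dl_b = dl_bw_of(cpu_of(task_rq(p)));

            raw_spin_lock(&src_dl_b->lock);
            __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
            raw_spin_unlock(&src_dl_b->lock);
    }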

Those two decisions must be made against the same CPU mask.
cpuset_can_attach() used the destination cpuset's effective_cpus mask
directly, but cpuset_attach_task() first builds a per-task target mask
that is constrained by task_cpu_possible_mask() and, when the resulting
intersection would be empty, falls back to an ancestor cpuset's
effective_cpus by walking up the hierarchy. On asymmetric systems, the
actual target mask can therefore be a strict subset of
cs->effective_cpus.
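
The per-task mask that cpuset_attach_task() ends up applying is thus
roughly the following (mirroring guarantee_active_cpus()), not plain
cs->effective_cpus:

    cpumask_and(pmask, task_cpu_possible_mask(task), cpu_active_mask);
    while (!cpumask_intersects(cs->effective_cpus, pmask))
            cs = parent_cs(cs);
    cpumask_and(pmask, pmask, cs->effective_cpus);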

Before this change, cpuset_can_attach() reserved destination bandwidth
only when oldcs->effective_cpus and cs->effective_cpus were disjoint.
If the source root domain intersects cs->effective_cpus only on CPUs
outside the task's possible mask, can_attach() therefore skips the
destination reservation even though set_cpus_allowed_dl() later sees a
real root-domain move and subtracts from the source domain.
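
Concretely (hypothetical CPU numbers): a 32-bit task on an asymmetric
arm64 system has task_cpu_possible_mask() = {0-3}; the destination
cs->effective_cpus is {2,3,6,7} and the source root domain spans {6,7}.
The two effective masks overlap on {6,7}, so the reservation is
skipped, yet the task's per-task target mask is {2,3}, which does not
intersect the source span: the source bandwidth is freed with no
matching destination reservation.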

Extract the root-domain bandwidth-move test used by
set_cpus_allowed_dl() into dl_task_needs_bw_move(), and make
cpuset_can_attach() compute the same per-task target mask that
cpuset_attach_task() applies.

Keep nr_migrate_dl_tasks counting all migrating deadline tasks for
cpuset DL task accounting. Restrict sum_migrate_dl_bw to the subset of
tasks that actually need a destination root-domain bandwidth
reservation, because a deadline task can move between cpusets (for
example, two cpusets sharing one root domain) without moving bandwidth
between root domains.

This keeps the existing per-attach aggregate reservation model; it only
changes the per-task mask used to decide which tasks contribute to that
aggregate. The broader can_attach()/attach() transaction window is left
unchanged.

Fixes: 431c69fac05b ("cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus()")
Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails")
Signed-off-by: Guopeng Zhang <zhangguopeng@xxxxxxxxxx>
---
 include/linux/sched/deadline.h  |  9 +++
 kernel/cgroup/cpuset-internal.h |  1 +
 kernel/cgroup/cpuset.c          | 97 ++++++++++++++++++++++-----------
 kernel/sched/deadline.c         | 13 ++++-
 4 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 1198138cb839..ddfd5216f3fc 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -33,6 +33,15 @@ struct root_domain;
 extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 extern void dl_clear_root_domain_cpu(int cpu);
+/*
+ * Return whether moving DL task @p to @new_mask requires moving DL
+ * bandwidth accounting between root domains. This helper is specific to
+ * DL bandwidth move accounting semantics and is shared by
+ * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the
+ * same source root-domain test.
+ */
+bool dl_task_needs_bw_move(struct task_struct *p,
+			   const struct cpumask *new_mask);
 
 extern u64 dl_cookie;
 extern bool dl_bw_visited(int cpu, u64 cookie);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index bb4e692bea30..f7aaf01f7cd5 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -167,6 +167,7 @@ struct cpuset {
 	 */
 	int nr_deadline_tasks;
 	int nr_migrate_dl_tasks;
+	/* DL bandwidth that needs destination reservation for this attach. */
 	u64 sum_migrate_dl_bw;
 	/*
 	 * CPU used for temporary DL bandwidth allocation during attach;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae41736399a1..78c1a4071cc3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -485,6 +485,30 @@ static void guarantee_active_cpus(struct task_struct *tsk,
 	rcu_read_unlock();
 }
 
+/* Compute the effective CPU mask cpuset_attach_task() will apply to @tsk. */
+static void cpuset_attach_task_cpus(struct cpuset *cs, struct task_struct *tsk,
+				    struct cpumask *pmask)
+{
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+	lockdep_assert_cpuset_lock_held();
+
+	if (cs == &top_cpuset) {
+		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
+		return;
+	}
+
+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+		cpumask_copy(pmask, cpu_active_mask);
+
+	rcu_read_lock();
+	while (!cpumask_intersects(cs->effective_cpus, pmask))
+		cs = parent_cs(cs);
+
+	cpumask_and(pmask, pmask, cs->effective_cpus);
+	rcu_read_unlock();
+}
+
 /*
  * Return in *pmask the portion of a cpusets's mems_allowed that
  * are online, with memory. If none are online with memory, walk
@@ -2986,6 +3010,14 @@ static void reset_migrate_dl_data(struct cpuset *cs)
 	cs->dl_bw_cpu = -1;
 }
 
+/*
+ * Protected by cpuset_mutex. cpus_attach is used by the can_attach/attach
+ * paths but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
@@ -2993,7 +3025,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	struct cpuset *cs, *oldcs;
 	struct task_struct *task;
 	bool setsched_check;
-	int ret;
+	int cpu = nr_cpu_ids, ret;
 
 	/* used later by cpuset_attach() */
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
@@ -3038,32 +3070,47 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 		}
 
 		if (dl_task(task)) {
+			/*
+			 * Count all migrating DL tasks for cpuset task accounting.
+			 * Only tasks that need a root-domain bandwidth move
+			 * contribute to sum_migrate_dl_bw.
+			 */
 			cs->nr_migrate_dl_tasks++;
-			cs->sum_migrate_dl_bw += task->dl.dl_bw;
+			cpuset_attach_task_cpus(cs, task, cpus_attach);
+
+			if (dl_task_needs_bw_move(task, cpus_attach)) {
+				/*
+				 * Keep the existing aggregate reservation model.
+				 * Tasks in one attach enter the same destination
+				 * cpuset, so the first CPU found for a task needing
+				 * DL bandwidth reservation identifies the destination
+				 * root domain.
+				 */
+				if (cpu >= nr_cpu_ids)
+					cpu = cpumask_any_and(cpu_active_mask,
+							      cpus_attach);
+				cs->sum_migrate_dl_bw += task->dl.dl_bw;
+			}
 		}
 	}
 
-	if (!cs->nr_migrate_dl_tasks)
+	if (!cs->sum_migrate_dl_bw)
 		goto out_success;
 
-	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
-		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
-
-		if (unlikely(cpu >= nr_cpu_ids)) {
-			reset_migrate_dl_data(cs);
-			ret = -EINVAL;
-			goto out_unlock;
-		}
-
-		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
-		if (ret) {
-			reset_migrate_dl_data(cs);
-			goto out_unlock;
-		}
+	if (unlikely(cpu >= nr_cpu_ids)) {
+		reset_migrate_dl_data(cs);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
 
-		cs->dl_bw_cpu = cpu;
+	ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+	if (ret) {
+		reset_migrate_dl_data(cs);
+		goto out_unlock;
 	}
 
+	cs->dl_bw_cpu = cpu;
+
 out_success:
 	/*
 	 * Mark attach is in progress. This makes validate_change() fail
@@ -3099,23 +3146,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 	mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
- * but we can't allocate it dynamically there. Define it global and
- * allocate from cpuset_init().
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_to;
-
 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 {
 	lockdep_assert_cpuset_lock_held();
 
-	if (cs != &top_cpuset)
-		guarantee_active_cpus(task, cpus_attach);
-	else
-		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
-			       subpartitions_cpus);
+	cpuset_attach_task_cpus(cs, task, cpus_attach);
 	/*
 	 * can_attach beforehand should guarantee that this doesn't
 	 * fail. TODO: have a better way to handle failure here
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index edca7849b165..7db4c87df83b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 static void set_cpus_allowed_dl(struct task_struct *p,
 				struct affinity_context *ctx)
 {
-	struct root_domain *src_rd;
 	struct rq *rq;
 
 	WARN_ON_ONCE(!dl_task(p));
 
 	rq = task_rq(p);
-	src_rd = rq->rd;
 	/*
 	 * Migrating a SCHED_DEADLINE task between exclusive
 	 * cpusets (different root_domains) entails a bandwidth
 	 * update. We already made space for us in the destination
 	 * domain (see cpuset_can_attach()).
 	 */
-	if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
+	if (dl_task_needs_bw_move(p, ctx->new_mask)) {
 		struct dl_bw *src_dl_b;
 
 		src_dl_b = dl_bw_of(cpu_of(rq));
@@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	set_cpus_allowed_common(p, ctx);
 }
 
+bool dl_task_needs_bw_move(struct task_struct *p,
+			   const struct cpumask *new_mask)
+{
+	if (!dl_task(p))
+		return false;
+
+	return !cpumask_intersects(task_rq(p)->rd->span, new_mask);
+}
+
 /* Assumes rq->lock is held */
 static void rq_online_dl(struct rq *rq)
 {
--
2.43.0