[PATCH v7 9/9] cgroup/cpuset: Support multiple destination cpusets for cpuset_*attach()
From: Waiman Long
Date: Sat Jun 20 2026 - 23:30:48 EST
The only case where the cgroup_taskset structure requires task migration
to multiple cpusets is when enabling a cpuset controller in cgroup v2
where the newly created child cpusets inherit the same effective CPUs
and memory nodes from the parent. In that case, task migration can happen
directly with no update to tasks' CPU and memory nodes assignment and no
further work needed from the cpuset side except updating nr_deadline_tasks
when DL tasks are involved and setting old_mems_allowed in the child
cpusets.
Do that by tracking all the destination cpusets with a new dst_cs_head
singly linked list, where attach_in_progress is again incremented when a
cpuset is inserted into the list.
It is assumed that a given cpuset cannot be both a source and a
destination cpuset. If such a condition happens, or when there are multiple
destination cpusets with CPU or memory node changes, the current code
will not handle it correctly. So it will print a warning and fail the
attach operation in these unexpected cases, as the code will have to be
enhanced to support them if such use cases are valid and not coding bugs.
Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/cgroup/cpuset-internal.h | 1 +
kernel/cgroup/cpuset.c | 121 +++++++++++++++++++-------------
2 files changed, 75 insertions(+), 47 deletions(-)
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 011993b1f756..900e74ac3538 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -151,6 +151,7 @@ struct cpuset {
*/
struct llist_node attach_node;
int attach_in_progress;
+ bool attach_source;
/* partition root state */
int partition_root_state;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c2d172873166..aff86acea701 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2986,11 +2986,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
*
* The attach_cpus_updated/attach_mems_updated flags are set in either
* cpuset_can_attach() or cpuset_fork() and used in cpuset_attach_task().
+ *
+ * The attach_many_dest_cs is set when there are multiple destination cpusets
+ * for task migration.
*/
static struct cpuset *cpuset_attach_old_cs;
static LLIST_HEAD(src_cs_head);
+static LLIST_HEAD(dst_cs_head);
static bool attach_cpus_updated;
static bool attach_mems_updated;
+static bool attach_many_dest_cs;
/*
* Check to see if a cpuset can accept a new task
@@ -3013,9 +3018,25 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
if (!oldcs)
return 0;
+ /*
+ * The same cpuset cannot be both a source and a destination.
+ * The current code does not support that, print a warning and
+ * fail the attach if so.
+ */
+ if (WARN_ON_ONCE((!oldcs->attach_source &&
+ llist_on_list(&oldcs->attach_node)) ||
+ cs->attach_source))
+ return -EINVAL;
+
if (!llist_on_list(&oldcs->attach_node)) {
llist_add(&oldcs->attach_node, &src_cs_head);
oldcs->attach_in_progress++;
+ oldcs->attach_source = true;
+ }
+
+ if (!llist_on_list(&cs->attach_node)) {
+ llist_add(&cs->attach_node, &dst_cs_head);
+ cs->attach_in_progress++;
}
cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
@@ -3046,35 +3067,31 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
return 0;
}
-static int cpuset_reserve_dl_bw(struct cpuset *cs)
+static int cpuset_reserve_dl_bw(void)
{
+ struct cpuset *cs;
int cpu, ret;
- if (!cs->sum_migrate_dl_bw)
- return 0;
+ llist_for_each_entry(cs, dst_cs_head.first, attach_node) {
+ if (!cs->sum_migrate_dl_bw)
+ continue;
- cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
+ cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+ if (unlikely(cpu >= nr_cpu_ids))
+ return -EINVAL;
- ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret)
- return ret;
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret)
+ return ret;
- cs->dl_bw_cpu = cpu;
+ cs->dl_bw_cpu = cpu;
+ }
return 0;
}
-static void reset_migrate_dl_data(struct cpuset *cs)
-{
- cs->nr_migrate_dl_tasks = 0;
- cs->sum_migrate_dl_bw = 0;
- cs->dl_bw_cpu = -1;
-}
-
/*
* Clear and optionally apply (@cancel is false) the attach related data in the
- * source cpusets.
+ * source or destination cpuset.
*/
static void clear_attach_data(struct llist_head *head, bool cancel)
{
@@ -3086,9 +3103,14 @@ static void clear_attach_data(struct llist_head *head, bool cancel)
if (cs->nr_migrate_dl_tasks) {
if (!cancel)
cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ else if (cs->dl_bw_cpu >= 0) /* && cancel */
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
dec_attach_in_progress_locked(cs);
+ cs->attach_source = false;
}
}
@@ -3109,6 +3131,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
attach_cpus_updated = false;
attach_mems_updated = false;
+ attach_many_dest_cs = false;
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
@@ -3133,9 +3156,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* selected as cpuset_attach_old_cs.
*/
cgroup_taskset_for_each(task, css, tset) {
+ struct cpuset *new_cs = css_cs(css);
struct cpuset *new_oldcs = task_cs(task);
- if (new_oldcs != oldcs) {
+ if ((new_oldcs != oldcs) || (new_cs != cs)) {
+ if (new_cs != cs)
+ attach_many_dest_cs = true;
+ cs = new_cs;
oldcs = new_oldcs;
ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
if (ret)
@@ -3169,14 +3196,28 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
}
}
- ret = cpuset_reserve_dl_bw(cs);
+ /*
+ * The only case where there are multiple destination cpusets for
+ * task migration is when enabling a v2 cpuset controller where
+ * tasks will be migrated to multiple child cpusets from a parent
+ * cpuset with the same effective CPUs and memory nodes. IOW,
+ * both attach_cpus_updated and attach_mems_updated should be false.
+ * If not, it is a condition that the current code cannot handle.
+ * Print a warning and abort the attach operation as further code
+ * change will be needed.
+ */
+ if (WARN_ON_ONCE(attach_many_dest_cs && (!cpuset_v2() ||
+ attach_cpus_updated || attach_mems_updated))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = cpuset_reserve_dl_bw();
out_unlock:
if (ret) {
- reset_migrate_dl_data(cs);
clear_attach_data(&src_cs_head, true);
- } else {
- cs->attach_in_progress++;
+ clear_attach_data(&dst_cs_head, true);
}
mutex_unlock(&cpuset_mutex);
@@ -3185,22 +3226,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
- struct cgroup_subsys_state *css;
- struct cpuset *cs;
-
- cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
-
mutex_lock(&cpuset_mutex);
- dec_attach_in_progress_locked(cs);
clear_attach_data(&src_cs_head, true);
-
- if (cs->dl_bw_cpu >= 0)
- dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
-
- if (cs->nr_migrate_dl_tasks)
- reset_migrate_dl_data(cs);
-
+ clear_attach_data(&dst_cs_head, true);
mutex_unlock(&cpuset_mutex);
}
@@ -3286,26 +3314,25 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* In the default hierarchy, enabling cpuset in the child cgroups
* will trigger a cpuset_attach() call with no change in effective cpus
* and mems. In that case, we can optimize out by skipping the task
- * iteration and update.
+ * iteration and update, but the destination cpuset list is iterated to
+ * set old_mems_allowed.
*/
- if (cpuset_v2() && !attach_cpus_updated && !attach_mems_updated)
+ if (cpuset_v2() && !attach_cpus_updated && !attach_mems_updated) {
+ llist_for_each_entry(cs, dst_cs_head.first, attach_node)
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
goto out;
+ }
+ /* Task iteration shouldn't happen with attach_many_dest_cs set */
cgroup_taskset_for_each(task, css, tset)
cpuset_attach_task(cs, task);
-out:
if (queue_task_work)
schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
-
- if (cs->nr_migrate_dl_tasks) {
- cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
- reset_migrate_dl_data(cs);
- }
-
+out:
clear_attach_data(&src_cs_head, false);
- dec_attach_in_progress_locked(cs);
+ clear_attach_data(&dst_cs_head, false);
mutex_unlock(&cpuset_mutex);
}
--
2.54.0