[PATCH v7 8/9] cgroup/cpuset: Support multiple source cpusets for cpuset_*attach()
From: Waiman Long
Date: Sat Jun 20 2026 - 23:30:20 EST
There are 2 possible scenarios where the cgroup_taskset structure
passed into the cgroup can_attach() and attach() methods can contain
task migration data with multiple source cpusets.
- A multithreaded application with threads in different cpusets is
fully migrated into a new cpuset.
- Disabling the v2 cpuset controller will move all the tasks in child
cpusets to the parent cpuset.
The current cpuset_can_attach() and cpuset_attach() functions still
expect that task migration is from one source cpuset to one destination
cpuset.
Fix that by tracking the set of source (old) cpusets in a singly linked
list, with attach_in_progress being incremented when a source cpuset is
inserted into the list. The list will be iterated when necessary to
properly update the internal data.
To ensure proper DL task accounting, nr_migrate_dl_tasks is decremented
in the source cpusets and incremented in the destination cpuset, and
these values are added to nr_deadline_tasks when the migration is
successful.
The setting of the global attach_cpus_updated and attach_mems_updated
flags is also moved from cpuset_attach() to cpuset_can_attach(), as the
correct source cpuset can no longer be determined in cpuset_attach()
and, with an earlier patch, cpuset states will not be changed between
cpuset_can_attach() and cpuset_attach().
Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/cgroup/cpuset-internal.h | 1 +
kernel/cgroup/cpuset.c | 66 ++++++++++++++++++++++++++++-----
2 files changed, 57 insertions(+), 10 deletions(-)
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index f7aaf01f7cd5..011993b1f756 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -149,6 +149,7 @@ struct cpuset {
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
+ struct llist_node attach_node;
int attach_in_progress;
/* partition root state */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 511afb077e2d..c2d172873166 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -37,6 +37,7 @@
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/task_work.h>
+#include <linux/llist.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -584,6 +585,7 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
return NULL;
trial->dl_bw_cpu = -1;
+ init_llist_node(&trial->attach_node);
/* Setup cpumask pointer array */
cpumask_var_t *pmask[4] = {
@@ -2983,9 +2985,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
* Protected by cpuset_mutex
*
* The attach_cpus_updated/attach_mems_updated flags are set in either
- * cpuset_attach() or cpuset_fork() and used in cpuset_attach_task().
+ * cpuset_can_attach() or cpuset_fork() and used in cpuset_attach_task().
*/
static struct cpuset *cpuset_attach_old_cs;
+static LLIST_HEAD(src_cs_head);
static bool attach_cpus_updated;
static bool attach_mems_updated;
@@ -3001,6 +3004,8 @@ static bool attach_mems_updated;
static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
bool *psetsched)
{
+ bool cpus_updated, mems_updated;
+
if (cpumask_empty(cs->effective_cpus) ||
(!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
return -ENOSPC;
@@ -3008,14 +3013,25 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
if (!oldcs)
return 0;
+ if (!llist_on_list(&oldcs->attach_node)) {
+ llist_add(&oldcs->attach_node, &src_cs_head);
+ oldcs->attach_in_progress++;
+ }
+
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ if (cpus_updated)
+ attach_cpus_updated = true;
+ if (mems_updated)
+ attach_mems_updated = true;
+
/*
* Skip rights over task setsched check in v2 when nothing changes,
* migration permission derives from hierarchy ownership in
* cgroup_procs_write_permission()).
*/
- *psetsched = !cpuset_v2() ||
- !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
- !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+ *psetsched = !cpuset_v2() || cpus_updated || mems_updated;
/*
* A v1 cpuset with tasks will have no CPU left only when CPU hotplug
@@ -3056,6 +3072,26 @@ static void reset_migrate_dl_data(struct cpuset *cs)
cs->dl_bw_cpu = -1;
}
+/*
+ * clear_attach_data - tear down the pending-attach state of the source cpusets
+ * @head: list of source cpusets queued by cpuset_can_attach_check()
+ * @cancel: true to discard the pending DL task counts, false to apply them
+ *
+ * Remove every cpuset from @head, reinitialize its attach_node (presumably so
+ * that the llist_on_list() test in cpuset_can_attach_check() can queue it
+ * again on a later migration) and call dec_attach_in_progress_locked() to
+ * undo the attach_in_progress increment done when it was queued.
+ *
+ * A non-zero nr_migrate_dl_tasks is always reset to 0. When @cancel is false
+ * it is first added to nr_deadline_tasks; cpuset_can_attach() decrements
+ * nr_migrate_dl_tasks in the source cpuset for a migrating DL task, so this
+ * addition lowers the source's nr_deadline_tasks.
+ *
+ * All callers invoke this with cpuset_mutex held; the use of
+ * __llist_del_all() presumably relies on that for serialization.
+ */
+static void clear_attach_data(struct llist_head *head, bool cancel)
+{
+ struct cpuset *cs, *next;
+ struct llist_node *lnode = __llist_del_all(head);
+
+ llist_for_each_entry_safe(cs, next, lnode, attach_node) {
+ init_llist_node(&cs->attach_node);
+ if (cs->nr_migrate_dl_tasks) {
+ if (!cancel)
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ cs->nr_migrate_dl_tasks = 0;
+ }
+ dec_attach_in_progress_locked(cs);
+ }
+}
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
@@ -3071,6 +3107,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cs = css_cs(css);
mutex_lock(&cpuset_mutex);
+ attach_cpus_updated = false;
+ attach_mems_updated = false;
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
@@ -3095,6 +3133,15 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* selected as cpuset_attach_old_cs.
*/
cgroup_taskset_for_each(task, css, tset) {
+ struct cpuset *new_oldcs = task_cs(task);
+
+ if (new_oldcs != oldcs) {
+ oldcs = new_oldcs;
+ ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = task_can_attach(task);
if (ret)
goto out_unlock;
@@ -3116,6 +3163,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* contribute to sum_migrate_dl_bw.
*/
cs->nr_migrate_dl_tasks++;
+ oldcs->nr_migrate_dl_tasks--;
if (dl_task_needs_bw_move(task, cs->effective_cpus))
cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
@@ -3126,9 +3174,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
out_unlock:
if (ret) {
reset_migrate_dl_data(cs);
+ clear_attach_data(&src_cs_head, true);
} else {
cs->attach_in_progress++;
- oldcs->attach_in_progress++;
}
mutex_unlock(&cpuset_mutex);
@@ -3145,6 +3193,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
+ clear_attach_data(&src_cs_head, true);
if (cs->dl_bw_cpu >= 0)
dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
@@ -3224,7 +3273,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct task_struct *task;
struct cgroup_subsys_state *css;
struct cpuset *cs;
- struct cpuset *oldcs = cpuset_attach_old_cs;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3232,9 +3280,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
mutex_lock(&cpuset_mutex);
queue_task_work = false;
-
- attach_cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
- attach_mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
/*
@@ -3256,10 +3301,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
if (cs->nr_migrate_dl_tasks) {
cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
- oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
reset_migrate_dl_data(cs);
}
+ clear_attach_data(&src_cs_head, false);
dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
@@ -3777,6 +3822,7 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.effective_xcpus);
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
+ init_llist_node(&top_cpuset.attach_node);
cpuset1_init(&top_cpuset);
--
2.54.0