Re: [PATCH -next 5/6] cpuset: separate generate_sched_domains for v1 and v2

From: Waiman Long
Date: Wed Dec 17 2025 - 12:57:02 EST


On 12/17/25 3:49 AM, Chen Ridong wrote:
From: Chen Ridong <chenridong@xxxxxxxxxx>

The generate_sched_domains() function currently handles both v1 and v2
logic. However, the underlying mechanisms for building scheduler domains
differ significantly between the two versions. For cpuset v2, scheduler
domains are straightforwardly derived from valid partitions, whereas
cpuset v1 employs a more complex union-find algorithm to merge overlapping
cpusets. Co-locating these implementations complicates maintenance.

This patch, along with subsequent ones, aims to separate the v1 and v2
logic. For ease of review, this patch first copies the
generate_sched_domains() function into cpuset-v1.c as
cpuset1_generate_sched_domains() and removes v2-specific code. Common
helpers and top_cpuset are declared in cpuset-internal.h. When operating
in v1 mode, the code now calls cpuset1_generate_sched_domains().

Currently there is some code duplication, which will be largely eliminated
once v1-specific code is removed from v2 in the following patch.

Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
---
kernel/cgroup/cpuset-internal.h | 24 +++++
kernel/cgroup/cpuset-v1.c | 167 ++++++++++++++++++++++++++++++++
kernel/cgroup/cpuset.c | 31 +-----
3 files changed, 195 insertions(+), 27 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 677053ffb913..bd767f8cb0ed 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -9,6 +9,7 @@
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
+#include <linux/sched/isolation.h>
/* See "Frequency meter" comments, below. */
@@ -185,6 +186,8 @@ struct cpuset {
#endif
};
+extern struct cpuset top_cpuset;
+
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
@@ -242,6 +245,22 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static inline int nr_cpusets(void)
+{
+ assert_cpuset_lock_held();

For a simple helper like this one which only does an atomic_read(), I don't think you need to assert that cpuset_mutex is held.

+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -298,6 +317,9 @@ void cpuset1_init(struct cpuset *cs);
void cpuset1_online_css(struct cgroup_subsys_state *css);
void update_domain_attr_tree(struct sched_domain_attr *dattr,
struct cpuset *root_cs);
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes);
+
#else
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
@@ -311,6 +333,8 @@ static inline void cpuset1_init(struct cpuset *cs) {}
static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
struct cpuset *root_cs) {}
+static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes) { return 0; };
#endif /* CONFIG_CPUSETS_V1 */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 95de6f2a4cc5..5c0bded46a7c 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -580,6 +580,173 @@ void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/*
+ * cpuset1_generate_sched_domains()
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ */
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ int nslot_update;
+
+ assert_cpuset_lock_held();
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (root_load_balance) {
+single_root_domain:
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ /*
+ * v1:
Remove this v1 line.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;

This check is v2 specific and you can remove it as well as the "single_root_domain" label.

Cheers,
Longman