Re: [PATCH v10 0/8] cgroup/cpuset: Major cpu partition code restructuring

From: Waiman Long
Date: Wed May 04 2022 - 14:36:33 EST


On 5/4/22 07:28, Michal Koutný wrote:
> Hello.
>
> On Tue, May 03, 2022 at 12:21:41PM -0400, Waiman Long <longman@xxxxxxxxxx> wrote:
> > v10:
> > - Relax constraints for changes made to "cpuset.cpus"
> > and "cpuset.cpus.partition" as suggested. Now almost all changes
> > are allowed.
>
> I see there were also some other changes from v9 (like the first patches
> of series).
> Any chance you have a public git repo with both versions for a
> convenient range-diff?

That is true. Both patches 1 and 2 are new, and the changes are pretty straightforward. Patch 1 of v9 has already been merged, but it contains a latent bug. Patch 4 of this series is a replacement for patch 3 ("cgroup/cpuset: Refining features and constraints of a partition") of v9. The other patches are similar to their v9 versions, with some adjustments for the different code base.

I don't have a public repo. Attached is the diff of cpuset.c between v9 and v10; note that it also includes some other unrelated cpuset patches.
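For quick orientation, the central conceptual change in v10 is the new partition_root_state encoding: an invalid partition state is the arithmetic negation of the corresponding valid state. Below is a minimal standalone sketch (illustration only, not kernel code; it reuses the PRS_* values from the attached diff) of how a state flips between valid and invalid with a plain sign change, which is what the "new_prs = -old_prs" transitions in the diff rely on:

#include <stdbool.h>
#include <stdio.h>

/* Same values as the PRS_* constants in the attached diff. */
enum prs_state {
	PRS_MEMBER		=  0,
	PRS_ROOT		=  1,
	PRS_ISOLATED		=  2,
	PRS_INVALID_ROOT	= -1,
	PRS_INVALID_ISOLATED	= -2,
};

static bool is_prs_invalid(int prs)
{
	return prs < 0;
}

int main(void)
{
	int prs = PRS_ISOLATED;

	/* A partition error flips a valid state to its invalid twin. */
	prs = -prs;
	printf("after error: prs=%d invalid=%d\n", prs, is_prs_invalid(prs));

	/* Recovery flips it back; root vs isolated is preserved. */
	prs = -prs;
	printf("after recovery: prs=%d invalid=%d\n", prs, is_prs_invalid(prs));
	return 0;
}

This is why PRS_INVALID_ROOT and PRS_INVALID_ISOLATED are -1 and -2: is_prs_invalid() reduces to a sign test, and a valid/invalid transition never loses whether the partition was "root" or "isolated".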

Cheers,
Longman
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8703a8452c33..90ee0e4d8d7e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -93,16 +93,20 @@ enum prs_errcode {
PERR_INVCPUS,
PERR_INVPARENT,
PERR_NOTPART,
+ PERR_NOTEXCL,
PERR_NOCPUS,
PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
};

static const char * const perr_strings[] = {
- [PERR_INVCPUS] = "Invalid change to cpuset.cpus",
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
[PERR_INVPARENT] = "Parent is an invalid partition root",
[PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
};

struct cpuset {
@@ -198,23 +202,22 @@ struct cpuset {
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
* 2 - partition root without load balancing (isolated)
- *
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
#define PRS_ISOLATED 2
-#define PRS_ERROR -1
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}

/*
* Temporary cpumasks for working with partitions that are passed among
@@ -294,30 +297,40 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}

+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+static inline void set_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
/*
* Send notification event of whenever partition_root_state changes.
*/
-static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
- if (old_prs == new_prs)
+ if (old_prs == cs->partition_root_state)
return;
cgroup_file_notify(&cs->partition_file);

/* Reset prs_err if not invalid */
- if (new_prs != PRS_ERROR)
+ if (is_partition_valid(cs))
WRITE_ONCE(cs->prs_err, PERR_NONE);
}

static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};

/**
@@ -459,7 +472,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
cpuset_for_each_child(child, css, cs) {
if (child == excluded_child)
continue;
- if (is_partition_root(child))
+ if (is_partition_valid(child))
continue;
if (cgroup_is_populated(child->css.cgroup)) {
rcu_read_unlock();
@@ -656,6 +669,35 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}

+/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -680,20 +722,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
-
- /* The checks don't apply to root cpuset */
- if (cur == &top_cpuset)
- return 0;
+ int ret = 0;

rcu_read_lock();
- par = parent_cs(cur);

- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
+
+ /* Remaining checks don't apply to root cpuset */
+ if (cur == &top_cpuset)
goto out;

+ par = parent_cs(cur);
+
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
@@ -869,7 +912,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));

goto done;
}
@@ -899,7 +942,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;

if (root_load_balance &&
@@ -911,7 +954,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;

/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -988,7 +1031,7 @@ static int generate_sched_domains(cpumask_var_t **domains,

if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);

@@ -1117,7 +1160,7 @@ static void rebuild_sched_domains_locked(void)
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1164,6 +1207,14 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;

+ /*
+ * TODO: With a cpuset partition that takes CPUs away from the top
+ * cpuset, we may want to properly adjust the cpus_allowed mask of
+ * tasks in the top cpuset as well.
+ */
+ if (cs == &top_cpuset)
+ return;
+
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
@@ -1203,13 +1254,15 @@ enum subparts_cmd {
partcmd_update, /* Update parent's subparts_cpus */
};

+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1217,43 +1270,32 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
*
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
* The partcmd_enable and partcmd_disable commands are used by
* update_prstate(). The partcmd_update command is used by
* update_cpumasks_hier() with newmask NULL and update_cpumask() with
* newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
int old_prs, new_prs;
- bool part_error = false; /* Partition error? */
+ int part_error = PERR_NONE; /* Partition error? */

percpu_rwsem_assert_held(&cpuset_rwsem);

@@ -1262,39 +1304,41 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling partition root is not allowed if there are online children.
- */
- if ((cmd == partcmd_enable) && css_has_online_children(&cpuset->css))
- return -EBUSY;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;

adding = deleting = false;
- old_prs = new_prs = cpuset->partition_root_state;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
/*
* Enabling partition root is not allowed if cpus_allowed
* doesn't overlap parent's cpus_allowed.
*/
- if (!cpumask_intersects(cpuset->cpus_allowed, parent->cpus_allowed))
- return -EINVAL;
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;

/*
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (partition_is_populated(parent, cpuset) &&
- cpumask_subset(parent->effective_cpus, cpuset->cpus_allowed))
- return -EINVAL;
+ if (partition_is_populated(parent, cs) &&
+ !cpumask_intersects(cs->cpus_allowed, parent->effective_cpus))
+ return PERR_NOCPUS;

- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for a valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
} else if (newmask) {
/*
@@ -1306,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);

@@ -1317,77 +1361,85 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
- part_error = partition_is_populated(parent, cpuset) &&
- cpumask_subset(parent->effective_cpus, tmp->addmask) &&
- !cpumask_intersects(tmp->delmask, cpu_active_mask);
-
- if ((READ_ONCE(cpuset->prs_err) == PERR_NONE) && part_error)
- WRITE_ONCE(cpuset->prs_err, PERR_INVCPUS);
+ if (adding && partition_is_populated(parent, cs) &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ }
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effective_cpus
- *
- * This gets invoked either due to a hotplug event or
- * from update_cpumasks_hier() where we can't return an
- * error. This can cause a partition root to become invalid
- * in the case of a hotplug.
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+ *
* A partition error happens when:
* 1) Cpuset is valid partition, but parent does not distribute
* out any CPUs.
* 2) Parent has tasks and all its effective CPUs will have
* to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = (is_partition_root(cpuset) &&
- !parent->nr_subparts_cpus) ||
- (cpumask_equal(parent->effective_cpus, tmp->addmask) &&
- partition_is_populated(parent, cpuset));
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }

- if (is_partition_root(cpuset) && part_error)
- WRITE_ONCE(cpuset->prs_err, PERR_NOCPUS);
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);

if (cmd == partcmd_update) {
/*
- * Check for possible transition between PRS_ERROR and
- * PRS_ENABLED/PRS_ISOLATED.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
case PRS_ISOLATED:
if (part_error)
- new_prs = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
- if (part_error)
- break;
- if (is_sched_load_balance(cpuset))
- new_prs = PRS_ENABLED;
- else
- new_prs = PRS_ISOLATED;
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
+ if (!part_error)
+ new_prs = -old_prs;
break;
}
}

- if ((old_prs == PRS_ERROR) && (new_prs == PRS_ERROR))
- return 0; /* Nothing need to be done */
-
- if (new_prs == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
- }
-
if (!adding && !deleting && (new_prs == old_prs))
return 0;

+ /*
+ * Transitioning from invalid to valid (partcmd_update) may require
+ * setting CS_CPU_EXCLUSIVE and clearing CS_SCHED_LOAD_BALANCE later.
+ */
+ if ((old_prs != new_prs) && is_prs_invalid(old_prs)) {
+ if (!is_cpu_exclusive(cs) &&
+ (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+ return PERR_NOTEXCL;
+ }
+
/*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
@@ -1414,12 +1466,25 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

if (old_prs != new_prs)
- cpuset->partition_root_state = new_prs;
+ cs->partition_root_state = new_prs;

spin_unlock_irq(&callback_lock);
- notify_partition_change(cpuset, old_prs, new_prs);

- return cmd == partcmd_update;
+ if (adding || deleting)
+ update_tasks_cpumask(parent);
+
+ /*
+ * Set or clear CS_SCHED_LOAD_BALANCE on partcmd_update, if necessary.
+ * rebuild_sched_domains_locked() may be called.
+ */
+ if ((old_prs != new_prs) && (cmd == partcmd_update)) {
+ if (old_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ else if (new_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ }
+ notify_partition_change(cs, old_prs);
+ return 0;
}

/*
@@ -1457,7 +1522,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* out all its CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
- if (is_partition_root(cp) &&
+ if (is_partition_valid(cp) &&
cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
goto update_parent_subparts;

@@ -1474,7 +1539,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,

/*
* Skip the whole subtree if the cpumask remains the same
- * with no partition root state and force flag not set.
+ * and has no partition root state and the force flag is not set.
*/
if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
@@ -1492,21 +1557,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
case PRS_ISOLATED:
update_parent = true;
break;

- case PRS_DISABLED:
- case PRS_ERROR:
+ default:
/*
* When parent is not a partition root or is
* invalid, child partition roots become
* invalid too.
*/
- new_prs = PRS_ERROR;
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
WRITE_ONCE(cp->prs_err,
- (parent->partition_root_state == PRS_ERROR)
+ is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART);
break;
}
@@ -1517,25 +1582,25 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
rcu_read_unlock();

if (update_parent) {
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
+ update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+ tmp);
/*
- * The cpuset partition_root_state may be changed
- * to PRS_ERROR. Capture it.
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
*/
new_prs = cp->partition_root_state;
}

spin_lock_irq(&callback_lock);

- if (cp->nr_subparts_cpus && (new_prs <= 0)) {
+ if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
/*
* Put all active subparts_cpus back to effective_cpus.
*/
cpumask_or(tmp->new_cpus, tmp->new_cpus,
cp->subparts_cpus);
cpumask_and(tmp->new_cpus, tmp->new_cpus,
- cpu_active_mask);
+ cpu_active_mask);
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
}
@@ -1552,7 +1617,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,

cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
- notify_partition_change(cp, old_prs, new_prs);
+
+ notify_partition_change(cp, old_prs);

WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1568,7 +1634,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;

rcu_read_lock();
@@ -1592,10 +1658,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;

+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1603,8 +1674,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;

+ rcu_read_unlock();
update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1662,27 +1738,35 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
#endif

if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp);
}

+ compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+ parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Make sure that subparts_cpus, if not empty, is a subset of
+ * cpus_allowed. Clear subparts_cpus if there is an error or if
+ * the effective cpus would become empty with tasks present.
*/
if (cs->nr_subparts_cpus) {
- cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ if (cs->prs_err ||
+ (partition_is_populated(cs, NULL) &&
+ cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus))) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ } else {
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
}
spin_unlock_irq(&callback_lock);

+ /* effective_cpus will be updated here */
update_cpumasks_hier(cs, &tmp, false);

if (cs->partition_root_state) {
@@ -2059,16 +2143,17 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
return err;
}

-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
* Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err, old_prs = cs->partition_root_state;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2077,28 +2162,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
return 0;

/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * For a previously invalid partition root, leave it as invalid
+ * if new_prs is not "member".
*/
- if (new_prs && (old_prs == PRS_ERROR))
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs)) {
+ cs->partition_root_state = -new_prs;
+ return 0;
+ }

if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;

- err = -EINVAL;
if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cannot be empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }

err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ if (err) {
+ err = PERR_NOTEXCL;
goto out;
+ }

err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
@@ -2120,16 +2210,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
* A change in load balance state only, no change in cpumasks.
*/
update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
- err = 0;
+ sched_domain_rebuilt = true;
goto out; /* Sched domain is rebuilt in update_flag() */
} else {
/*
- * Switch back to member is always allowed even if it
+ * Switching back to member is always allowed even if it
* disables child partitions.
*/
- err = 0;
update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
&tmpmask);
+
/*
* If there are child partitions, they will all become invalid.
*/
@@ -2151,12 +2241,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
}

- /*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
- */
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent);

if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2164,20 +2249,24 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (!sched_domain_rebuilt)
rebuild_sched_domains_locked();
out:
- if (!err) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = new_prs;
- spin_unlock_irq(&callback_lock);
- /*
- * Update child cpusets when disabling partition.
- */
- if (new_prs == PRS_DISABLED && !list_empty(&cs->css.children))
- update_cpumasks_hier(cs, &tmpmask, true);
- notify_partition_change(cs, old_prs, new_prs);
- }
+ /*
+ * Make partition invalid if an error happens
+ */
+ if (err)
+ new_prs = -new_prs;
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ /*
+ * Update child cpusets, if present.
+ * Force update if switching back to member.
+ */
+ if (!list_empty(&cs->css.children))
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);

+ notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
- return err;
+ return 0;
}

/*
@@ -2361,6 +2450,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);

+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);

guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2414,6 +2504,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);

percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}

/* The various types of files and directories in a cpuset file system */
@@ -2669,26 +2760,32 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
- const char *err, *type;
+ const char *err, *type = NULL;

switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
case PRS_ISOLATED:
seq_puts(seq, "isolated\n");
break;
- case PRS_DISABLED:
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- type = is_sched_load_balance(cs) ? "root" : "isolated";
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
err = perr_strings[READ_ONCE(cs->prs_err)];
if (err)
seq_printf(seq, "%s invalid (%s)\n", type, err);
else
seq_printf(seq, "%s invalid\n", type);
break;
+ seq_puts(seq, "isolated invalid\n");
+ break;
}
return 0;
}
@@ -2706,9 +2803,9 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
else if (!strcmp(buf, "isolated"))
val = PRS_ISOLATED;
else
@@ -2960,7 +3057,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -3009,7 +3106,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);

- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);

if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -3157,7 +3254,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,

/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
@@ -3186,7 +3283,7 @@ hotplug_update_tasks(struct cpuset *cs,
bool cpus_updated, bool mems_updated)
{
/* A partition root is allowed to have empty effective cpus */
- if (cpumask_empty(new_cpus) && !is_partition_root(cs))
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3259,8 +3356,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
* partitions, if present, by setting nr_subparts_cpus to 0 to
* reclaim their cpus.
*/
- if (is_partition_root(cs) && cpumask_empty(&new_cpus) &&
- cs->nr_subparts_cpus && partition_is_populated(cs, NULL)) {
+ if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+ cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
@@ -3271,16 +3368,15 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
/*
* Force the partition to become invalid if either one of
* the following conditions hold:
- * 1) empty effective cpus with tasks in partition
- * 2) parent is invalid or doesn't grant any cpus to child partitions.
+ * 1) empty effective cpus but not a valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
*/
- if (is_partition_root(cs) &&
- ((cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) ||
- !parent->nr_subparts_cpus)) {
+ if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+ (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
int old_prs, parent_prs;

- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
if (cs->nr_subparts_cpus) {
spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
@@ -3291,29 +3387,30 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)

old_prs = cs->partition_root_state;
parent_prs = parent->partition_root_state;
- if (old_prs != PRS_ERROR) {
+ if (is_partition_valid(cs)) {
spin_lock_irq(&callback_lock);
- cs->partition_root_state = PRS_ERROR;
+ set_partition_invalid(cs);
spin_unlock_irq(&callback_lock);
- if (parent_prs == PRS_ERROR)
+ if (is_prs_invalid(parent_prs))
WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
else if (!parent_prs)
WRITE_ONCE(cs->prs_err, PERR_NOTPART);
else
WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
- notify_partition_change(cs, old_prs, PRS_ERROR);
+ notify_partition_change(cs, old_prs);
}
cpuset_force_rebuild();
}

/*
- * On the other hand, an erroneous partition root may be transitioned
+ * On the other hand, an invalid partition root may be transitioned
* back to a regular one.
*/
- else if (is_partition_root(parent) &&
- (cs->partition_root_state == PRS_ERROR) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+ if (is_partition_valid(cs))
+ cpuset_force_rebuild();
+ }

update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
@@ -3619,8 +3716,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}

-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3662,7 +3759,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;

if (in_interrupt())
@@ -3791,8 +3888,8 @@ void cpuset_print_current_mems_allowed(void)

int cpuset_memory_pressure_enabled __read_mostly;

-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3807,7 +3904,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */

void __cpuset_memory_pressure_bump(void)
{
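To illustrate the relaxed v10 semantics from userspace (a hedged sketch; the cgroup path below is hypothetical and depends on the local cgroup v2 mount), a write to cpuset.cpus.partition that cannot currently be satisfied is now accepted instead of failing, and the resulting invalid state, with one of the perr_strings reasons above, shows up on a subsequent read:

#include <stdio.h>

int main(void)
{
	/* Hypothetical cgroup path, for illustration only. */
	const char *path = "/sys/fs/cgroup/test/cpuset.cpus.partition";
	char buf[128];
	FILE *f;

	f = fopen(path, "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("root\n", f);	/* accepted even if not satisfiable */
	fclose(f);

	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/*
	 * Prints e.g. "root" when the request was granted, or
	 * "root invalid (Invalid cpu list in cpuset.cpus)" when not.
	 */
	if (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);
	fclose(f);
	return 0;
}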