[PATCH 3/4] fs/resctrl: make a failed kernel-mode switch a no-op

From: Qinyun Tan

Date: Thu Jun 11 2026 - 07:21:49 EST


resctrl_kernel_mode_write() tears down the previous kernel-mode (PLZA)
binding via rdtgroup_config_kmode_clear() before programming the new
one with rdtgroup_config_kmode(). If programming the new binding then
fails (e.g. -ENOMEM on cpumask allocation, or -EINVAL for a
pseudo-locked target group), the old binding has already been disabled
in hardware: PLZA_EN is cleared on its CPUs, the old group's kmode
bookkeeping is reset and its kmode_cpus files are hidden.

This is unacceptable from a user's point of view: a user who merely
tries to switch an already-working kernel_mode configuration to a new
one ends up losing the original configuration as well once the new
switch fails. A failed mode switch must be a no-op and leave the
existing binding untouched.

Make the switch atomic by performing all fallible work up front, before
the old binding is released:

 - Move the pseudo-locked group check into resctrl_kernel_mode_write()
   so it runs before the old binding is torn down.
 - Pre-allocate the enable/disable cpumasks in the writer and pass them
   to rdtgroup_config_kmode(), which can then no longer fail and is
   turned into a void function.
 - Turn rdtgroup_config_kmode_clear() into a void function as well: the
   arch hook resctrl_arch_configure_kmode() takes a const cpumask and
   the CPUs to disable are always a ready-made set (the group's
   kmode_cpu_mask, or cpu_online_mask when it is empty), so the
   temporary cpumask it used to allocate is unnecessary and is removed,
   eliminating its only failure point.

After this change, rdtgroup_config_kmode_clear() (the release of the old
binding) is only reached once everything needed to program the new
binding is already in hand, so the subsequent programming step cannot
fail. On any error, resctrl_kernel_mode_write() returns with the
hardware state and resctrl_kcfg left exactly as they were, preserving
the user's existing configuration.

Signed-off-by: Qinyun Tan <qinyuntan@xxxxxxxxxxxxxxxxx>
Reviewed-by: Xunlei Pang <xlpang@xxxxxxxxxxxxxxxxx>
---
fs/resctrl/rdtgroup.c | 132 +++++++++++++++++++++---------------------
1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 1d9bd34eb9cdd..cef6ab75c2b57 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1315,6 +1315,11 @@ static void resctrl_kmode_files_set_visible(struct rdtgroup *rdtgrp, bool visibl
* the RMID_EN setting: BIT(GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU)
* sets RMID_EN so CPL=0 uses the PLZA RMID; other modes clear it
* so CPL=0 inherits the user process's RMID from PQR_ASSOC.
+ * @enable_mask: Caller-provided scratch cpumask used to compute the set of
+ * CPUs to enable. Pre-allocated by the caller so this function
+ * cannot fail after the previous binding has been torn down.
+ * @disable_mask: Caller-provided scratch cpumask used to compute the set of
+ * CPUs to disable. Same rationale as @enable_mask.
*
* Derives CLOSID/RMID from @rdtgrp->type:
* - RDTMON_GROUP: parent control group's CLOSID with the monitor group's RMID.
@@ -1326,30 +1331,22 @@ static void resctrl_kmode_files_set_visible(struct rdtgroup *rdtgrp, bool visibl
* stale enable bits from a previously bound group are cleared in the same
* reprogram step.
*
- * Context: Caller must hold rdtgroup_mutex.
+ * All fallible work (pseudo-lock validation and cpumask allocation) is done
+ * by the caller before the previous binding is released, so this function
+ * never fails and is therefore void: a failed mode switch can leave the
+ * existing configuration untouched.
*
- * Return: 0 on success, -EINVAL for a pseudo-locked group, -ENOMEM if
- * cpumask allocation fails.
+ * Context: Caller must hold rdtgroup_mutex. @rdtgrp must not be
+ * pseudo-locked (validated by the caller).
*/
-static int rdtgroup_config_kmode(struct rdtgroup *rdtgrp, u32 kmode)
+static void rdtgroup_config_kmode(struct rdtgroup *rdtgrp, u32 kmode,
+ struct cpumask *enable_mask,
+ struct cpumask *disable_mask)
{
- cpumask_var_t enable_mask, disable_mask;
u32 closid, rmid;
bool need_disable, assign_rmid;

- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- rdt_last_cmd_puts("Resource group is pseudo-locked\n");
- return -EINVAL;
- }
-
- if (!zalloc_cpumask_var(&enable_mask, GFP_KERNEL))
- return -ENOMEM;
-
need_disable = !cpumask_empty(&rdtgrp->kmode_cpu_mask);
- if (need_disable && !zalloc_cpumask_var(&disable_mask, GFP_KERNEL)) {
- free_cpumask_var(enable_mask);
- return -ENOMEM;
- }

if (rdtgrp->type == RDTMON_GROUP) {
closid = rdtgrp->mon.parent->closid;
@@ -1383,12 +1380,6 @@ static int rdtgroup_config_kmode(struct rdtgroup *rdtgrp, u32 kmode)

rdtgrp->kmode = true;
resctrl_kmode_files_set_visible(rdtgrp, true);
-
- free_cpumask_var(enable_mask);
- if (need_disable)
- free_cpumask_var(disable_mask);
-
- return 0;
}

/**
@@ -1409,30 +1400,24 @@ static int rdtgroup_config_kmode(struct rdtgroup *rdtgrp, u32 kmode)
* skips rdtgroup_config_kmode() entirely -- still tears down the previously
* bound group instead of leaving stale enable bits behind.
*
- * On allocation failure the function returns -ENOMEM and leaves both the
- * hardware state and @rdtgrp's bookkeeping unchanged so the caller can fail
- * the operation atomically and last_cmd_status reflects reality.
+ * The arch hook reads the CPU mask without modifying it, so the CPUs to
+ * disable are passed directly (the group's @kmode_cpu_mask, or cpu_online_mask
+ * when that mask is empty) without needing a temporary cpumask. The function
+ * therefore cannot fail.
*
* Context: Caller must hold rdtgroup_mutex.
- *
- * Return: 0 on success (including the @rdtgrp == %NULL and INHERIT cases),
- * -ENOMEM if cpumask allocation fails.
*/
-static int rdtgroup_config_kmode_clear(struct rdtgroup *rdtgrp, int kmode)
+static void rdtgroup_config_kmode_clear(struct rdtgroup *rdtgrp, int kmode)
{
- cpumask_var_t disable_mask;
bool assign_rmid;
u32 closid, rmid;

if (!rdtgrp)
- return 0;
+ return;

if (kmode == BIT(INHERIT_CTRL_AND_MON))
goto out_clear;

- if (!zalloc_cpumask_var(&disable_mask, GFP_KERNEL))
- return -ENOMEM;
-
if (rdtgrp->type == RDTMON_GROUP) {
closid = rdtgrp->mon.parent->closid;
rmid = rdtgrp->mon.rmid;
@@ -1443,20 +1428,14 @@ static int rdtgroup_config_kmode_clear(struct rdtgroup *rdtgrp, int kmode)

assign_rmid = (kmode == BIT(GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU));

- if (cpumask_empty(&rdtgrp->kmode_cpu_mask))
- cpumask_copy(disable_mask, cpu_online_mask);
- else
- cpumask_copy(disable_mask, &rdtgrp->kmode_cpu_mask);
-
- resctrl_arch_configure_kmode(disable_mask, closid, rmid,
- assign_rmid, false);
- free_cpumask_var(disable_mask);
+ resctrl_arch_configure_kmode(cpumask_empty(&rdtgrp->kmode_cpu_mask) ?
+ cpu_online_mask : &rdtgrp->kmode_cpu_mask,
+ closid, rmid, assign_rmid, false);

out_clear:
cpumask_clear(&rdtgrp->kmode_cpu_mask);
resctrl_kmode_files_set_visible(rdtgrp, false);
rdtgrp->kmode = false;
- return 0;
}

/**
@@ -1472,11 +1451,6 @@ static int rdtgroup_config_kmode_clear(struct rdtgroup *rdtgrp, int kmode)
* (&rdtgroup_default, BIT(INHERIT_CTRL_AND_MON)) so subsequent show/write
* paths do not dereference @rdtgrp after the caller frees it.
*
- * If the underlying tear-down fails (cpumask allocation), the snapshot is
- * still reset because @rdtgrp is about to disappear; stale enable bits on
- * those CPUs are reported via pr_warn() and will be cleared by the next
- * non-INHERIT reprogram.
- *
* Context: Caller must hold rdtgroup_mutex.
*/
static void rdtgroup_config_kmode_delete(struct rdtgroup *rdtgrp)
@@ -1484,8 +1458,7 @@ static void rdtgroup_config_kmode_delete(struct rdtgroup *rdtgrp)
if (!rdtgrp || !rdtgrp->kmode)
return;

- if (rdtgroup_config_kmode_clear(rdtgrp, resctrl_kcfg.kmode_cur))
- pr_warn("resctrl: kernel-mode disable failed; stale enable bits may persist\n");
+ rdtgroup_config_kmode_clear(rdtgrp, resctrl_kcfg.kmode_cur);

if (resctrl_kcfg.k_rdtgrp == rdtgrp) {
resctrl_kcfg.k_rdtgrp = &rdtgroup_default;
@@ -1565,12 +1538,15 @@ static struct rdtgroup *rdtgroup_by_kmode_path(const char *ctrl_name,
* Parses @buf, validates that <mode> is listed in resctrl_mode_str[] and is
* supported by the platform (resctrl_kcfg.kmode), resolves <ctrl>/<mon>/ to
* an existing rdtgroup (or picks &rdtgroup_default if no group was specified
- * or if the new mode is INHERIT), clears any previous binding via
- * rdtgroup_config_kmode_clear(), programs hardware via
+ * or if the new mode is INHERIT). All fallible validation and cpumask
+ * allocation are performed before the previous binding is released via
+ * rdtgroup_config_kmode_clear(); hardware is then programmed via
* rdtgroup_config_kmode() when @kmode is not BIT(INHERIT_CTRL_AND_MON), and
- * on success updates resctrl_kcfg.k_rdtgrp and resctrl_kcfg.kmode_cur. The
- * display-only "group=none" form produced by show for inactive modes is
- * rejected. Errors are reported in last_cmd_status.
+ * resctrl_kcfg.k_rdtgrp / resctrl_kcfg.kmode_cur are updated. Because every
+ * failure is detected up front, a failed switch is a no-op that leaves the
+ * user's existing configuration intact. The display-only "group=none" form
+ * produced by show for inactive modes is rejected. Errors are reported in
+ * last_cmd_status.
*
* Return: @nbytes on success, negative errno with last_cmd_status set on error.
*/
@@ -1580,6 +1556,8 @@ static ssize_t resctrl_kernel_mode_write(struct kernfs_open_file *of,
char *mode_str, *group_str, *slash;
const char *ctrl_name, *mon_name;
struct rdtgroup *rdtgrp;
+ cpumask_var_t enable_mask, disable_mask;
+ bool masks_allocated = false;
int ret = 0;
size_t len;
u32 kmode;
@@ -1696,25 +1674,47 @@ static ssize_t resctrl_kernel_mode_write(struct kernfs_open_file *of,
goto out_unlock;
}

- /* Release the old binding first. */
- ret = rdtgroup_config_kmode_clear(resctrl_kcfg.k_rdtgrp, resctrl_kcfg.kmode_cur);
- if (ret) {
- rdt_last_cmd_puts("Failed to release previous kernel-mode binding\n");
- goto out_unlock;
- }
-
+ /*
+ * Validate and acquire everything that can fail BEFORE the old
+ * binding is released, so a failed mode switch is a true no-op and
+ * the user's existing (working) configuration is left intact.
+ */
if (kmode != BIT(INHERIT_CTRL_AND_MON)) {
- ret = rdtgroup_config_kmode(rdtgrp, kmode);
- if (ret) {
- rdt_last_cmd_puts("Kernel mode change failed\n");
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ rdt_last_cmd_puts("Resource group is pseudo-locked\n");
+ ret = -EINVAL;
goto out_unlock;
}
+ if (!zalloc_cpumask_var(&enable_mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ if (!zalloc_cpumask_var(&disable_mask, GFP_KERNEL)) {
+ free_cpumask_var(enable_mask);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ masks_allocated = true;
}

+ /*
+ * Past this point nothing can fail: the old binding is released (a
+ * void operation) and the new one is programmed with the pre-allocated
+ * masks, so the switch is atomic.
+ */
+ rdtgroup_config_kmode_clear(resctrl_kcfg.k_rdtgrp, resctrl_kcfg.kmode_cur);
+
+ if (masks_allocated)
+ rdtgroup_config_kmode(rdtgrp, kmode, enable_mask, disable_mask);
+
resctrl_kcfg.k_rdtgrp = rdtgrp;
resctrl_kcfg.kmode_cur = kmode;

out_unlock:
+ if (masks_allocated) {
+ free_cpumask_var(enable_mask);
+ free_cpumask_var(disable_mask);
+ }
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
return ret ?: nbytes;
--
2.43.7