[PATCH 23/23] cgroup/cpuset: Documentation and kselftest updates

From: Waiman Long

Date: Mon Apr 20 2026 - 23:15:38 EST


As CPU hotplug is now being used to enable runtime updates to the lists
of nohz_full and managed_irq CPUs, we should avoid using CPU 0 in the
formation of isolated partitions, as CPU 0 may not be able to be brought
offline, as is the case on the x86-64 architecture. So a number of the
test cases in test_cpuset_prs.sh have to be updated accordingly.

A new test will also be run if offline isn't allowed for CPU 0 to verify
that using CPU 0 as part of an isolated partition will fail.

The cgroup-v2.rst is also updated to reflect the new capability of using
CPU hotplug to enable run time change to the nohz_full and managed_irq
CPU lists.

Since there is a slight performance overhead to enable runtime changes
to the nohz_full CPU list, users have to explicitly opt in by adding a
"nohz_full" kernel command line parameter with or without a CPU list.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 35 +++++++---
.../selftests/cgroup/test_cpuset_prs.sh | 70 +++++++++++++++++--
2 files changed, 92 insertions(+), 13 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8ad0b2781317..e97fc031eb86 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2604,11 +2604,12 @@ Cpuset Interface Files

It accepts only the following input values when written to.

- ========== =====================================
+ ========== ===============================================
"member" Non-root member of a partition
"root" Partition root
- "isolated" Partition root without load balancing
- ========== =====================================
+ "isolated" Partition root without load balancing and other
+ OS noises
+ ========== ===============================================

A cpuset partition is a collection of cpuset-enabled cgroups with
a partition root at the top of the hierarchy and its descendants
@@ -2652,11 +2653,29 @@ Cpuset Interface Files
partition or scheduling domain. The set of exclusive CPUs is
determined by the value of its "cpuset.cpus.exclusive.effective".

- When set to "isolated", the CPUs in that partition will be in
- an isolated state without any load balancing from the scheduler
- and excluded from the unbound workqueues. Tasks placed in such
- a partition with multiple CPUs should be carefully distributed
- and bound to each of the individual CPUs for optimal performance.
+ When set to "isolated", the CPUs in that partition will be in an
+ isolated state without any load balancing from the scheduler and
+ excluded from the unbound workqueues as well as other OS noises.
+ Tasks placed in such a partition with multiple CPUs should be
+ carefully distributed and bound to each of the individual CPUs
+ for optimal performance.
+
+ CPU hotplug, if supported, is used to bring the degree of CPU
+ isolation close to that of the "nohz_full" kernel boot parameter.
+ In some architectures, like x86-64, the boot CPU (typically CPU
+ 0) cannot be brought offline, so the boot CPU should not be used
+ for forming isolated partitions. The "nohz_full" kernel boot
+ parameter needs to be present to enable full dynticks support
+ and RCU no-callback CPU mode for CPUs in isolated partitions
+ even if the optional cpu list isn't provided.
+
+ Using CPU hotplug for creating or destroying an isolated
+ partition can cause latency spikes in applications running
+ in other isolated partitions. A reserved list of CPUs can
+ optionally be put in the "nohz_full" kernel boot parameter to
+ alleviate this problem. When these reserved CPUs are used for
+ isolated partitions, CPU hotplug won't need to be invoked and
+ so there won't be latency spikes in other isolated partitions.

A partition root ("root" or "isolated") can be in one of the
two possible states - valid or invalid. An invalid partition
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index a56f4153c64d..eebb4122b581 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -67,6 +67,12 @@ then
echo Y > /sys/kernel/debug/sched/verbose
fi

+# Enable dynamic debug message if available
+DYN_DEBUG=/proc/dynamic_debug/control
+[[ -f $DYN_DEBUG ]] && {
+ echo "file kernel/cpu.c +p" > $DYN_DEBUG
+}
+
cd $CGROUP2
echo +cpuset > cgroup.subtree_control

@@ -84,6 +90,15 @@ echo member > test/cpuset.cpus.partition
echo "" > test/cpuset.cpus
[[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!"

+#
+# If nohz_full parameter is specified and nohz_full file exists, CPU hotplug
+# will be used to modify nohz_full cpumask to include all the isolated CPUs
+# in cpuset isolated partitions.
+#
+NOHZ_FULL=/sys/devices/system/cpu/nohz_full
+BOOT_NOHZ_FULL=$(fmt -1 /proc/cmdline | grep "^nohz_full")
+[[ "$BOOT_NOHZ_FULL" = nohz_full ]] && CHK_NOHZ_FULL=1
+
#
# If isolated CPUs have been reserved at boot time (as shown in
# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8
@@ -318,8 +333,8 @@ TEST_MATRIX=(
# Invalid to valid local partition direct transition tests
" C1-3:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3"
" C1-3:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3"
- " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0"
- " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3"
+ " C1-3:P2 . . C4-6 C1-4 . . . 0 A1:1-4|B1:5-6 A1:P2|B1:P0"
+ " C1-3:P2 . . C4-6 C1-4:C1-3 . . . 0 A1:1-3|B1:4-6 A1:P2|B1:P0 1-3"

# Local partition invalidation tests
" C0-3:X1-3:P2 C1-3:X2-3:P2 C2-3:X3:P2 \
@@ -329,8 +344,8 @@ TEST_MATRIX=(
" C0-3:X1-3:P2 C1-3:X2-3:P2 C2-3:X3:P2 \
. . C4:X . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3"
# Local partition CPU change tests
- " C0-5:P2 C4-5:P1 . . . C3-5 . . 0 A1:0-2|A2:3-5 A1:P2|A2:P1 0-2"
- " C0-5:P2 C4-5:P1 . . C1-5 . . . 0 A1:1-3|A2:4-5 A1:P2|A2:P1 1-3"
+ " C1-5:P2 C4-5:P1 . . . C3-5 . . 0 A1:1-2|A2:3-5 A1:P2|A2:P1 1-2"
+ " C1-5:P2 C4-5:P1 . . C2-5 . . . 0 A1:2-3|A2:4-5 A1:P2|A2:P1 2-3"

# cpus_allowed/exclusive_cpus update tests
" C0-3:X2-3 C1-3:X2-3 C2-3:X2-3 \
@@ -442,6 +457,21 @@ TEST_MATRIX=(
" C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5"
)

+#
+# Test matrix to verify that using CPU 0 in isolated (local or remote) partition
+# will fail when offline isn't allowed for CPU 0.
+#
+CPU0_ISOLCPUS_MATRIX=(
+ # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+ # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+ " C0-3 . . C4-5 P2 . . . 0 A1:0-3|B1:4-5 A1:P-2"
+ " C1-3 . . . P2 . . . 0 A1:1-3 A1:P2"
+ " C1-3 . . . P2:C0-3 . . . 0 A1:0-3 A1:P-2"
+ " CX0-3 C0-3 . . . P2 . . 0 A1:0-3|A2:0-3 A2:P-2"
+ " CX0-3 C0-3:X1-3 . . . P2 . . 0 A1:0|A2:1-3 A2:P2"
+ " CX0-3 C0-3:X1-3 . . . P2:X0-3 . . 0 A1:0-3|A2:0-3 A2:P-2"
+)
+
#
# Cpuset controller remote partition test matrix.
#
@@ -513,7 +543,7 @@ write_cpu_online()
}
fi
echo $VAL > $CPUFILE
- pause 0.05
+ pause 0.10
}

#
@@ -654,6 +684,8 @@ dump_states()
[[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)"
[[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)"
done
+ # Dump nohz_full
+ [[ -f $NOHZ_FULL ]] && echo "nohz_full: $(cat $NOHZ_FULL)"
}

#
@@ -789,6 +821,18 @@ check_isolcpus()
EXPECTED_SDOMAIN=$EXPECTED_ISOLCPUS
fi

+ #
+ # Check if nohz_full matches cpuset.cpus.isolated if the nohz_full
+ # boot parameter is specified without a CPU list.
+ #
+ [[ -f $NOHZ_FULL && "$BOOT_NOHZ_FULL" = nohz_full ]] && {
+ NOHZ_FULL_CPUS=$(cat $NOHZ_FULL)
+ [[ "$ISOLCPUS" != "$NOHZ_FULL_CPUS" ]] && {
+ echo "nohz_full ($NOHZ_FULL_CPUS) does not match cpuset.cpus.isolated ($ISOLCPUS)"
+ return 1
+ }
+ }
+
#
# Appending pre-isolated CPUs
# Even though CPU #8 isn't used for testing, it can't be pre-isolated
@@ -1070,6 +1114,21 @@ run_remote_state_test()
echo "All $I tests of $TEST PASSED."
}

+#
+# Test CPU 0 isolated partition handling when CPU 0 offline is disabled
+#
+run_cpu0_isol_test()
+{
+ # Skip the test if CPU0 offline is allowed or if nohz_full kernel
+ # boot parameter is missing.
+ CPU0_ONLINE=/sys/devices/system/cpu/cpu0/online
+ [[ -f $CPU0_ONLINE ]] && return
+ grep -q -w nohz_full /proc/cmdline
+ [[ $? -ne 0 ]] && return
+
+ run_state_test CPU0_ISOLCPUS_MATRIX
+}
+
#
# Testing the new "isolated" partition root type
#
@@ -1207,6 +1266,7 @@ test_inotify()
trap cleanup 0 2 3 6
run_state_test TEST_MATRIX
run_remote_state_test REMOTE_TEST_MATRIX
+run_cpu0_isol_test
test_isolated
test_inotify
echo "All tests PASSED."
--
2.53.0