Re: [RFC PATCH 13/19] x86/resctrl: Add PLZA state tracking and context switch handling
From: Luck, Tony
Date: Tue Feb 17 2026 - 16:45:31 EST
> >>> I'm not sure if this would happen in the real world or not.
> >>
> >> Ack. I would like to echo Tony's request for feedback from resctrl users
> >> https://lore.kernel.org/lkml/aYzcpuG0PfUaTdqt@agluck-desk3/
> >
> > Indeed. This is all getting a bit complicated.
> >
>
> ack
We have several proposals so far:
1) Ben's suggestion to use the default group (either with a Babu-style
"plza" file just in that group, or a configuration file under "info/").
This is easily the simplest for implementation, but has no flexibility.
Also requires users to move all the non-critical workloads out to other
CTRL_MON groups. Doesn't steal a CLOSID/RMID.
2) My thoughts are for a separate group that is only used to configure
the schemata. This does allocate a dedicated CLOSID/RMID pair. Those
are used for all tasks when in kernel mode.
No context switch overhead. Has some flexibility.
3) Babu's RFC patch. Designates an existing CTRL_MON group as the one
that defines kernel CLOSID/RMID. Tasks and CPUs can be assigned to this
group in addition to belonging to another group that defines schemata
resources when running in non-kernel mode.
Tasks aren't required to be in the kernel group, in which case they
keep the same CLOSID in both user and kernel mode. When used in this
way there will be context switch overhead when changing between tasks
with different kernel CLOSID/RMID.
4) Even more complex scenarios with more than one user configurable
kernel group to give more options on resources available in the kernel.
I had a quick pass at coding my option "2". My UI to designate the
group to use for kernel mode is to reserve the name "kernel_group"
when making CTRL_MON groups. There are some tweaks to avoid creating
the "tasks", "cpus", and "cpus_list" files (which might be done more
elegantly), and the "mon_groups" directory in this group.
I just have stubs in the arch/x86 core.c file for enumeration and
enable/disable. Just realized I'm missing a call to disable on
unmount of the resctrl file system.
Apart from umount, I think it is more or less complete, and fairly
compact:
arch/x86/kernel/cpu/resctrl/core.c | 25 +++++++++++++++++++++++++
fs/resctrl/internal.h | 9 +++++++--
fs/resctrl/rdtgroup.c | 49 ++++++++++++++++++++++++++++++++++++-------------
include/linux/resctrl.h | 4 ++++
4 files changed, 72 insertions(+), 15 deletions(-)
-Tony
---
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 006e57fd7ca5..540ab9d7621a 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -702,6 +702,10 @@ bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r);
extern unsigned int resctrl_rmid_realloc_threshold;
extern unsigned int resctrl_rmid_realloc_limit;
+bool resctrl_arch_kernel_group_is_supported(void);
+void resctrl_arch_kernel_group_enable(u32 closid, u32 rmid);
+void resctrl_arch_kernel_group_disable(void);
+
int resctrl_init(void);
void resctrl_exit(void);
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 1a9b29119f88..99fbdcaf3c63 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -156,6 +156,7 @@ extern bool resctrl_mounted;
enum rdt_group_type {
RDTCTRL_GROUP = 0,
RDTMON_GROUP,
+ RDTKERNEL_GROUP,
RDT_NUM_GROUP,
};
@@ -245,6 +246,8 @@ struct rdtgroup {
#define RFTYPE_BASE BIT(1)
+#define RFTYPE_TASKS_CPUS BIT(2)
+
#define RFTYPE_CTRL BIT(4)
#define RFTYPE_MON BIT(5)
@@ -267,9 +270,11 @@ struct rdtgroup {
#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_TASKS_CPUS | RFTYPE_CTRL)
+
+#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_TASKS_CPUS | RFTYPE_MON)
-#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
+#define RFTYPE_KERNEL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
/* List of all resource groups */
extern struct list_head rdt_all_groups;
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 7667cf7c4e94..94d20b200e47 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -733,6 +733,28 @@ static void clear_closid_rmid(int cpu)
RESCTRL_RESERVED_CLOSID);
}
+static bool kernel_group_is_enabled;
+static u32 kernel_group_closid, kernel_group_rmid;
+
+bool resctrl_arch_kernel_group_is_supported(void)
+{
+ return true;
+}
+
+void resctrl_arch_kernel_group_enable(u32 closid, u32 rmid)
+{
+ pr_info("Enable kernel group on all CPUs here closid=%u rmid=%u\n", closid, rmid);
+ kernel_group_closid = closid;
+ kernel_group_rmid = rmid;
+ kernel_group_is_enabled = true;
+}
+
+void resctrl_arch_kernel_group_disable(void)
+{
+ pr_info("Disable kernel group on all CPUs here\n");
+ kernel_group_is_enabled = false;
+}
+
static int resctrl_arch_online_cpu(unsigned int cpu)
{
struct rdt_resource *r;
@@ -743,6 +765,9 @@ static int resctrl_arch_online_cpu(unsigned int cpu)
mutex_unlock(&domain_list_lock);
clear_closid_rmid(cpu);
+ if (kernel_group_is_enabled)
+ pr_info("Enable kernel group on CPU:%d closid=%u rmid=%u\n",
+ cpu, kernel_group_closid, kernel_group_rmid);
resctrl_online_cpu(cpu);
return 0;
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index ba8d503551cd..0d396569a76a 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -2046,7 +2046,7 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_cpus_write,
.seq_show = rdtgroup_cpus_show,
- .fflags = RFTYPE_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_TASKS_CPUS,
},
{
.name = "cpus_list",
@@ -2055,7 +2055,7 @@ static struct rftype res_common_files[] = {
.write = rdtgroup_cpus_write,
.seq_show = rdtgroup_cpus_show,
.flags = RFTYPE_FLAGS_CPUS_LIST,
- .fflags = RFTYPE_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_TASKS_CPUS,
},
{
.name = "tasks",
@@ -2063,14 +2063,14 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_tasks_write,
.seq_show = rdtgroup_tasks_show,
- .fflags = RFTYPE_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_TASKS_CPUS,
},
{
.name = "mon_hw_id",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_rmid_show,
- .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
+ .fflags = RFTYPE_BASE | RFTYPE_MON | RFTYPE_DEBUG,
},
{
.name = "schemata",
@@ -2078,7 +2078,7 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_schemata_write,
.seq_show = rdtgroup_schemata_show,
- .fflags = RFTYPE_CTRL_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_CTRL,
},
{
.name = "mba_MBps_event",
@@ -2093,14 +2093,14 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_mode_write,
.seq_show = rdtgroup_mode_show,
- .fflags = RFTYPE_CTRL_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_CTRL,
},
{
.name = "size",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_size_show,
- .fflags = RFTYPE_CTRL_BASE,
+ .fflags = RFTYPE_BASE | RFTYPE_CTRL,
},
{
.name = "sparse_masks",
@@ -2114,7 +2114,7 @@ static struct rftype res_common_files[] = {
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_closid_show,
- .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
+ .fflags = RFTYPE_BASE | RFTYPE_CTRL | RFTYPE_DEBUG,
},
};
@@ -3788,11 +3788,15 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
}
if (rtype == RDTCTRL_GROUP) {
- files = RFTYPE_BASE | RFTYPE_CTRL;
+ files = RFTYPE_CTRL_BASE;
+ if (resctrl_arch_mon_capable())
+ files |= RFTYPE_MON_BASE;
+ } else if (rtype == RDTKERNEL_GROUP) {
+ files = RFTYPE_KERNEL_BASE;
if (resctrl_arch_mon_capable())
files |= RFTYPE_MON;
} else {
- files = RFTYPE_BASE | RFTYPE_MON;
+ files = RFTYPE_MON_BASE;
}
ret = rdtgroup_add_files(kn, files);
@@ -3866,12 +3870,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
const char *name, umode_t mode)
{
+ enum rdt_group_type rtype = RDTCTRL_GROUP;
struct rdtgroup *rdtgrp;
struct kernfs_node *kn;
u32 closid;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
+ if (!strcmp(name, "kernel_group")) {
+ if (!resctrl_arch_kernel_group_is_supported()) {
+ rdt_last_cmd_puts("No support for kernel group\n");
+ return -EINVAL;
+ }
+ rtype = RDTKERNEL_GROUP;
+ }
+
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, rtype, &rdtgrp);
if (ret)
return ret;
@@ -3898,7 +3911,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
- if (resctrl_arch_mon_capable()) {
+ if (rtype == RDTCTRL_GROUP && resctrl_arch_mon_capable()) {
/*
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
@@ -3912,6 +3925,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
rdtgrp->mba_mbps_event = mba_mbps_default_event;
}
+ if (rtype == RDTKERNEL_GROUP)
+ resctrl_arch_kernel_group_enable(rdtgrp->closid, rdtgrp->mon.rmid);
+
goto out_unlock;
out_del_list:
@@ -4005,6 +4021,11 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
u32 closid, rmid;
int cpu;
+ if (rdtgrp->type == RDTKERNEL_GROUP) {
+ resctrl_arch_kernel_group_disable();
+ goto skip_tasks_and_cpus;
+ }
+
/* Give any tasks back to the default group */
rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
@@ -4025,6 +4046,7 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
+skip_tasks_and_cpus:
rdtgroup_unassign_cntrs(rdtgrp);
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
@@ -4073,7 +4095,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
* If the rdtgroup is a mon group and parent directory
* is a valid "mon_groups" directory, remove the mon group.
*/
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+ if ((rdtgrp->type == RDTCTRL_GROUP || rdtgrp->type == RDTKERNEL_GROUP) &&
+ parent_kn == rdtgroup_default.kn &&
rdtgrp != &rdtgroup_default) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {