Re: [PATCH v11 17/23] x86/resctrl: Auto assign/unassign counters when mbm_cntr_assign is enabled
From: Reinette Chatre
Date: Thu Feb 06 2025 - 13:03:40 EST
Hi Babu,
On 1/22/25 12:20 PM, Babu Moger wrote:
> Assign/unassign counters on resctrl group creation/deletion. Two counters
> are required per group, one for MBM total event and one for MBM local
> event.
>
> There are a limited number of counters available for assignment. If these
> counters are exhausted, the kernel will display the error message: "Out of
> MBM assignable counters". However, it is not necessary to fail the
> creation of a group due to assignment failures. Users have the flexibility
> to modify the assignments at a later time.
>
> Signed-off-by: Babu Moger <babu.moger@xxxxxxx>
> ---
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index b6d188d0f9b7..118b39fbb01e 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -1557,3 +1557,30 @@ int resctrl_unassign_cntr_event(struct rdt_resource *r, struct rdt_mon_domain *d
>
> return ret;
> }
> +
> +void mbm_cntr_reset(struct rdt_resource *r)
> +{
> + u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> + struct rdt_mon_domain *dom;
> +
> + /*
> + * Reset the domain counter configuration. Hardware counters
> + * will reset after switching the monitor mode. So, reset the
> + * architectural amd non-architectural state so that reading
"amd" -> "and"
> + * of hardware counter is not considered as an overflow in the
> + * next update.
> + */
> + if (is_mbm_enabled() && r->mon.mbm_cntr_assignable) {
> + list_for_each_entry(dom, &r->mon_domains, hdr.list) {
> + memset(dom->cntr_cfg, 0,
> + sizeof(*dom->cntr_cfg) * r->mon.num_mbm_cntrs);
> + if (is_mbm_total_enabled())
> + memset(dom->mbm_total, 0,
> + sizeof(struct mbm_state) * idx_limit);
> + if (is_mbm_local_enabled())
> + memset(dom->mbm_local, 0,
> + sizeof(struct mbm_state) * idx_limit);
> + resctrl_arch_reset_rmid_all(r, dom);
> + }
> + }
> +}
I looked back at the previous versions to better understand how this function
came about and I do not think it actually solves the problem it aims to solve.
rdtgroup_unassign_cntrs() can fail and when it does the counter is not free'd. That
leaves a monitoring domain's array with an entry that points to a resource group
that no longer exists (unless it is the default resource group) since
rdtgroup_unassign_cntrs() does not check the return and proceeds to remove the
resource group. mbm_cntr_reset() is called on umount of resctrl but
rdtgroup_unassign_cntrs() is called on every group remove and those scenarios
are not handled.
To address this I believe that I need to go back on a previous request to have
resctrl_arch_config_cntr() return an error code. AMD does not need this and
it is difficult to predict what will work for MPAM. I originally wanted to be
flexible here but this appears to be impractical. With a new requirement that
resctrl_arch_config_cntr() always succeeds the counter will in turn always
be free'd and not leave dangling pointers. I believe doing so eliminates
the need for mbm_cntr_reset() as used in this patch. My apologies for the
misdirection. We can re-evaluate these flows if MPAM needs anything different.
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 2b86124c336b..f61f0cd032ef 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -2687,6 +2687,46 @@ static void schemata_list_destroy(void)
> }
> }
>
> +/*
> + * Called when a new group is created. If "mbm_cntr_assign" mode is enabled,
> + * counters are automatically assigned. Each group can accommodate two counters:
> + * one for the total event and one for the local event. Assignments may fail
> + * due to the limited number of counters. However, it is not necessary to fail
> + * the group creation and thus no failure is returned. Users have the option
> + * to modify the counter assignments after the group has been created.
> + */
> +static void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp)
> +{
> + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> +
> + if (!resctrl_arch_mbm_cntr_assign_enabled(r))
> + return;
> +
> + if (is_mbm_total_enabled())
> + resctrl_assign_cntr_event(r, NULL, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID);
> +
> + if (is_mbm_local_enabled())
> + resctrl_assign_cntr_event(r, NULL, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID);
> +}
> +
> +/*
> + * Called when a group is deleted. Counters are unassigned if it was in
> + * assigned state.
> + */
> +static void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp)
> +{
> + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> +
> + if (!resctrl_arch_mbm_cntr_assign_enabled(r))
> + return;
> +
It looks to me as though there are a couple of places (rmdir_all_sub(), rdt_kill_sb(),
and rdtgroup_rmdir_ctrl()) where rdtgroup_unassign_cntrs() could be called on a system that
does not support monitoring and/or only supports cache allocation monitoring.
In these paths it is only the architecture's resctrl_arch_mbm_cntr_assign_enabled(r) that
gates the resctrl flow. I think rdtgroup_unassign_cntrs() and to match rdtgroup_assign_cntrs()
can do with at least a r->mon_capable check.
> + if (is_mbm_total_enabled())
> + resctrl_unassign_cntr_event(r, NULL, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID);
> +
> + if (is_mbm_local_enabled())
> + resctrl_unassign_cntr_event(r, NULL, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID);
> +}
> +
> static int rdt_get_tree(struct fs_context *fc)
> {
> struct rdt_fs_context *ctx = rdt_fc2context(fc);
> @@ -2741,6 +2781,8 @@ static int rdt_get_tree(struct fs_context *fc)
> if (ret < 0)
> goto out_info;
>
> + rdtgroup_assign_cntrs(&rdtgroup_default);
> +
> ret = mkdir_mondata_all(rdtgroup_default.kn,
> &rdtgroup_default, &kn_mondata);
> if (ret < 0)
> @@ -2779,8 +2821,10 @@ static int rdt_get_tree(struct fs_context *fc)
> if (resctrl_arch_mon_capable())
> kernfs_remove(kn_mondata);
> out_mongrp:
> - if (resctrl_arch_mon_capable())
> + if (resctrl_arch_mon_capable()) {
> + rdtgroup_unassign_cntrs(&rdtgroup_default);
> kernfs_remove(kn_mongrp);
> + }
> out_info:
> kernfs_remove(kn_info);
> out_schemata_free:
> @@ -2956,6 +3000,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
>
> head = &rdtgrp->mon.crdtgrp_list;
> list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
> + rdtgroup_unassign_cntrs(sentry);
> free_rmid(sentry->closid, sentry->mon.rmid);
> list_del(&sentry->mon.crdtgrp_list);
>
> @@ -2996,6 +3041,8 @@ static void rmdir_all_sub(void)
> cpumask_or(&rdtgroup_default.cpu_mask,
> &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
>
> + rdtgroup_unassign_cntrs(rdtgrp);
> +
> free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
>
> kernfs_remove(rdtgrp->kn);
> @@ -3027,6 +3074,8 @@ static void rdt_kill_sb(struct super_block *sb)
> for_each_alloc_capable_rdt_resource(r)
> reset_all_ctrls(r);
> rmdir_all_sub();
> + rdtgroup_unassign_cntrs(&rdtgroup_default);
> + mbm_cntr_reset(&rdt_resources_all[RDT_RESOURCE_L3].r_resctrl);
> rdt_pseudo_lock_release();
> rdtgroup_default.mode = RDT_MODE_SHAREABLE;
> schemata_list_destroy();
> @@ -3490,9 +3539,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
> }
> rdtgrp->mon.rmid = ret;
>
> + rdtgroup_assign_cntrs(rdtgrp);
> +
> ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
> if (ret) {
> rdt_last_cmd_puts("kernfs subdir error\n");
> + rdtgroup_unassign_cntrs(rdtgrp);
> free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> return ret;
> }
> @@ -3502,8 +3554,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
>
> static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
> {
> - if (resctrl_arch_mon_capable())
> + if (resctrl_arch_mon_capable()) {
> + rdtgroup_unassign_cntrs(rgrp);
> free_rmid(rgrp->closid, rgrp->mon.rmid);
> + }
> }
>
> static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> @@ -3764,6 +3818,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> update_closid_rmid(tmpmask, NULL);
>
> rdtgrp->flags = RDT_DELETED;
> +
> + rdtgroup_unassign_cntrs(rdtgrp);
> +
> free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
>
> /*
> @@ -3810,6 +3867,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> update_closid_rmid(tmpmask, NULL);
>
> + rdtgroup_unassign_cntrs(rdtgrp);
> +
> free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> closid_free(rdtgrp->closid);
>
Reinette