[PATCH RFC v5 15/18] riscv_cbqri: resctrl: Add mbm_total_bytes bandwidth monitoring
From: Drew Fustini
Date: Sun May 24 2026 - 19:58:58 EST
Expose the CBQRI bandwidth controller's combined read+write counter as
the L3 mbm_total_bytes event. A software accumulator keeps the 64-bit
byte total monotonic across wraps of the 62-bit hardware counter.
mbm_local_bytes is not supported because the CBQRI spec provides no way
to distinguish local traffic from total traffic. mbm_total_bytes is
enabled only when the platform exposes exactly one mon-capable bandwidth
controller (BC) and exactly one L3 domain. Pairing a single BC with
multiple L3 domains would let standard userspace tools overcount
system bandwidth by summing the same counter across domains.
Assisted-by: Claude:claude-opus-4-7
Co-developed-by: Adrien Ricciardi <aricciardi@xxxxxxxxxxxx>
Signed-off-by: Adrien Ricciardi <aricciardi@xxxxxxxxxxxx>
Signed-off-by: Drew Fustini <fustini@xxxxxxxxxx>
---
drivers/resctrl/cbqri_resctrl.c | 191 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 187 insertions(+), 4 deletions(-)
diff --git a/drivers/resctrl/cbqri_resctrl.c b/drivers/resctrl/cbqri_resctrl.c
index ba764bc6ef17..f11709d7e479 100644
--- a/drivers/resctrl/cbqri_resctrl.c
+++ b/drivers/resctrl/cbqri_resctrl.c
@@ -29,6 +29,13 @@ struct cbqri_resctrl_res {
struct cbqri_resctrl_dom {
struct rdt_ctrl_domain resctrl_ctrl_dom;
struct cbqri_controller *hw_ctrl;
+ /*
+ * For an L3 domain, paired_bc caches the system's only mon-capable
+ * bandwidth controller (looked up when the L3 mon domain is attached)
+ * so mbm_total_bytes reads / resets don't walk cbqri_controllers.
+ * NULL for non-L3 domains and L3s without a paired BC.
+ */
+ struct cbqri_controller *paired_bc;
};
static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
@@ -37,7 +44,7 @@ static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
* Per-event controller table. Only events CBQRI can back occupy a
* slot, so other events do not bloat the array.
*/
-#define CBQRI_MAX_EVENT QOS_L3_OCCUP_EVENT_ID
+#define CBQRI_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
static struct cbqri_controller *cbqri_resctrl_counters[CBQRI_MAX_EVENT + 1];
static bool exposed_alloc_capable;
@@ -228,6 +235,36 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d
mutex_unlock(&ctrl->lock);
break;
+ case QOS_L3_MBM_TOTAL_EVENT_ID: {
+ struct cbqri_controller *bc;
+
+ cd = cbqri_find_ctrl_domain(&r->ctrl_domains, d->hdr.id);
+ if (!cd)
+ break;
+ hw_dom = container_of(cd, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+ bc = hw_dom->paired_bc;
+ if (!bc)
+ break;
+ if (WARN_ON_ONCE(!bc->mbm_total_states))
+ break;
+ if (rmid >= bc->mcid_count)
+ break;
+
+ mutex_lock(&bc->lock);
+ /*
+ * CONFIG_EVENT both resets and re-arms. Skip the accumulator
+ * memset on failure. A stale hardware counter X with
+ * prev_ctr=0 would inject overflow(0, X) on the next read.
+ */
+ if (!cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+ CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+ CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL))
+ memset(&bc->mbm_total_states[rmid], 0,
+ sizeof(*bc->mbm_total_states));
+ mutex_unlock(&bc->lock);
+ break;
+ }
+
default:
break;
}
@@ -240,8 +277,10 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai
int i;
/* Bound by max_rmid (system-wide minimum mcid_count). */
- for (i = 0; i < max_rmid; i++)
+ for (i = 0; i < max_rmid; i++) {
resctrl_arch_reset_rmid(r, d, 0, i, QOS_L3_OCCUP_EVENT_ID);
+ resctrl_arch_reset_rmid(r, d, 0, i, QOS_L3_MBM_TOTAL_EVENT_ID);
+ }
}
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
@@ -305,6 +344,82 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
mutex_unlock(&ctrl->lock);
break;
+ case QOS_L3_MBM_TOTAL_EVENT_ID: {
+ struct cbqri_controller *bc;
+
+ /*
+ * The L3 monitoring domain's id is the L3 cache id. The
+ * matching ctrl domain's hw_dom->paired_bc was cached at
+ * add time to avoid walking cbqri_controllers on every read.
+ */
+ d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id);
+ if (!d) {
+ err = -ENOENT;
+ break;
+ }
+ hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+ bc = hw_dom->paired_bc;
+ if (!bc) {
+ err = -ENOENT;
+ break;
+ }
+ if (WARN_ON_ONCE(!bc->mbm_total_states)) {
+ err = -EIO;
+ break;
+ }
+ if (rmid >= bc->mcid_count) {
+ err = -ERANGE;
+ break;
+ }
+
+ mutex_lock(&bc->lock);
+ /* Pass EVT_ID explicitly. Same reason as the CC path above. */
+ err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+ CBQRI_BC_MON_CTL_OP_READ_COUNTER, rmid,
+ CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+ if (err)
+ goto out_bc;
+
+ ctr_val = ioread64(bc->base + CBQRI_BC_MON_CTR_VAL_OFF);
+
+ if (ctr_val & CBQRI_BC_MON_CTR_VAL_INVALID) {
+ /*
+ * Return the last good total and leave prev_ctr so
+ * the next valid sample resumes from there.
+ */
+ *val = bc->mbm_total_states[rmid].chunks;
+ } else if (ctr_val & CBQRI_BC_MON_CTR_VAL_OVF) {
+ /*
+ * OVF is sticky until next CONFIG_EVENT.
+ * cbqri_bc_mon_overflow() can recover at most
+ * one wrap. With OVF set, the count is unknown,
+ * so re-arm and re-anchor prev_ctr=0.
+ */
+ struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+
+ pr_warn_ratelimited("BC@%pa MCID %u: bandwidth counter overflow\n",
+ &bc->addr, rmid);
+ err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+ CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+ CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+ if (err)
+ goto out_bc;
+
+ s->prev_ctr = 0;
+ *val = s->chunks;
+ } else {
+ struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+ u64 cur = ctr_val & CBQRI_BC_MON_CTR_VAL_CTR_MASK;
+
+ s->chunks += cbqri_bc_mon_overflow(s->prev_ctr, cur);
+ s->prev_ctr = cur;
+ *val = s->chunks;
+ }
+out_bc:
+ mutex_unlock(&bc->lock);
+ break;
+ }
+
default:
err = -EINVAL;
break;
@@ -738,6 +853,15 @@ static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res)
res->mon.num_rmid = ctrl->mcid_count;
resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID,
false, 0, NULL);
+
+ /*
+ * Expose BC bandwidth monitoring as the L3's
+ * mbm_total_bytes when they share topology.
+ */
+ if (cbqri_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID])
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID,
+ false, 0, NULL);
+
res->mon_capable = true;
}
break;
@@ -824,15 +948,54 @@ static int cbqri_resctrl_pick_bw_alloc(void)
}
/*
- * Pick one controller per monitoring event. L3 OCCUP comes from the
- * picked L3 CC (if mon_capable).
+ * Pick one controller per monitoring event. L3 OCCUP comes from the
+ * picked L3 CC if mon_capable. MBM_TOTAL comes from the only mon-capable
+ * BC unless more than one distinct L3 cache_id exists: pairing a single
+ * BC with multiple L3 domains would overcount system bandwidth by a
+ * factor equal to the L3 domain count.
*/
static void cbqri_resctrl_pick_counters(void)
{
struct cbqri_resctrl_res *l3 = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
+ struct cbqri_controller *ctrl, *prev;
+ unsigned int l3_count = 0;
if (l3->ctrl && l3->ctrl->mon_capable)
cbqri_resctrl_counters[QOS_L3_OCCUP_EVENT_ID] = l3->ctrl;
+
+ /* Count distinct L3 cache_ids: skip an id already seen earlier in the list */
+ list_for_each_entry(ctrl, &cbqri_controllers, list) {
+ bool seen = false;
+
+ if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY)
+ continue;
+ if (ctrl->cache.cache_level != 3)
+ continue;
+
+ list_for_each_entry(prev, &cbqri_controllers, list) {
+ if (prev == ctrl)
+ break;
+ if (prev->type != CBQRI_CONTROLLER_TYPE_CAPACITY)
+ continue;
+ if (prev->cache.cache_level != 3)
+ continue;
+ if (prev->cache.cache_id == ctrl->cache.cache_id) {
+ seen = true;
+ break;
+ }
+ }
+ if (!seen)
+ l3_count++;
+ }
+
+ if (l3_count > 1) {
+ pr_warn_once("multiple L3 domains (%u) detected. mbm_total_bytes disabled\n",
+ l3_count);
+ return;
+ }
+
+ cbqri_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID] =
+ cbqri_find_only_mon_bc();
}
static void cbqri_resctrl_accumulate_caps(void)
@@ -948,6 +1111,26 @@ static int cbqri_attach_cpu_to_l3_mon(struct cbqri_controller *ctrl,
else
list_add_tail(&mon_dom->hdr.list, &res->mon_domains);
+ /*
+ * Pair this L3 domain with the system's mon-capable BC and
+ * initialise the BC's per-MCID software accumulators before
+ * resctrl_online_mon_domain() exposes the domain to userspace.
+ * A concurrent sysfs read of mbm_total_bytes between online and
+ * BC init would otherwise pass the !bc->mbm_total_states check
+ * with a half-initialised pointer.
+ */
+ hw_dom = container_of(ctrl_dom, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+ hw_dom->paired_bc = cbqri_find_only_mon_bc();
+ if (hw_dom->paired_bc) {
+ err = cbqri_init_bc_mon_counters(hw_dom->paired_bc);
+ if (err) {
+ pr_err("BC @%pa: mon init failed (%d)\n", &hw_dom->paired_bc->addr, err);
+ hw_dom->paired_bc = NULL;
+ goto err_listdel;
+ }
+ }
+
err = resctrl_online_mon_domain(res, &mon_dom->hdr);
if (err)
goto err_listdel;
--
2.43.0