Re: [PATCH v9 4/9] x86/resctrl: Compute memory bandwidth for all supported events

From: Peter Newman
Date: Fri Nov 15 2024 - 08:54:06 EST


Hi Tony,

On Thu, Nov 14, 2024 at 1:17 AM Tony Luck <tony.luck@xxxxxxxxx> wrote:
>
> Computing the bandwidth for an event is cheap, and only done once
> per second. Doing so simplifies switching between events and allows
> choosing different events per ctrl_mon group.
>
> Suggested-by: Reinette Chatre <reinette.chatre@xxxxxxxxx>
> Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++---------------
> 1 file changed, 33 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 2176e355e864..da4ae21350c8 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> */
> static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> {
> - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> - struct mbm_state *m = &rr->d->mbm_local[idx];
> u64 cur_bw, bytes, cur_bytes;
> + struct mbm_state *m;
> +
> + m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> + if (WARN_ON_ONCE(!m))
> + return;
>
> cur_bytes = rr->val;
> bytes = cur_bytes - m->prev_bw_bytes;
> @@ -826,54 +829,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
> resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> }
>
> -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> - u32 closid, u32 rmid)
> +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid, enum resctrl_event_id evtid)
> {
> struct rmid_read rr = {0};
>
> rr.r = r;
> rr.d = d;
> + rr.evtid = evtid;
> + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> + if (IS_ERR(rr.arch_mon_ctx)) {
> + pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> + PTR_ERR(rr.arch_mon_ctx));
> + return;
> + }
> +
> + __mon_event_count(closid, rmid, &rr);
>
> /*
> - * This is protected from concurrent reads from user
> - * as both the user and we hold the global mutex.
> + * If the software controller is enabled, compute the
> + * bandwidth for this event id.
> */
> - if (is_mbm_total_enabled()) {
> - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + if (is_mba_sc(NULL))
> + mbm_bw_count(closid, rmid, &rr);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> - if (is_mbm_local_enabled()) {
> - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +}
>
> - /*
> - * Call the MBA software controller only for the
> - * control groups and when user has enabled
> - * the software controller explicitly.
> - */
> - if (is_mba_sc(NULL))
> - mbm_bw_count(closid, rmid, &rr);
> +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid)
> +{
> + /*
> + * This is protected from concurrent reads from user
> + * as both the user and we hold the global mutex.
> + */
> + if (is_mbm_total_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> + if (is_mbm_local_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
> }
>
> /*
> --
> 2.47.0
>

I experimented with a prototype of all-groups, per-domain counter
aggregation files, using this change as a starting point.

I'm happy to report that the resulting values looked fairly reasonable.

Tested-by: Peter Newman <peternewman@xxxxxxxxxx>

I also did a quick experiment to see how close to 1 second apart the
readings were:

@@ -664,6 +664,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
 	u64 cur_bw, bytes, cur_bytes;
 	struct mbm_state *m;
+	u64 ts;
 
 	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
@@ -680,6 +681,10 @@ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 	cur_bw = bytes / SZ_1M;
 
 	m->prev_bw = cur_bw;
+
+	ts = jiffies;
+	m->latency = ts - m->last_update_jiffies;
+	m->last_update_jiffies = ts;
 }

On an AMD EPYC 7B12 64-Core Processor, I saw a consistent 1.021-1.026
second period. Is this enough error that you would want to divide by
the actual period instead of assuming a denominator of exactly 1
second? We're mainly concerned with the relative bandwidth of jobs, so
this error isn't much of a concern as long as it doesn't favor any
particular group.
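
For reference, this is roughly what I mean by dividing by the actual
period (untested sketch on top of your patch; "last_update_jiffies" is
a new field I'm assuming in struct mbm_state, as in the experiment
above):

static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	u64 cur_bw, bytes, cur_bytes;
	u64 now, period;
	struct mbm_state *m;

	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
	if (WARN_ON_ONCE(!m))
		return;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	/*
	 * Scale by the interval actually elapsed since the last
	 * update rather than assuming exactly one second.
	 */
	now = jiffies;
	period = now - m->last_update_jiffies;
	m->last_update_jiffies = now;

	if (period)
		cur_bw = div64_u64(bytes * HZ, period) / SZ_1M;
	else
		cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}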

The only thing I'd worry about is that, if the user is using
setitimer() to keep a consistent 1-second period for reading the
bandwidth rate, the window of the resctrl updates would drift away
from the userspace consumer over time.
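
Concretely, the kind of consumer I have in mind is something like this
(hypothetical sketch, error handling omitted):

#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t tick;

static void on_alarm(int sig)
{
	tick = 1;
}

int main(void)
{
	/* fire SIGALRM once per second, starting one second from now */
	struct itimerval it = {
		.it_interval = { .tv_sec = 1 },
		.it_value    = { .tv_sec = 1 },
	};

	signal(SIGALRM, on_alarm);
	setitimer(ITIMER_REAL, &it, NULL);

	for (;;) {
		pause();
		if (!tick)
			continue;
		tick = 0;
		/* read the mon_data bandwidth files here */
	}
}

At the ~21-26ms of extra kernel period I measured, each of this
reader's 1-second windows slips that much relative to the kernel's
update window, so the two would cross roughly every 40-50 seconds, at
which point the reader sees the same computed bandwidth twice or
misses one entirely.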

Thanks!
-Peter