[PATCH 4/6] x86/mbm: Memory bandwidth monitoring event management

From: Vikas Shivappa
Date: Tue Mar 01 2016 - 18:48:39 EST


From: Tony Luck <tony.luck@xxxxxxxxx>

Includes all the core infrastructure to measure total_bytes and
bandwidth.

We have per-socket counters for both total system-wide L3 external
bytes and local socket memory-controller bytes. The current bandwidth
is calculated over a minimum interval (the time since the counter was
last read) of 100ms. The OS writes the event ID and RMID to
MSR_IA32_QM_EVTSEL and reads the count back from MSR_IA32_QM_CTR, and
uses IA32_PQR_ASSOC_MSR to associate the RMID with the task. Tasks use
a common RMID for cqm (cache quality of service monitoring) and MBM,
hence most of the scheduling code is reused from cqm.
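
In condensed form, the per-socket read path looks as follows (a sketch
of the update_sample() helper added by this patch; the first-read setup
and the RMID_VAL_ERROR/RMID_VAL_UNAVAIL checks are omitted, and the
names mirror the patch):

    /* select the MBM event for this RMID, then read the counter */
    wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
    rdmsrl(MSR_IA32_QM_CTR, val);
    val &= MBM_CNTR_MAX;

    /* account for 24-bit counter wraparound since the previous read */
    if (val < mbm_current->prev_msr)
        bytes = MBM_CNTR_MAX - mbm_current->prev_msr + val + 1;
    else
        bytes = val - mbm_current->prev_msr;
    bytes *= cqm_l3_scale;

    mbm_current->total_bytes += bytes;
    mbm_current->interval_bytes += bytes;

    /* bandwidth in bytes/sec, recomputed once >= 100ms has elapsed */
    bytes = mbm_current->interval_bytes * MSEC_PER_SEC;
    do_div(bytes, diff_time);
    mbm_current->bandwidth = bytes;

For example, with prev_msr == 0xfffff0 and a new counter read of
0x000010, the wraparound branch yields 0xffffff - 0xfffff0 + 0x10 + 1 =
0x20 counter units of cqm_l3_scale bytes each.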

Most of the scheduling code was taken from Tony's patch, and a 3-4 line
change was added in intel_cqm_event_read. This change was made because
the timer is no longer added on every context switch.

Reviewed-by: Tony Luck <tony.luck@xxxxxxxxx>
Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 158 ++++++++++++++++++++++++++++-
1 file changed, 154 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index cf08a0f..6638dcc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,6 +13,11 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d

+/*
+ * The MBM counter is 24 bits wide. MBM_CNTR_MAX defines the
+ * maximum counter value.
+ */
+#define MBM_CNTR_MAX 0xffffff
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
static bool cqm_enabled, mbm_enabled;
@@ -68,6 +73,16 @@ static struct sample *mbm_total;
*/
static struct sample *mbm_local;

+#define pkg_id topology_physical_package_id(smp_processor_id())
+/*
+ * rmid_2_index returns the index of the rmid in the mbm_local/mbm_total
+ * arrays. mbm_total[] and mbm_local[] are linearly indexed by
+ * socket# * max number of rmids per socket, for example:
+ * RMID1 of Socket0: vrmid = 1
+ * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
+ * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
+ */
+#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
* Also protects event->hw.cqm_rmid
@@ -91,8 +106,19 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_UNAVAIL (1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+/*
+ * MBM event IDs, as defined in SDM section 17.15.5.
+ * These IDs are used to program IA32_QM_EVTSEL before reading MBM counters.
+ */
+enum mbm_evt_type {
+ QOS_MBM_TOTAL_EVENT_ID = 0x02,
+ QOS_MBM_LOCAL_EVENT_ID,
+ QOS_MBM_TOTAL_BW_EVENT_ID,
+ QOS_MBM_LOCAL_BW_EVENT_ID,
+};

-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+#define QOS_MBM_BW_EVENT_MASK 0x04
+#define QOS_MBM_LOCAL_EVENT_MASK 0x01

/*
* This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
@@ -422,9 +448,16 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
struct rmid_read {
u32 rmid;
atomic64_t value;
+ enum mbm_evt_type evt_type;
};

static void __intel_cqm_event_count(void *info);
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type);
+
+static bool is_mbm_event(int e)
+{
+ return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_BW_EVENT_ID);
+}

/*
* Exchange the RMID of a group of events.
@@ -866,6 +899,98 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
schedule_delayed_work(&intel_cqm_rmid_work, delay);
}

+static struct sample *update_sample(unsigned int rmid,
+ enum mbm_evt_type evt_type, int first)
+{
+ ktime_t cur_time;
+ struct sample *mbm_current;
+ u32 vrmid = rmid_2_index(rmid);
+ u64 val, bytes, diff_time;
+ u32 eventid;
+
+ if (evt_type & QOS_MBM_LOCAL_EVENT_MASK) {
+ mbm_current = &mbm_local[vrmid];
+ eventid = QOS_MBM_LOCAL_EVENT_ID;
+ } else {
+ mbm_current = &mbm_total[vrmid];
+ eventid = QOS_MBM_TOTAL_EVENT_ID;
+ }
+
+ cur_time = ktime_get();
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return mbm_current;
+ val &= MBM_CNTR_MAX;
+
+ if (first) {
+ mbm_current->interval_start = cur_time;
+ mbm_current->prev_msr = val;
+ mbm_current->total_bytes = 0;
+ mbm_current->interval_bytes = 0;
+ mbm_current->bandwidth = 0;
+ return mbm_current;
+ }
+
+ if (val < mbm_current->prev_msr)
+ bytes = MBM_CNTR_MAX - mbm_current->prev_msr + val + 1;
+ else
+ bytes = val - mbm_current->prev_msr;
+ bytes *= cqm_l3_scale;
+
+ mbm_current->total_bytes += bytes;
+ mbm_current->interval_bytes += bytes;
+ mbm_current->prev_msr = val;
+ diff_time = ktime_ms_delta(cur_time, mbm_current->interval_start);
+
+ /*
+ * The bandwidth measured is really the most recent/current one.
+ * We wait until enough time has passed to avoid
+ * arithmetic rounding problems. With an interval >= 100ms,
+ * such errors are <= 1%.
+ */
+ if (diff_time > 100) {
+ bytes = mbm_current->interval_bytes * MSEC_PER_SEC;
+ do_div(bytes, diff_time);
+ mbm_current->bandwidth = bytes;
+ mbm_current->interval_bytes = 0;
+ mbm_current->interval_start = cur_time;
+ }
+
+ return mbm_current;
+}
+
+static u64 rmid_read_mbm(unsigned int rmid, enum mbm_evt_type evt_type)
+{
+ struct sample *mbm_current;
+
+ mbm_current = update_sample(rmid, evt_type, 0);
+
+ if (evt_type & QOS_MBM_BW_EVENT_MASK)
+ return mbm_current->bandwidth;
+ else
+ return mbm_current->total_bytes;
+}
+
+static void __intel_mbm_event_init(void *info)
+{
+ struct rmid_read *rr = info;
+
+ update_sample(rr->rmid, rr->evt_type, 1);
+}
+
+static void init_mbm_sample(u32 rmid, enum mbm_evt_type evt_type)
+{
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ };
+
+ rr.rmid = rmid;
+ rr.evt_type = evt_type;
+ /* init the sample on one CPU of each socket */
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
+}
+
/*
* Find a group and setup RMID.
*
@@ -886,6 +1011,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
/* All tasks in a group share an RMID */
event->hw.cqm_rmid = rmid;
*group = iter;
+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
return;
}

@@ -902,6 +1029,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
else
rmid = __get_rmid();

+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
+
event->hw.cqm_rmid = rmid;
}

@@ -923,7 +1053,10 @@ static void intel_cqm_event_read(struct perf_event *event)
if (!__rmid_valid(rmid))
goto out;

- val = __rmid_read(rmid);
+ if (is_mbm_event(event->attr.config))
+ val = rmid_read_mbm(rmid, event->attr.config);
+ else
+ val = __rmid_read(rmid);

/*
* Ignore this reading on error states and do not update the value.
@@ -954,6 +1087,17 @@ static inline bool cqm_group_leader(struct perf_event *event)
return !list_empty(&event->hw.cqm_groups_entry);
}

+static void __intel_mbm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = rmid_read_mbm(rr->rmid, rr->evt_type);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
static u64 intel_cqm_event_count(struct perf_event *event)
{
unsigned long flags;
@@ -1007,7 +1151,12 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;

- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ if (is_mbm_event(event->attr.config)) {
+ rr.evt_type = event->attr.config;
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1);
+ } else {
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ }

raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -1122,7 +1271,8 @@ static int intel_cqm_event_init(struct perf_event *event)
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;

- if (event->attr.config & ~QOS_EVENT_MASK)
+ if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
+ (event->attr.config > QOS_MBM_LOCAL_BW_EVENT_ID))
return -EINVAL;

/* unsupported modes and filters */
--
1.9.1