[PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

From: Kanaka Juvva
Date: Sat Aug 08 2015 - 02:40:42 EST


This patch adds Memory Bandwidth Monitoring events to the exitsing
intel_cqm pmu in the Linux Kernel.

Intel MBM builds on Cache Monitoring Technology (CMT) infrastructure
to allow monitoring of bandwidth from one level of the cache hierarchy
to the next - in this case focusing on the L3 cache, which is typically
backed directly by system memory. As a result of this implementation,
memory bandwidth can be monitored.

MBM counters are available in Intel Xeon CPUs. The following events are
implemented in the kernel and expose two MBM counters via perf_event
interface:

- llc_local_bw: bandwidth consumption of all coress on socket
- llc_total_bw: bandwidth consumption of all cores on socket + bandwidth
for QPI accesses

At present, the MBM events are checked at one second interval provided
by the HRTIMER of the MBM event. MBM counters can overflow atmost once in
a second and thus must be read atleast once in a second. Overflow is
detected and handled. Cumulative average value is calculated for each
bandwidth type upon reading a new value from the MSR.

version2: merged MBM patches with tip
changes: 1. Incorporate HRTIMER API chanes of tip
2. added separate event MBM_LOCAL_EVENT codes for perf
and MSR reads
QOS_MBM_LOCAL_EVENT_ID_HW 0x3 used for MSR read
as per SDM
QOS_MBM_LOCAL_EVENT_ID (1 << 2) used by perf,
perf event codes are in powers of 2

version3: Improved readbility of code
Incorporated upstream comments for Patch v2

Signed-off-by: Kanaka Juvva <kanaka.d.juvva@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/cpufeature.h | 2 +
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 702 +++++++++++++++++++++++++++--
3 files changed, 672 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606f..c36abdf 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -251,6 +251,8 @@

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+2) /* LLC Local MBM monitoring */

/*
* BUG word(s)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0..7dcc88c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -643,7 +643,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[12] = edx;
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC))
+ || ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL))
+ || (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 63eb68b..d54543f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -12,9 +12,19 @@
#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+#define MBM_CNTR_MAX 0xffffff
+#define MBM_SOCKET_MAX 8
+#define MBM_TIME_DELTA_MAX 1000
+#define MBM_TIME_DELTA_MIN 100
+#define MBM_SCALING_FACTOR2 1
+#define MBM_SCALING_FACTOR 1000
+#define MBM_FIFO_SIZE_MIN 10
+#define MBM_FIFO_SIZE_MAX 300

static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN;
+static bool cqm_llc_occ, cqm_mbm;

/**
* struct intel_pqr_state - State cache for the PQR MSR
@@ -42,6 +52,90 @@ struct intel_pqr_state {
* interrupts disabled, which is sufficient for the protection.
*/
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
+
+/*
+ * mbm pmu is used for storing mbm_local and mbm_total
+ * events
+ */
+struct mbm_pmu {
+ /*
+ * # of active events
+ */
+ int n_active;
+
+ /*
+ * Linked list that holds the perf events
+ */
+ struct list_head active_list;
+
+ /*
+ * pmu to which event belongs
+ */
+ struct pmu *pmu;
+
+ /*
+ * time interval in ms between two consecutive samples
+ */
+ ktime_t timer_interval; /* in ktime_t unit */
+
+ /*
+ * periodic timer that triggers inte_cqm_mbm_update
+ */
+ struct hrtimer hrtimer;
+};
+
+/*
+ * struct sample defines mbm event
+ */
+struct sample {
+ /*
+ * current MSR value
+ */
+ u64 bytes;
+
+ /*
+ * running average of the bandwidth
+ */
+ u64 cum_avg;
+
+ /*
+ * time of the previous sample
+ */
+ ktime_t prev_time;
+
+ /*
+ * sample index
+ */
+ u64 index;
+
+ /*
+ * Sliding window to store previous 'n' bw values
+ */
+ u32 mbmfifo[MBM_FIFO_SIZE_MAX];
+
+ /*
+ * fifoin is the location at which current bw is saved
+ */
+ u32 fifoin;
+
+ /*
+ * fifoout is the start of the sliding window from which last 'n'
+ * bw values must be read
+ */
+ u32 fifoout; /* mbmfifo out counter for sliding window */
+};
+
+/*
+ * stats for total bw
+ */
+static struct sample *mbm_total; /* curent stats for mbm_total events */
+
+/*
+ * stats for local bw
+ */
+static struct sample *mbm_local; /* current stats for mbm_local events */

/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -66,6 +160,9 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_UNAVAIL (1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
+#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
+#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)

#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID

@@ -176,6 +273,25 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
return entry;
}

+/**
+ * __mbm_reset_stats - reset stats for a given rmid for the current cpu
+ * @rmid: rmid value
+ *
+ * vrmid: array index for current core for the given rmid
+ * mbs_total[] and mbm_local[] are linearly indexed by core * max #rmids per
+ * core
+ */
+static void __mbm_reset_stats(u32 rmid)
+{
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
+ return;
+ memset(&mbm_local[vrmid], 0, sizeof(struct sample));
+ memset(&mbm_total[vrmid], 0, sizeof(struct sample));
+}
+
/*
* Returns < 0 on fail.
*
@@ -192,6 +308,7 @@ static u32 __get_rmid(void)

entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
list_del(&entry->list);
+ __mbm_reset_stats(entry->rmid);

return entry->rmid;
}
@@ -207,6 +324,7 @@ static void __put_rmid(u32 rmid)

entry->queue_time = jiffies;
entry->state = RMID_YOUNG;
+ __mbm_reset_stats(rmid);

list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
@@ -232,6 +350,7 @@ static int intel_cqm_setup_rmid_cache(void)

INIT_LIST_HEAD(&entry->list);
entry->rmid = r;
+
cqm_rmid_ptrs[r] = entry;

list_add_tail(&entry->list, &cqm_rmid_free_lru);
@@ -494,6 +613,165 @@ static bool intel_cqm_sched_in_event(u32 rmid)
return false;
}

+
+static u32 bw_sum_calc(struct sample *bw_stat, int rmid)
+{
+ u32 val = 0, i, j, index;
+
+ if (++bw_stat->fifoout >= mbm_window_size)
+ bw_stat->fifoout = 0;
+ index = bw_stat->fifoout;
+ for (i = 0; i < mbm_window_size - 1; i++) {
+ if (index + i >= mbm_window_size)
+ j = index + i - mbm_window_size;
+ else
+ j = index + i;
+ val += bw_stat->mbmfifo[j];
+ }
+ return val;
+}
+
+static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
+{
+ if (is_localbw)
+ return bw_sum_calc(&mbm_local[rmid], rmid);
+ else
+ return bw_sum_calc(&mbm_total[rmid], rmid);
+}
+
+static void __mbm_fifo_in(struct sample *bw_stat, u32 val)
+{
+ bw_stat->mbmfifo[bw_stat->fifoin] = val;
+ if (++bw_stat->fifoin >= mbm_window_size)
+ bw_stat->fifoin = 0;
+}
+
+static void mbm_fifo_in(int rmid, u32 val, bool is_localbw)
+{
+ if (is_localbw)
+ __mbm_fifo_in(&mbm_local[rmid], val);
+ else
+ __mbm_fifo_in(&mbm_total[rmid], val);
+}
+
+/*
+ * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and reads
+ * its MSR counter. Check whether overflow occurred and handles it. Calculates
+ * currenet BW and updates running average.
+ *
+ * Overflow Handling:
+ * if (MSR current value < MSR previous value) it is an
+ * overflow. and overflow is handled.
+ *
+ * Calculation of Current BW value:
+ * If MSR is read within last 100ms, then the value is ignored;
+ * this will suppress small deltas. We don't process MBM samples that are
+ * within 100ms.
+ *
+ * Bandwidth is calculated as:
+ * bandwidth = difference of last two msr counter values/time difference.
+ *
+ * cum_avg = Running Average bandwidth of last 'n' samples that are processed
+ *
+ * Sliding window is used to save the last 'n' samples. Hence,
+ * n = sliding_window_size
+ *
+ *Scaling:
+ * cum_avg is scaled down by a factor MBM_SCALING_FACTOR2 and rounded to
+ * nearest integer. User interface reads the BW in KB/sec or MB/sec.
+ */
+static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
+{
+ u64 val, tmp, diff_time, cma, bytes, index;
+ bool overflow = false, first = false;
+ ktime_t cur_time;
+ u32 tmp32 = rmid;
+ struct sample *mbm_current;
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ rmid = vrmid;
+ cur_time = ktime_get();
+ if (read_mbm_local) {
+ cma = mbm_local[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_local[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_local[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_local[rmid].bytes;
+ index = mbm_local[rmid].index;
+ mbm_current = &mbm_local[rmid];
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
+ } else {
+ cma = mbm_total[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_total[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_total[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_total[rmid].bytes;
+ index = mbm_total[rmid].index;
+ mbm_current = &mbm_total[rmid];
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, rmid);
+ }
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return val;
+
+ tmp = val;
+ /* if current msr value < previous msr value , it means overflow */
+ if (val < bytes) {
+ val = MBM_CNTR_MAX - bytes + val;
+ overflow = true;
+ } else
+ val = val - bytes;
+
+ val = (val * MBM_TIME_DELTA_MAX) / diff_time;
+
+ if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
+ /* First sample */
+ first = true;
+
+ rmid = vrmid;
+ if ((diff_time <= (MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)) ||
+ overflow || first) {
+ int bw, ret;
+
+ if (index & (index < mbm_window_size))
+ val = cma * MBM_SCALING_FACTOR2 + val / index -
+ cma / index;
+ val = DIV_ROUND_UP(val, MBM_SCALING_FACTOR2);
+ if (index >= mbm_window_size) {
+ /*
+ * Compute the sum of recent n-1 samples and slide the
+ * window by 1
+ */
+ ret = __mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
+ /* recalculate the running average with current bw */
+ ret = (ret + val) / mbm_window_size;
+ bw = val;
+ val = ret;
+ } else
+ bw = val;
+ /* save the recent bw in fifo */
+ mbm_fifo_in(rmid, bw, read_mbm_local);
+ /* save the current sample */
+ mbm_current->index++;
+ mbm_current->cum_avg = val;
+ mbm_current->bytes = tmp;
+ mbm_current->prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+
+ return val;
+ }
+ return cma;
+}
+
/*
* Initially use this constant for both the limbo queue time and the
* rotation timer interval, pmu::hrtimer_interval_ms.
@@ -875,6 +1153,40 @@ static void intel_cqm_setup_event(struct perf_event *event,
event->hw.cqm_rmid = rmid;
}

+static void intel_cqm_event_update(struct perf_event *event)
+{
+ unsigned int rmid;
+ u64 val = 0;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ rmid = event->hw.cqm_rmid;
+ if (!__rmid_valid(rmid))
+ return;
+
+ switch (event->attr.config) {
+ case QOS_MBM_TOTAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, false);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, true);
+ break;
+ default:
+ return;
+ }
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ local64_set(&event->count, val);
+}
+
static void intel_cqm_event_read(struct perf_event *event)
{
unsigned long flags;
@@ -887,6 +1199,13 @@ static void intel_cqm_event_read(struct perf_event *event)
if (event->cpu == -1)
return;

+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);
+
+ if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
+ return;
+
raw_spin_lock_irqsave(&cache_lock, flags);
rmid = event->hw.cqm_rmid;

@@ -906,6 +1225,28 @@ out:
raw_spin_unlock_irqrestore(&cache_lock, flags);
}

+static void __intel_cqm_event_total_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, false);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
+static void __intel_cqm_event_local_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, true);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
static void __intel_cqm_event_count(void *info)
{
struct rmid_read *rr = info;
@@ -975,7 +1316,21 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;

- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ switch (event->attr.config) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ break;
+ case QOS_MBM_TOTAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_total_bw_count,
+ &rr, 1);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_local_bw_count,
+ &rr, 1);
+ break;
+ default:
+ goto out;
+ }

raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -985,6 +1340,39 @@ out:
return __perf_event_count(event);
}

+static void mbm_start_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_start_range_ns(&(pmu->hrtimer),
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void mbm_stop_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
+ struct perf_event *event;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+ list_for_each_entry(event, &pmu->active_list, active_entry)
+ intel_cqm_event_update(event);
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+ return HRTIMER_RESTART;
+}
+
+static void mbm_hrtimer_init(struct mbm_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = mbm_hrtimer_handle;
+}
+
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
@@ -1003,19 +1391,45 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
}

state->rmid = rmid;
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
+ struct cqm_rmid_entry *entry;
+
+ entry = __rmid_entry(rmid);
+ }
wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+
+ if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && (cqm_mbm)) {
+ int cpu = smp_processor_id();
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
+
+ if (pmu) {
+ pmu->n_active++;
+ list_add_tail(&event->active_entry,
+ &pmu->active_list);
+ if (pmu->n_active == 1)
+ mbm_start_hrtimer(pmu);
+ intel_cqm_event_update(event);
+ }
+ }
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);

if (event->hw.cqm_state & PERF_HES_STOPPED)
return;

event->hw.cqm_state |= PERF_HES_STOPPED;

- intel_cqm_event_read(event);
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
+ intel_cqm_event_read(event);
+
+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);

if (!--state->rmid_usecnt) {
state->rmid = 0;
@@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
} else {
WARN_ON_ONCE(!state->rmid);
}
+
+ if (pmu) {
+ if (pmu->n_active > 0) {
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ mbm_stop_hrtimer(pmu);
+ if (!list_empty(&event->active_entry))
+ list_del(&event->active_entry);
+ }
+ }
+
}

static int intel_cqm_event_add(struct perf_event *event, int mode)
@@ -1090,7 +1515,9 @@ static int intel_cqm_event_init(struct perf_event *event)
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;

- if (event->attr.config & ~QOS_EVENT_MASK)
+ if (((event->attr.config & ~QOS_EVENT_MASK) &&
+ (event->attr.config & ~QOS_MBM_TOTAL_EVENT_ID)) &&
+ (event->attr.config & ~QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;

/* unsupported modes and filters */
@@ -1145,18 +1572,68 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

-static struct attribute *intel_cqm_events_attr[] = {
+EVENT_ATTR_STR(llc_total_bw, intel_cqm_llc_total_bw, "event=0x02");
+EVENT_ATTR_STR(llc_total_bw.per-pkg, intel_cqm_llc_total_bw_pkg, "1");
+#if MBM_SCALING_FACTOR2 == 1000
+EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "MB/sec");
+EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "MB/sec");
+#else
+EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "KB/sec");
+EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "KB/sec");
+#endif
+EVENT_ATTR_STR(llc_total_bw.scale, intel_cqm_llc_total_bw_scale, NULL);
+EVENT_ATTR_STR(llc_total_bw.snapshot, intel_cqm_llc_total_bw_snapshot, "1");
+
+EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
+EVENT_ATTR_STR(llc_local_bw.per-pkg, intel_cqm_llc_local_bw_pkg, "1");
+EVENT_ATTR_STR(llc_local_bw.scale, intel_cqm_llc_local_bw_scale, NULL);
+EVENT_ATTR_STR(llc_local_bw.snapshot, intel_cqm_llc_local_bw_snapshot, "1");
+
+static struct attribute *intel_cmt_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_cmt_mbm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
EVENT_PTR(intel_cqm_llc_snapshot),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
NULL,
};

static struct attribute_group intel_cqm_events_group = {
.name = "events",
- .attrs = intel_cqm_events_attr,
+ .attrs = NULL,
};

PMU_FORMAT_ATTR(event, "config:0-7");
@@ -1184,6 +1661,19 @@ max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
}

static ssize_t
+sliding_window_size_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", mbm_window_size);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
max_recycle_threshold_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1211,10 +1701,32 @@ max_recycle_threshold_store(struct device *dev,
return count;
}

+static ssize_t
+sliding_window_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+ if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
+ mbm_window_size = bytes;
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
static DEVICE_ATTR_RW(max_recycle_threshold);
+static DEVICE_ATTR_RW(sliding_window_size);

static struct attribute *intel_cqm_attrs[] = {
&dev_attr_max_recycle_threshold.attr,
+ &dev_attr_sliding_window_size.attr,
NULL,
};

@@ -1255,10 +1767,11 @@ static inline void cqm_pick_event_reader(int cpu)
cpumask_set_cpu(cpu, &cqm_cpumask);
}

-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static int intel_cqm_cpu_prepare(unsigned int cpu)
{
struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);

state->rmid = 0;
state->closid = 0;
@@ -1266,12 +1779,27 @@ static void intel_cqm_cpu_prepare(unsigned int cpu)

WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+
+ if ((!pmu) && (cqm_mbm)) {
+ pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, NUMA_NO_NODE);
+ if (!pmu)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &intel_cqm_pmu;
+ pmu->n_active = 0;
+ pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
+ per_cpu(mbm_pmu, cpu) = pmu;
+ per_cpu(mbm_pmu_to_free, cpu) = NULL;
+ mbm_hrtimer_init(pmu);
+ }
+ return 0;
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
int phys_id = topology_physical_package_id(cpu);
int i;
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);

/*
* Is @cpu a designated cqm reader?
@@ -1288,6 +1816,36 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
break;
}
}
+
+ /* cancel overflow polling timer for CPU */
+ if (pmu)
+ mbm_stop_hrtimer(pmu);
+ kfree(mbm_local);
+ kfree(mbm_total);
+
+}
+
+static void mbm_cpu_kfree(int cpu)
+{
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu_to_free, cpu);
+
+ kfree(pmu);
+
+ per_cpu(mbm_pmu_to_free, cpu) = NULL;
+}
+
+static int mbm_cpu_dying(int cpu)
+{
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
+
+ if (!pmu)
+ return 0;
+
+ per_cpu(mbm_pmu, cpu) = NULL;
+
+ per_cpu(mbm_pmu_to_free, cpu) = pmu;
+
+ return 0;
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
@@ -1297,7 +1855,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,

switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- intel_cqm_cpu_prepare(cpu);
+ return intel_cqm_cpu_prepare(cpu);
break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
@@ -1305,6 +1863,14 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
case CPU_STARTING:
cqm_pick_event_reader(cpu);
break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ mbm_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ mbm_cpu_kfree(cpu);
+ break;
}

return NOTIFY_OK;
@@ -1315,15 +1881,74 @@ static const struct x86_cpu_id intel_cqm_match[] = {
{}
};

+static const struct x86_cpu_id intel_mbm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
+ {}
+};
+
static int __init intel_cqm_init(void)
{
char *str, scale[20];
- int i, cpu, ret;
+ int i, cpu, ret, array_size;

- if (!x86_match_cpu(intel_cqm_match))
+ if (!x86_match_cpu(intel_cqm_match) &&
+ (!x86_match_cpu(intel_mbm_match)))
return -ENODEV;

cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+
+ if (x86_match_cpu(intel_cqm_match)) {
+ cqm_llc_occ = true;
+ intel_cqm_events_group.attrs = intel_cmt_events_attr;
+ }
+
+ if (x86_match_cpu(intel_mbm_match)) {
+ u32 mbm_scale_rounded = 0;
+
+ cqm_mbm = true;
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ /*
+ * MBM counter values are in bytes. To conver this to KB/sec
+ * or MB/sec, we scale the MBM scale factor by 1000. Another
+ * MBM_SCALING_FACTOR2 factor scaling down is done
+ * after reading the counter val i.e. in the function
+ * __rmid_read_mbm()
+ */
+ mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale,
+ MBM_SCALING_FACTOR);
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+ snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (cqm_llc_occ)
+ intel_cqm_events_group.attrs =
+ intel_cmt_mbm_events_attr;
+ else
+ intel_cqm_events_group.attrs = intel_mbm_events_attr;
+
+ event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
+ event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
+
+ array_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX;
+ mbm_local = kzalloc_node(sizeof(struct sample) * array_size,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_local) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ mbm_total = kzalloc_node(sizeof(struct sample) * array_size,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_total) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ } else
+ cqm_mbm = false;

/*
* It's possible that not all resources support the same number
@@ -1336,44 +1961,48 @@ static int __init intel_cqm_init(void)
*/
cpu_notifier_register_begin();

- for_each_online_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ if (cqm_llc_occ) {
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);

- if (c->x86_cache_max_rmid < cqm_max_rmid)
- cqm_max_rmid = c->x86_cache_max_rmid;
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;

- if (c->x86_cache_occ_scale != cqm_l3_scale) {
- pr_err("Multiple LLC scale values, disabling\n");
- ret = -EINVAL;
- goto out;
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
}
- }

- /*
- * A reasonable upper limit on the max threshold is the number
- * of lines tagged per RMID if all RMIDs have the same number of
- * lines tagged in the LLC.
- *
- * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
- */
- __intel_cqm_max_threshold =
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

- snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
- str = kstrdup(scale, GFP_KERNEL);
- if (!str) {
- ret = -ENOMEM;
- goto out;
- }
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }

- event_attr_intel_cqm_llc_scale.event_str = str;
+ event_attr_intel_cqm_llc_scale.event_str = str;
+ }

ret = intel_cqm_setup_rmid_cache();
if (ret)
goto out;

for_each_online_cpu(i) {
- intel_cqm_cpu_prepare(i);
+ ret = intel_cqm_cpu_prepare(i);
+ if (ret)
+ goto out_free;
cqm_pick_event_reader(i);
}

@@ -1384,7 +2013,10 @@ static int __init intel_cqm_init(void)
pr_err("Intel CQM perf registration failed: %d\n", ret);
else
pr_info("Intel CQM monitoring enabled\n");
-
+ goto out;
+out_free:
+ kfree(mbm_local);
+ kfree(mbm_total);
out:
cpu_notifier_register_done();

--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/