[tip:x86/cache] x86/intel_rdt/cqm: Add RDT monitoring initialization

From: tip-bot for Vikas Shivappa
Date: Tue Aug 01 2017 - 16:50:37 EST


Commit-ID: 6a445edce657810992594c7b9e679219aaf78ad9
Gitweb: http://git.kernel.org/tip/6a445edce657810992594c7b9e679219aaf78ad9
Author: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
AuthorDate: Tue, 25 Jul 2017 14:14:27 -0700
Committer: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
CommitDate: Tue, 1 Aug 2017 22:41:21 +0200

x86/intel_rdt/cqm: Add RDT monitoring initialization

Add common data structures for RDT resource monitoring and perform RDT
monitoring related data structure initializations which include setting
up the RMID(Resource monitoring ID) lists and event list which the
resource supports.

[ tony: some cleanup to make adding MBM easier later, remove "cqm" from
some names, make some data structure local to intel_rdt_monitor.c
static. Add copyright header]

[ tglx: Made it readable ]

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: ravi.v.shankar@xxxxxxxxx
Cc: fenghua.yu@xxxxxxxxx
Cc: peterz@xxxxxxxxxxxxx
Cc: eranian@xxxxxxxxxx
Cc: vikas.shivappa@xxxxxxxxx
Cc: ak@xxxxxxxxxxxxxxx
Cc: davidcc@xxxxxxxxxx
Cc: reinette.chatre@xxxxxxxxx
Link: http://lkml.kernel.org/r/1501017287-28083-9-git-send-email-vikas.shivappa@xxxxxxxxxxxxxxx

---
arch/x86/kernel/cpu/Makefile | 2 +-
arch/x86/kernel/cpu/intel_rdt.c | 41 +++++++-
arch/x86/kernel/cpu/intel_rdt.h | 44 +++++++++
arch/x86/kernel/cpu/intel_rdt_monitor.c | 161 ++++++++++++++++++++++++++++++++
4 files changed, 242 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3622ca2..7584be0 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o

-obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o intel_rdt_monitor.o

obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 98715c5..cb520dd 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -55,6 +55,12 @@ DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
*/
int max_name_width, max_data_width;

+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
static void
mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
static void
@@ -235,7 +241,7 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
return true;
}

-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
@@ -516,7 +522,7 @@ static __init void rdt_init_padding(void)
}
}

-static __init bool get_rdt_resources(void)
+static __init bool get_rdt_alloc_resources(void)
{
bool ret = false;

@@ -527,7 +533,7 @@ static __init bool get_rdt_resources(void)
return false;

if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
@@ -536,7 +542,7 @@ static __init bool get_rdt_resources(void)
}
if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
- rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
ret = true;
}

@@ -544,10 +550,32 @@ static __init bool get_rdt_resources(void)
if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
ret = true;
}
-
return ret;
}

+static __init bool get_rdt_mon_resources(void)
+{
+ if (boot_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+ rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+ if (boot_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (boot_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ if (!rdt_mon_features)
+ return false;
+
+ return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -573,6 +601,9 @@ static int __init intel_rdt_late_init(void)
for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);

+ for_each_mon_capable_rdt_resource(r)
+ pr_info("Intel RDT %s monitoring detected\n", r->name);
+
return 0;
}

diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 29630af..993ab9d 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -12,6 +12,29 @@

#define L3_QOS_CDP_ENABLE 0x01ULL

+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ */
+struct mon_evt {
+ u32 evtid;
+ char *name;
+ struct list_head list;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
@@ -133,10 +156,17 @@ struct rdt_membw {
u32 *mb_map;
};

+static inline bool is_llc_occupancy_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
/**
* struct rdt_resource - attributes of an RDT resource
* @alloc_enabled: Is allocation enabled on this machine
+ * @mon_enabled: Is monitoring enabled for this feature
* @alloc_capable: Is allocation available on this machine
+ * @mon_capable: Is monitor feature available on this machine
* @name: Name to use in "schemata" file
* @num_closid: Number of CLOSIDs available
* @cache_level: Which cache level defines scope of this resource
@@ -150,10 +180,15 @@ struct rdt_membw {
* @nr_info_files: Number of info files
* @format_str: Per resource format string to show domain value
* @parse_ctrlval: Per resource function pointer to parse control values
+ * @evt_list: List of monitoring events
+ * @num_rmid: Number of RMIDs available
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
*/
struct rdt_resource {
bool alloc_enabled;
+ bool mon_enabled;
bool alloc_capable;
+ bool mon_capable;
char *name;
int num_closid;
int cache_level;
@@ -170,6 +205,9 @@ struct rdt_resource {
const char *format_str;
int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
struct rdt_domain *d);
+ struct list_head evt_list;
+ int num_rmid;
+ unsigned int mon_scale;
};

void rdt_get_cache_infofile(struct rdt_resource *r);
@@ -201,6 +239,11 @@ enum {
r++) \
if (r->alloc_capable)

+#define for_each_mon_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_capable)
+
#define for_each_alloc_enabled_rdt_resource(r) \
for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
r++) \
@@ -239,5 +282,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
+int rdt_get_mon_l3_config(struct rdt_resource *r);

#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 0000000..f6d43c3
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,161 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@xxxxxxxxx>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+struct rmid_entry {
+ u32 rmid;
+ struct list_head list;
+};
+
+/**
+ * @rmid_free_lru A least recently used list of free RMIDs
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * @rmid_limbo_lru list of currently unused but (potentially)
+ * dirty RMIDs.
+ * This list contains RMIDs that no one is currently using but that
+ * may have a occupancy value > intel_cqm_threshold. User can change
+ * the threshold occupancy value.
+ */
+static LIST_HEAD(rmid_limbo_lru);
+
+/**
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ entry = &rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+ struct rmid_entry *entry = NULL;
+ int i, nr_rmids;
+
+ nr_rmids = r->num_rmid;
+ rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_rmids; i++) {
+ entry = &rmid_ptrs[i];
+ INIT_LIST_HEAD(&entry->list);
+
+ entry->rmid = i;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ return 0;
+}
+
+static struct mon_evt llc_occupancy_event = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+ INIT_LIST_HEAD(&r->evt_list);
+
+ if (is_llc_occupancy_enabled())
+ list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+}
+
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ int ret;
+
+ r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ intel_cqm_threshold /= r->mon_scale;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ r->mon_capable = true;
+ r->mon_enabled = true;
+
+ return 0;
+}