[PATCH 4/7] x86/intel_rdt: Implement scheduling support for Intel RDT

From: Vikas Shivappa
Date: Mon May 11 2015 - 15:04:41 EST


Adds support for IA32_PQR_ASSOC MSR writes during task scheduling.
For Cache Allocation, MSR write would let the task fill in the cache
'subset' represented by the cgroup's cache_mask.

The high 32 bits in the per processor MSR IA32_PQR_ASSOC represents the
CLOSid. During context switch kernel implements this by writing the
CLOSid of the cgroup to which the task belongs to the CPU's
IA32_PQR_ASSOC MSR.

The following considerations are done for the PQR MSR write so that it
minimally impacts scheduler hot path:
- This path does not exist on any non-intel platforms.
- On Intel platforms, this would not exist by default unless CGROUP_RDT
is enabled.
- remains a no-op when CGROUP_RDT is enabled and intel SKU does not
support the feature.
- When feature is available and enabled, never does MSR write till the
user manually creates a cgroup directory *and* assigns a cache_mask
different from root cgroup directory. Since the child node inherits the
parents cache mask , by cgroup creation there is no scheduling hot path
impact from the new cgroup.
- MSR write is only done when there is a task with different Closid is
scheduled on the CPU. Typically if the task groups are bound to be
scheduled on a set of CPUs , the number of MSR writes is greatly
reduced.
- For cgroup directories having same cache_mask the CLOSids are reused.
This minimizes the number of CLOSids used and hence reduces the MSR
write frequency.

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/intel_rdt.h | 44 ++++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/switch_to.h | 3 +++
arch/x86/kernel/cpu/intel_rdt.c | 30 +++++++++++++++++++++++++++
3 files changed, 77 insertions(+)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 9e9dbbe..589394b 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,9 +4,15 @@
#ifdef CONFIG_CGROUP_RDT

#include <linux/cgroup.h>
+
+#define MSR_IA32_PQR_ASSOC 0xc8f
#define MAX_CBM_LENGTH 32
#define IA32_L3_CBM_BASE 0xc90
#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)
+DECLARE_PER_CPU(unsigned int, x86_cpu_clos);
+extern struct static_key rdt_enable_key;
+extern void __rdt_sched_in(void);
+

struct rdt_subsys_info {
/* Clos Bitmap to keep track of available CLOSids.*/
@@ -24,6 +30,11 @@ struct clos_cbm_map {
unsigned int clos_refcnt;
};

+static inline bool rdt_enabled(void)
+{
+ return static_key_false(&rdt_enable_key);
+}
+
/*
* Return rdt group corresponding to this container.
*/
@@ -37,5 +48,38 @@ static inline struct intel_rdt *parent_rdt(struct intel_rdt *ir)
return css_rdt(ir->css.parent);
}

+/*
+ * Return rdt group to which this task belongs.
+ */
+static inline struct intel_rdt *task_rdt(struct task_struct *task)
+{
+ return css_rdt(task_css(task, intel_rdt_cgrp_id));
+}
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports L3 cache allocation.
+ * - When support is present and enabled, does not do any
+ * IA32_PQR_MSR writes until the user starts really using the feature
+ * ie creates a rdt cgroup directory and assigns a cache_mask thats
+ * different from the root cgroup's cache_mask.
+ * - Closids are allocated so that different cgroup directories
+ * with same cache_mask gets the same CLOSid. This minimizes CLOSids
+ * used and reduces MSR write frequency.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+ if (rdt_enabled())
+ __rdt_sched_in();
+}
+
+#else
+
+static inline void intel_rdt_sched_in(struct task_struct *task) {}
+
#endif
#endif
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 751bf4b..9149577 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,9 @@ struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);

+#include <asm/intel_rdt.h>
+#define finish_arch_switch(prev) intel_rdt_sched_in()
+
#ifdef CONFIG_X86_32

#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 125318d..fe3ce4e 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -35,6 +35,8 @@ static struct clos_cbm_map *ccmap;
static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;
+struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
+DEFINE_PER_CPU(unsigned int, x86_cpu_clos);

/*
* Mask of CPUs for writing CBM values. We only need one per-socket.
@@ -79,6 +81,33 @@ static void intel_rdt_free_closid(unsigned int clos)
clear_bit(clos, rdtss_info.closmap);
}

+void __rdt_sched_in(void)
+{
+ struct task_struct *task = current;
+ struct intel_rdt *ir;
+ unsigned int clos;
+
+ /*
+ * This needs to be fixed
+ * to cache the whole PQR instead of just CLOSid.
+ * PQR has closid in high 32 bits and CQM-RMID in low 10 bits.
+ * Should not write a 0 to the low 10 bits of PQR
+ * and corrupt RMID.
+ */
+ clos = this_cpu_read(x86_cpu_clos);
+
+ rcu_read_lock();
+ ir = task_rdt(task);
+ if (ir->clos == clos) {
+ rcu_read_unlock();
+ return;
+ }
+
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, ir->clos);
+ this_cpu_write(x86_cpu_clos, ir->clos);
+ rcu_read_unlock();
+}
+
static void __clos_get(unsigned int closid)
{
struct clos_cbm_map *ccm = &ccmap[closid];
@@ -433,6 +462,7 @@ static int __init intel_rdt_late_init(void)
__hotcpu_notifier(intel_rdt_cpu_notifier, 0);

cpu_notifier_register_done();
+ static_key_slow_inc(&rdt_enable_key);

pr_info("Max bitmask length:%u,Max ClosIds: %u\n", max_cbm_len, maxid);
out_err:
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/