[PATCH 07/10] x86/intel_rdt: Add support for cache bit mask management

From: Vikas Shivappa
Date: Wed Jun 03 2015 - 15:12:30 EST


The change adds a file cache_mask to the RDT cgroup which represents the
cache bit mask(CBM) for the cgroup. cache_mask is specific to the Cache
allocation sub-feature of RDT. The tasks in the RDT cgroup would get to
fill the L3 cache represented by the cgroup's cache_mask file.

Update to the CBM is done by writing to the IA32_L3_MASK_n. The RDT
cgroup follows cgroup hierarchy ,mkdir and adding tasks to the cgroup
never fails. When a child cgroup is created it inherits the CLOSid and
the cache_mask from its parent. When a user changes the default CBM for
a cgroup, a new CLOSid may be allocated if the cache_mask was not used
before. If the new CBM is the one that is already used, the count for
that CLOSid<->CBM is incremented. The changing of 'cache_mask' may fail
with -ENOSPC once the kernel runs out of maximum CLOSids it can support.

User can create as many cgroups as he wants but having different CBMs at
the same time is restricted by the maximum number of CLOSids .Kernel
maintains a CLOSid<->cbm mapping which keeps count of cgroups using a
CLOSid.

Reuse of CLOSids for cgroups with same bitmask also has following
advantages:
- This helps to use the scant CLOSids optimally.
- This also implies that during context switch, write to PQR-MSR is
done only when a task with a different bitmask is scheduled in.

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
Changes as per Thomas's feedback:

- changed the names of functions to only have intel_ prefix for external
APIs.
-replaced (void *)&closid with (void *)closid when calling
on_each_cpu_mask
-fixed the reference release of closid during cache bitmask write.
-changed the code to not ignore a cache mask which has bits set outside
of the max bits allowed. It returns an error instead.
-replaced bitmap_set(&max_mask, 0, max_cbm_len) with max_mask =
(1ULL << max_cbm) - 1.

arch/x86/include/asm/intel_rdt.h | 3 +
arch/x86/kernel/cpu/intel_rdt.c | 201 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 147c9cf..ba4601f 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
#ifdef CONFIG_CGROUP_RDT

#include <linux/cgroup.h>
+#define MAX_CBM_LENGTH 32
+#define IA32_L3_CBM_BASE 0xc90
+#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + x)

struct rdt_subsys_info {
unsigned long *closmap;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 7d455b9..f857381 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -34,6 +34,13 @@ static struct clos_cbm_map *ccmap;
static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;
+/*
+ * Mask of CPUs for writing CBM values. We only need one CPU per-socket.
+ */
+static cpumask_t rdt_cpumask;
+
+#define rdt_for_each_child(pos_css, parent_ir) \
+ css_for_each_child((pos_css), &(parent_ir)->css)

static inline void clos_get(unsigned int closid)
{
@@ -116,11 +123,189 @@ static void intel_rdt_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&rdt_group_mutex);
}

+static int intel_cache_alloc_cbm_read(struct seq_file *m, void *v)
+{
+ struct intel_rdt *ir = css_rdt(seq_css(m));
+
+ seq_printf(m, "%08lx\n", ccmap[ir->clos].cache_mask);
+
+ return 0;
+}
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+ unsigned long maxcbm = MAX_CBM_LENGTH;
+ unsigned long first_bit, zero_bit;
+
+ if (!var)
+ return false;
+
+ first_bit = find_next_bit(&var, maxcbm, 0);
+ zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+ if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+ return false;
+
+ return true;
+}
+
+static int cbm_validate(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+ struct cgroup_subsys_state *css;
+ struct intel_rdt *par, *c;
+ unsigned long *cbm_tmp;
+ int err = 0;
+
+ if (!cbm_is_contiguous(cbmvalue)) {
+ pr_err("bitmask should have >= 1 bit and be contiguous\n");
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ par = parent_rdt(ir);
+ cbm_tmp = &ccmap[par->clos].cache_mask;
+ if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rcu_read_lock();
+ rdt_for_each_child(css, ir) {
+ c = css_rdt(css);
+ cbm_tmp = &ccmap[c->clos].cache_mask;
+ if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+ rcu_read_unlock();
+ pr_err("Children's mask not a subset\n");
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+ rcu_read_unlock();
+out_err:
+
+ return err;
+}
+
+static bool cbm_search(unsigned long cbm, int *closid)
+{
+ int maxid = boot_cpu_data.x86_rdt_max_closid;
+ unsigned int i;
+
+ for (i = 0; i < maxid; i++) {
+ if (bitmap_equal(&cbm, &ccmap[i].cache_mask, MAX_CBM_LENGTH)) {
+ *closid = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void closcbm_map_dump(void)
+{
+ int i;
+
+ pr_debug("CBMMAP\n");
+ for (i = 0; i < boot_cpu_data.x86_rdt_max_closid; i++) {
+ pr_debug("cache_mask: 0x%x,clos_refcnt: %u\n",
+ (unsigned int)ccmap[i].cache_mask, ccmap[i].clos_refcnt);
+ }
+}
+
+static void cbm_cpu_update(void *info)
+{
+ unsigned int closid = (unsigned int) info;
+
+ wrmsrl(CBM_FROM_INDEX(closid), ccmap[closid].cache_mask);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static inline void cbm_update_all(unsigned int closid)
+{
+ on_each_cpu_mask(&rdt_cpumask, cbm_cpu_update, (void *)closid, 1);
+}
+
+/*
+ * intel_cache_alloc_cbm_write() - Validates and writes the
+ * cache bit mask(cbm) to the IA32_L3_MASK_n
+ * and also store the same in the ccmap.
+ *
+ * CLOSids are reused for cgroups which have same bitmask.
+ * This helps to use the scant CLOSids optimally. This also
+ * implies that at context switch write to PQR-MSR is done
+ * only when a task with a different bitmask is scheduled in.
+ */
+static int intel_cache_alloc_cbm_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 cbmvalue)
+{
+ u32 max_cbm = boot_cpu_data.x86_rdt_max_cbm_len;
+ struct intel_rdt *ir = css_rdt(css);
+ unsigned int closid;
+ ssize_t err = 0;
+ u64 max_mask;
+
+ if (ir == &rdt_root_group)
+ return -EPERM;
+
+ /*
+ * Need global mutex as cbm write may allocate a closid.
+ */
+ mutex_lock(&rdt_group_mutex);
+
+ max_mask = (1ULL << max_cbm) - 1;
+ if (cbmvalue & ~max_mask) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (cbmvalue == ccmap[ir->clos].cache_mask)
+ goto out;
+
+ err = cbm_validate(ir, cbmvalue);
+ if (err)
+ goto out;
+
+ /*
+ * Try to get a reference for a different CLOSid and release the
+ * reference to the current CLOSid.
+ */
+ if (cbm_search(cbmvalue, &closid)) {
+ clos_put(ir->clos);
+ ir->clos = closid;
+ clos_get(closid);
+ } else {
+ closid = ir->clos;
+ err = clos_alloc(ir);
+ if (err)
+ goto out;
+
+ clos_put(closid);
+ ccmap[ir->clos].cache_mask = cbmvalue;
+ cbm_update_all(ir->clos);
+ }
+ closcbm_map_dump();
+out:
+ mutex_unlock(&rdt_group_mutex);
+
+ return err;
+}
+
+static inline void rdt_cpumask_update(int cpu)
+{
+ cpumask_t tmp;
+
+ cpumask_and(&tmp, &rdt_cpumask, topology_core_cpumask(cpu));
+ if (cpumask_empty(&tmp))
+ cpumask_set_cpu(cpu, &rdt_cpumask);
+}
+
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ int maxid, max_cbm_len, err = 0, i;
static struct clos_cbm_map *ccm;
- int maxid, max_cbm_len, err = 0;
size_t sizeb;

if (!cpu_has(c, X86_FEATURE_CAT_L3)) {
@@ -151,6 +336,9 @@ static int __init intel_rdt_late_init(void)
ccm->cache_mask = (1ULL << max_cbm_len) - 1;
ccm->clos_refcnt = 1;

+ for_each_online_cpu(i)
+ rdt_cpumask_update(i);
+
pr_info("Intel cache allocation enabled\n");
out_err:

@@ -159,8 +347,19 @@ out_err:

late_initcall(intel_rdt_late_init);

+static struct cftype rdt_files[] = {
+ {
+ .name = "cache_mask",
+ .seq_show = intel_cache_alloc_cbm_read,
+ .write_u64 = intel_cache_alloc_cbm_write,
+ .mode = 0666,
+ },
+ { } /* terminate */
+};
+
struct cgroup_subsys intel_rdt_cgrp_subsys = {
.css_alloc = intel_rdt_css_alloc,
.css_free = intel_rdt_css_free,
+ .legacy_cftypes = rdt_files,
.early_init = 0,
};
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/