Re: [PATCH V15 11/11] x86,cgroup/intel_rdt : Add a cgroup interface to manage Intel cache allocation

From: Marcelo Tosatti
Date: Wed Nov 18 2015 - 16:00:41 EST


On Thu, Oct 01, 2015 at 11:09:45PM -0700, Fenghua Yu wrote:
> Add a new cgroup 'intel_rdt' to manage cache allocation. Each cgroup
> directory is associated with a class of service id(closid). To map a
> task with closid during scheduling, this patch removes the closid field
> from task_struct and uses the already existing 'cgroups' field in
> task_struct.
>
> The cgroup has a file 'l3_cbm' which represents the L3 cache capacity
> bitmask(CBM). The CBM is global for the whole system currently. The
> capacity bitmask needs to have only contiguous bits set and number of
> bits that can be set is less than the max bits that can be set. The
> tasks belonging to a cgroup get to fill in the L3 cache represented by
> the capacity bitmask of the cgroup. For ex: if the max bits in the CBM
> is 10 and the cache size is 10MB, each bit represents 1MB of cache
> capacity.
>
> Root cgroup always has all the bits set in the l3_cbm. User can create
> more cgroups with mkdir syscall. By default the child cgroups inherit
> the capacity bitmask(CBM) from parent. User can change the CBM specified
> in hex for each cgroup. Each unique bitmask is associated with a class
> of service ID and an -ENOSPC is returned once we run out of
> closids.
>
> Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
> Signed-off-by: Fenghua Yu <fenghua.yu@xxxxxxxxx>
> ---
> arch/x86/include/asm/intel_rdt.h | 37 +++++++-
> arch/x86/kernel/cpu/intel_rdt.c | 194 +++++++++++++++++++++++++++++++++++++--
> include/linux/cgroup_subsys.h | 4 +
> include/linux/sched.h | 3 -
> init/Kconfig | 4 +-
> 5 files changed, 229 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
> index afb6da3..fbe1e00 100644
> --- a/arch/x86/include/asm/intel_rdt.h
> +++ b/arch/x86/include/asm/intel_rdt.h
> @@ -3,6 +3,7 @@
>
> #ifdef CONFIG_INTEL_RDT
>
> +#include <linux/cgroup.h>
> #include <linux/jump_label.h>
>
> #define MAX_CBM_LENGTH 32
> @@ -12,20 +13,54 @@
> extern struct static_key rdt_enable_key;
> void __intel_rdt_sched_in(void *dummy);
>
> +struct intel_rdt {
> + struct cgroup_subsys_state css;
> + u32 closid;
> +};
> +
> struct clos_cbm_table {
> unsigned long l3_cbm;
> unsigned int clos_refcnt;
> };
>
> /*
> + * Return rdt group corresponding to this container.
> + */
> +static inline struct intel_rdt *css_rdt(struct cgroup_subsys_state *css)
> +{
> + return css ? container_of(css, struct intel_rdt, css) : NULL;
> +}
> +
> +static inline struct intel_rdt *parent_rdt(struct intel_rdt *ir)
> +{
> + return css_rdt(ir->css.parent);
> +}
> +
> +/*
> + * Return rdt group to which this task belongs.
> + */
> +static inline struct intel_rdt *task_rdt(struct task_struct *task)
> +{
> + return css_rdt(task_css(task, intel_rdt_cgrp_id));
> +}
> +
> +/*
> * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
> *
> * Following considerations are made so that this has minimal impact
> * on scheduler hot path:
> * - This will stay as no-op unless we are running on an Intel SKU
> * which supports L3 cache allocation.
> + * - When support is present and enabled, does not do any
> + * IA32_PQR_MSR writes until the user starts really using the feature
> + * i.e. creates an rdt cgroup directory and assigns a cache_mask that's
> + * different from the root cgroup's cache_mask.
> * - Caches the per cpu CLOSid values and does the MSR write only
> - * when a task with a different CLOSid is scheduled in.
> + * when a task with a different CLOSid is scheduled in. That
> + * means the task belongs to a different cgroup.
> + * - Closids are allocated so that different cgroup directories
> + * with the same cache_mask get the same CLOSid. This minimizes CLOSids
> + * used and reduces MSR write frequency.
> */
> static inline void intel_rdt_sched_in(void)
> {
> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
> index ecaf8e6..cb4d2ef 100644
> --- a/arch/x86/kernel/cpu/intel_rdt.c
> +++ b/arch/x86/kernel/cpu/intel_rdt.c
> @@ -53,6 +53,10 @@ static cpumask_t tmp_cpumask;
> static DEFINE_MUTEX(rdt_group_mutex);
> struct static_key __read_mostly rdt_enable_key = STATIC_KEY_INIT_FALSE;
>
> +static struct intel_rdt rdt_root_group;
> +#define rdt_for_each_child(pos_css, parent_ir) \
> + css_for_each_child((pos_css), &(parent_ir)->css)
> +
> struct rdt_remote_data {
> int msr;
> u64 val;
> @@ -108,17 +112,16 @@ static inline bool cache_alloc_supported(struct cpuinfo_x86 *c)
> return false;
> }
>
> -
> void __intel_rdt_sched_in(void *dummy)
> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> - u32 closid = current->closid;
> + struct intel_rdt *ir = task_rdt(current);
>
> - if (closid == state->closid)
> + if (ir->closid == state->closid)
> return;
>
> - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
> - state->closid = closid;
> + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, ir->closid);

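What if another CPU runs

intel_cache_alloc_cbm_write()
    if (cbm_search(cbmvalue, &closid)) {
        ir->closid = closid;

here, i.e. between the wrmsr() above and the state->closid update below?
ir->closid is re-read for the comparison, the MSR write and the cache
update, so the value cached in state->closid could end up different from
the one actually written to the MSR. Probably a spinlock is necessary.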

> + state->closid = ir->closid;
> }
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/