Re: [PATCH v2 22/33] x86/intel_rdt.c: Extend RDT to per cache and per resources
From: Thomas Gleixner
Date: Thu Sep 08 2016 - 11:00:46 EST
On Thu, 8 Sep 2016, Fenghua Yu wrote:
> +#define __DCBM_TABLE_INDEX(x) (x << 1)
> +#define __ICBM_TABLE_INDEX(x) ((x << 1) + 1)
This macro mess is completely undocumented.
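A comment explaining the layout would go a long way. Assuming the layout is
what the shifts suggest, something like:

	/*
	 * With CDP enabled, the data and instruction CBMs of a closid
	 * occupy consecutive entries in the CBM table:
	 *
	 *	data  CBM: closid << 1
	 *	instr CBM: (closid << 1) + 1
	 */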
> +inline int get_dcbm_table_index(int x)
static inline ???
> +{
> + return DCBM_TABLE_INDEX(x);
> +}
> +
> +inline int get_icbm_table_index(int x)
> +{
> + return ICBM_TABLE_INDEX(x);
> +}
Why are you introducing these when they are not used at all?
> /*
> * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
> * as it does not have CPUID enumeration support for Cache allocation.
> @@ -98,41 +123,141 @@ static inline bool cache_alloc_hsw_probe(void)
>
> wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
>
> - boot_cpu_data.x86_cache_max_closid = 4;
> - boot_cpu_data.x86_cache_max_cbm_len = 20;
> + boot_cpu_data.x86_l3_max_closid = 4;
> + boot_cpu_data.x86_l3_max_cbm_len = 20;
So if you actually change the name of the cpudata struct member, then it
would make sense to split that out into a separate patch. But hell, the
patch order of this stuff is an unholy mess anyway.
> min_bitmask_len = 2;
>
> return true;
> }
>
> +u32 max_cbm_len(int level)
> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return boot_cpu_data.x86_l3_max_cbm_len;
> + default:
> + break;
> + }
> +
> + return (u32)~0;
> +}
> +u64 max_cbm(int level)
More functions without users and of course without a proper prefix for
public consumption. Documentation is not available either.
> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return (1ULL << boot_cpu_data.x86_l3_max_cbm_len) - 1;
So the above max_cbm_len() returns x86_l3_max_cbm_len, which is set up as:

	cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
	c->x86_l3_max_cbm_len = eax + 1;

i.e. the content of leaf 10:1 EAX plus 1.
According to the SDM:

	EAX 4:0: Length of the capacity bit mask for the corresponding ResID.
So first of all: why is there a "+ 1"? Again, that's not documented in the
cpuid initialization code at all.
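If the "+ 1" is because the SDM value uses minus-one notation, then say so
where the value is set up, e.g.:

	/* CPUID.10H:1.EAX[4:0] reports the CBM length minus one */
	c->x86_l3_max_cbm_len = eax + 1;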
So now you take that magic number and return:

	(2 ^ magic) - 1
Cute. To be honest I'm too lazy to read through the SDM and figure it out
myself. This kind of stuff belongs in the code as comments. I seriously
doubt that the function names match the actual meaning.
> + default:
> + break;
> + }
> +
> + return (u64)~0;
What kind of return code is this?
> +static inline bool cat_l3_supported(struct cpuinfo_x86 *c)
> +{
> + if (cpu_has(c, X86_FEATURE_CAT_L3))
> + return true;
> +
> + /*
> + * Probe for Haswell server CPUs.
> + */
> + if (c->x86 == 0x6 && c->x86_model == 0x3f)
> + return cache_alloc_hsw_probe();
Ah now we have an actual user for that haswell probe thingy ....
> + return false;
> +}
> +
> void __intel_rdt_sched_in(void *dummy)
I still have not found a reasonable explanation for this dummy argument.
> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> + struct rdtgroup *rdtgrp;
> + int closid;
> + int domain;
Sigh.
> /*
> - * Currently closid is always 0. When user interface is added,
> - * closid will come from user interface.
> + * If this task is assigned to an rdtgroup, use it.
> + * Else use the group assigned to this cpu.
> */
> - if (state->closid == 0)
> + rdtgrp = current->rdtgroup;
> + if (!rdtgrp)
> + rdtgrp = this_cpu_read(cpu_rdtgroup);
This makes actually sense! Thanks for listening!
> +
> + domain = this_cpu_read(cpu_shared_domain);
> + closid = rdtgrp->resource.closid[domain];
> +
> + if (closid == state->closid)
> return;
>
> - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
> - state->closid = 0;
> + state->closid = closid;
> + /* Don't really write PQR register in simulation mode. */
> + if (unlikely(rdt_opts.simulate_cat_l3))
> + return;
> +
> + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
> }
>
> /*
> * When cdp mode is enabled, refcnt is maintained in the dcache_cbm entry.
> */
> -static inline void closid_get(u32 closid)
> +inline void closid_get(u32 closid, int domain)
s/static inline/inline/ What the heck?
Can you please explain what this is doing?
> {
> - struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> -
> lockdep_assert_held(&rdtgroup_mutex);
>
> - cct->clos_refcnt++;
> + if (cat_l3_enabled) {
> + int l3_domain;
> + int dindex;
	int l3_domain, dindex;
> + l3_domain = shared_domain[domain].l3_domain;
> + dindex = DCBM_TABLE_INDEX(closid);
> + l3_cctable[l3_domain][dindex].clos_refcnt++;
> + if (cdp_enabled) {
> + int iindex = ICBM_TABLE_INDEX(closid);
And if you call that variable 'index' instead of 'dindex', then you don't
need the extra 'iindex' one.
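I.e. something like this, untested:

	int l3_domain = shared_domain[domain].l3_domain;
	int index = DCBM_TABLE_INDEX(closid);

	l3_cctable[l3_domain][index].clos_refcnt++;
	if (cdp_enabled) {
		index = ICBM_TABLE_INDEX(closid);
		l3_cctable[l3_domain][index].clos_refcnt++;
	}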
> +
> + l3_cctable[l3_domain][iindex].clos_refcnt++;
So now you have a separate refcount for the Icache part, but the comment
above the function still says otherwise.
> + }
> + }
> }
>
> -static int closid_alloc(u32 *closid)
> +int closid_alloc(u32 *closid, int domain)
> {
> u32 maxid;
> u32 id;
> @@ -140,105 +265,215 @@ static int closid_alloc(u32 *closid)
> lockdep_assert_held(&rdtgroup_mutex);
>
> maxid = cconfig.max_closid;
> - id = find_first_zero_bit(cconfig.closmap, maxid);
> + id = find_first_zero_bit((unsigned long *)cconfig.closmap[domain],
Why do you need a typecast here? Get your damned structs straight.
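If cconfig.closmap is typed properly as 'unsigned long **', then this is
simply:

	id = find_first_zero_bit(cconfig.closmap[domain], maxid);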
> + maxid);
> +
> if (id == maxid)
> return -ENOSPC;
>
> - set_bit(id, cconfig.closmap);
> - closid_get(id);
> + set_bit(id, (unsigned long *)cconfig.closmap[domain]);
> + closid_get(id, domain);
> *closid = id;
> - cconfig.closids_used++;
>
> return 0;
> }
>
> -static inline void closid_free(u32 closid)
> +unsigned int get_domain_num(int level)
> {
> - clear_bit(closid, cconfig.closmap);
> - cctable[DCBM_TABLE_INDEX(closid)].cbm = 0;
> -
> - if (WARN_ON(!cconfig.closids_used))
> - return;
> + if (level == CACHE_LEVEL3)
> + return cpumask_weight(&rdt_l3_cpumask);
get_domain_num(level) suggests to me that it returns the domain number
corresponding to the level, but it actually returns the number of bits set
in the rdt_l3_cpumask. Very intuitive - NOT!
> + else
> + return -EINVAL;
Proper return value for a function returning 'unsigned int' ....
> +}
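If the error case is really needed, then the return type wants to be 'int',
i.e. something like this (untested, and a less misleading name would not
hurt either):

	int get_domain_num(int level)
	{
		if (level == CACHE_LEVEL3)
			return cpumask_weight(&rdt_l3_cpumask);
		return -EINVAL;
	}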
>
> - cconfig.closids_used--;
> +int level_to_leaf(int level)
> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return 3;
> + default:
> + return -EINVAL;
> + }
> }
>
> -static void closid_put(u32 closid)
> +void closid_free(u32 closid, int domain, int level)
> {
> - struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> + struct clos_cbm_table **cctable;
> + int leaf;
> + struct cpumask *mask;
> + int cpu;
> +
> + if (level == CACHE_LEVEL3)
> + cctable = l3_cctable;
Oh well. Why does this assignment need to happen here?
> +
> + clear_bit(closid, (unsigned long *)cconfig.closmap[domain]);
> +
> + if (level == CACHE_LEVEL3) {
And not here where it actually makes sense?
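I.e.:

	if (level == CACHE_LEVEL3) {
		cctable = l3_cctable;
		cctable[domain][closid].cbm = max_cbm(level);
		...
	}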
> + cctable[domain][closid].cbm = max_cbm(level);
> + leaf = level_to_leaf(level);
> + mask = &cache_domains[leaf].shared_cpu_map[domain];
> + cpu = cpumask_first(mask);
> + smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);
A comment explaining that this must be done on one of the cpus in @domain
would be too helpful.
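E.g.:

	/*
	 * The CBM MSRs are per cache domain, so the update must run
	 * on one of the CPUs in @domain.
	 */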
> + }
> +}
>
> +static void _closid_put(u32 closid, struct clos_cbm_table *cct,
Please use two underscores so it's obvious.
> + int domain, int level)
> +{
> lockdep_assert_held(&rdtgroup_mutex);
> if (WARN_ON(!cct->clos_refcnt))
> return;
>
> if (!--cct->clos_refcnt)
> - closid_free(closid);
> + closid_free(closid, domain, level);
> }
>
> -static void msr_cpu_update(void *arg)
> +void closid_put(u32 closid, int domain)
> +{
> + struct clos_cbm_table *cct;
> +
> + if (cat_l3_enabled) {
> + int l3_domain = shared_domain[domain].l3_domain;
> +
> + cct = &l3_cctable[l3_domain][DCBM_TABLE_INDEX(closid)];
> + _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> + if (cdp_enabled) {
> + cct = &l3_cctable[l3_domain][ICBM_TABLE_INDEX(closid)];
> + _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> + }
> + }
> +}
> +
> +void msr_cpu_update(void *arg)
> {
> struct rdt_remote_data *info = arg;
>
> + if (unlikely(rdt_opts.verbose))
> + pr_info("Write %lx to msr %x on cpu%d\n",
> + (unsigned long)info->val, info->msr,
> + smp_processor_id());
That's what DYNAMIC_DEBUG is for.
> +
> + if (unlikely(rdt_opts.simulate_cat_l3))
> + return;
Do we really need all this simulation stuff?
> +
> wrmsrl(info->msr, info->val);
> }
>
> +static struct cpumask *rdt_cache_cpumask(int level)
> +{
> + return &rdt_l3_cpumask;
That's a really useful helper .... Your choice of checking 'level' five
times in a row in various helpers versus returning l3 unconditionally is at
least interesting.
> +int get_cache_leaf(int level, int cpu)
> +{
> + int index;
> + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
this_cpu_ci is a complete misnomer as it suggests that it's actually the
cacheinfo for 'this cpu', i.e. the cpu on which the code is executing.
> + struct cacheinfo *this_leaf;
> + int num_leaves = this_cpu_ci->num_leaves;
> +
> + for (index = 0; index < num_leaves; index++) {
> + this_leaf = this_cpu_ci->info_list + index;
> + if (this_leaf->level == level)
> + return index;
The function is misnamed as well. It does not return the cache leaf, it
returns the leaf index .....
> + }
Why do you have a cacheinfo related function in this RDT code?
> +
> + return -EINVAL;
> +}
> +
> +static struct cpumask *get_shared_cpu_map(int cpu, int level)
> +{
> + int index;
> + struct cacheinfo *leaf;
> + struct cpu_cacheinfo *cpu_ci = get_cpu_cacheinfo(cpu);
> +
> + index = get_cache_leaf(level, cpu);
> + if (index < 0)
> + return 0;
> +
> + leaf = cpu_ci->info_list + index;
While here you actually get the leaf.
> +
> + return &leaf->shared_cpu_map;
> }
> static int clear_rdtgroup_cpumask(unsigned int cpu)
> @@ -293,63 +531,270 @@ static int clear_rdtgroup_cpumask(unsigned int cpu)
>
> static int intel_rdt_offline_cpu(unsigned int cpu)
> {
> - int i;
> + struct cpumask *shared_cpu_map;
> + int new_cpu;
> + int l3_domain;
> + int level;
> + int leaf;
Sigh. 1/3 of the line space is wasted for single variable declarations.
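I.e.:

	struct cpumask *shared_cpu_map;
	int new_cpu, l3_domain, level, leaf;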
> mutex_lock(&rdtgroup_mutex);
> - if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
> - mutex_unlock(&rdtgroup_mutex);
> - return;
> - }
>
> - cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
> - cpumask_clear_cpu(cpu, &tmp_cpumask);
> - i = cpumask_any(&tmp_cpumask);
> + level = CACHE_LEVEL3;
I have a hard time understanding the value of that 'level' variable, but
well, that's the least of my worries with that code.
> +
> + l3_domain = per_cpu(cpu_l3_domain, cpu);
> + leaf = level_to_leaf(level);
> + shared_cpu_map = &cache_domains[leaf].shared_cpu_map[l3_domain];
>
> - if (i < nr_cpu_ids)
> - cpumask_set_cpu(i, &rdt_cpumask);
> + cpumask_clear_cpu(cpu, &rdt_l3_cpumask);
> + cpumask_clear_cpu(cpu, shared_cpu_map);
> + if (cpumask_empty(shared_cpu_map))
> + goto out;
So what clears @cpu in the rdtgroup cpumask?
> +
> + new_cpu = cpumask_first(shared_cpu_map);
> + rdt_cpumask_update(&rdt_l3_cpumask, new_cpu, level);
>
> clear_rdtgroup_cpumask(cpu);
Can you please use a consistent prefix and naming scheme?
	rdt_cpumask_update()
	clear_rdtgroup_cpumask()

WTF?
> +out:
> mutex_unlock(&rdtgroup_mutex);
> + return 0;
> +}
> +
> +/*
> + * Initialize per-cpu cpu_l3_domain.
> + *
> + * cpu_l3_domain numbers are consequtive integer starting from 0.
> + * Sets up 1:1 mapping of cpu id and cpu_l3_domain.
> + */
> +static int __init cpu_cache_domain_init(int level)
> +{
> + int i, j;
> + int max_cpu_cache_domain = 0;
> + int index;
> + struct cacheinfo *leaf;
> + int *domain;
> + struct cpu_cacheinfo *cpu_ci;
Eyes hurt.
> +
> + for_each_online_cpu(i) {
> + domain = &per_cpu(cpu_l3_domain, i);
per_cpu_ptr() exists for a reason.
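I.e.:

	domain = per_cpu_ptr(&cpu_l3_domain, i);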
> + if (*domain == -1) {
> + index = get_cache_leaf(level, i);
> + if (index < 0)
> + return -EINVAL;
> +
> + cpu_ci = get_cpu_cacheinfo(i);
> + leaf = cpu_ci->info_list + index;
> + if (cpumask_empty(&leaf->shared_cpu_map)) {
> + WARN(1, "no shared cpu for L2\n");
So L2 is always the right thing for every value of @level? And what the
heck is the value of that WARN? Nothing, because the callchain is already
known. What's worse is that it tells nothing about @level, @index and @i
(which should be named @cpu).
> + return -EINVAL;
> + }
> +
> + for_each_cpu(j, &leaf->shared_cpu_map) {
So again independent of @level you fiddle with cpu_l3_domain. Interesting.
> + domain = &per_cpu(cpu_l3_domain, j);
> + *domain = max_cpu_cache_domain;
> + }
> + max_cpu_cache_domain++;
What's the actual meaning of max_cpu_cache_domain?
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int __init rdt_setup(char *str)
> +{
> + char *tok;
> +
> + while ((tok = strsep(&str, ",")) != NULL) {
> + if (!*tok)
> + return -EINVAL;
> +
> + if (strcmp(tok, "simulate_cat_l3") == 0) {
> + pr_info("Simulate CAT L3\n");
> + rdt_opts.simulate_cat_l3 = true;
So this goes into rdt_opts
> + } else if (strcmp(tok, "disable_cat_l3") == 0) {
> + pr_info("CAT L3 is disabled\n");
> + disable_cat_l3 = true;
While this is a distinct control variable. Very consistent.
> + } else {
> + pr_info("Invalid rdt option\n");
Very helpful w/o printing the actual option ....
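I.e.:

	pr_info("Invalid rdt option: %s\n", tok);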
> + return -EINVAL;
> + }
> + }
> +
> + return 0;
> +}
> +__setup("resctrl=", rdt_setup);
> +
> +static inline bool resource_alloc_enabled(void)
> +{
> + return cat_l3_enabled;
> +}
Oh well.
> +
> +static int shared_domain_init(void)
> +{
> + int l3_domain_num = get_domain_num(CACHE_LEVEL3);
> + int size;
> + int domain;
> + struct cpumask *cpumask;
> + struct cpumask *shared_cpu_map;
> + int cpu;
More random variable declarations.
> + if (cat_l3_enabled) {
> + shared_domain_num = l3_domain_num;
> + cpumask = &rdt_l3_cpumask;
> + } else
> + return -EINVAL;
Missing curly braces.
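Per CodingStyle, if one branch of a conditional needs braces then all
branches get them:

	if (cat_l3_enabled) {
		shared_domain_num = l3_domain_num;
		cpumask = &rdt_l3_cpumask;
	} else {
		return -EINVAL;
	}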
> +
> + size = shared_domain_num * sizeof(struct shared_domain);
> + shared_domain = kzalloc(size, GFP_KERNEL);
> + if (!shared_domain)
> + return -EINVAL;
> +
> + domain = 0;
> + for_each_cpu(cpu, cpumask) {
> + if (cat_l3_enabled)
> + shared_domain[domain].l3_domain =
> + per_cpu(cpu_l3_domain, cpu);
> + else
> + shared_domain[domain].l3_domain = -1;
> +
> + shared_cpu_map = get_shared_cpu_map(cpu, CACHE_LEVEL3);
> +
> + cpumask_copy(&shared_domain[domain].cpumask, shared_cpu_map);
What's the point of updating the cpumask when the thing is disabled? If
there is a reason then this should be documented in a comment.
> + domain++;
> + }
> + for_each_online_cpu(cpu) {
> + if (cat_l3_enabled)
> + per_cpu(cpu_shared_domain, cpu) =
> + per_cpu(cpu_l3_domain, cpu);
More missing curly braces. And using an intermediate variable would get rid
of these hard to read line breaks.
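I.e. something like:

	for_each_online_cpu(cpu) {
		int domain = per_cpu(cpu_l3_domain, cpu);

		if (cat_l3_enabled)
			per_cpu(cpu_shared_domain, cpu) = domain;
	}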
> + }
> +
> + return 0;
> +}
> +
> +static int cconfig_init(int maxid)
> +{
> + int num;
> + int domain;
> + unsigned long *closmap_block;
> + int maxid_size;
> +
> + maxid_size = BITS_TO_LONGS(maxid);
> + num = maxid_size * shared_domain_num;
> + cconfig.closmap = kcalloc(maxid, sizeof(unsigned long *), GFP_KERNEL);
Really intuitive. You calculate 'num' right before allocating the closmap
pointers, but only use it in the next allocation.
> + if (!cconfig.closmap)
> + goto out_free;
> +
> + closmap_block = kcalloc(num, sizeof(unsigned long), GFP_KERNEL);
> + if (!closmap_block)
> + goto out_free;
> +
> + for (domain = 0; domain < shared_domain_num; domain++)
> + cconfig.closmap[domain] = (unsigned long *)closmap_block +
More random type casting.
> + domain * maxid_size;
Why don't you allocate that whole mess in one go?

	unsigned int ptrsize, mapsize, size, d;
	void *p;

	ptrsize = num_shared_domains * sizeof(unsigned long *);
	mapsize = BITS_TO_LONGS(maxid) * sizeof(unsigned long);
	size = ptrsize + num_shared_domains * mapsize;

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	cconfig.closmap = p;
	cconfig.max_closid = maxid;
	p += ptrsize;
	for (d = 0; d < num_shared_domains; d++, p += mapsize)
		cconfig.closmap[d] = p;

	return 0;

Would be too simple. Once more.
> +
> + cconfig.max_closid = maxid;
> +
> + return 0;
> +out_free:
> + kfree(cconfig.closmap);
> + kfree(closmap_block);
> + return -ENOMEM;
> +}
> +
> +static int __init cat_cache_init(int level, int maxid,
> + struct clos_cbm_table ***cctable)
> +{
> + int domain_num;
> + int domain;
> + int size;
> + int ret = 0;
> + struct clos_cbm_table *p;
> +
> + domain_num = get_domain_num(level);
> + size = domain_num * sizeof(struct clos_cbm_table *);
> + *cctable = kzalloc(size, GFP_KERNEL);
> + if (!*cctable) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + size = maxid * domain_num * sizeof(struct clos_cbm_table);
> + p = kzalloc(size, GFP_KERNEL);
> + if (!p) {
> + kfree(*cctable);
> + ret = -ENOMEM;
> + goto out;
> + }
> + for (domain = 0; domain < domain_num; domain++)
> + (*cctable)[domain] = p + domain * maxid;
Same crap.
> +
> + ret = cpu_cache_domain_init(level);
> + if (ret) {
> + kfree(*cctable);
> + kfree(p);
> + }
> +out:
> + return ret;
> }
>
> static int __init intel_rdt_late_init(void)
> {
> struct cpuinfo_x86 *c = &boot_cpu_data;
> u32 maxid;
> - int err = 0, size, i;
> -
> - maxid = c->x86_cache_max_closid;
> -
> - size = maxid * sizeof(struct clos_cbm_table);
> - cctable = kzalloc(size, GFP_KERNEL);
> - if (!cctable) {
> - err = -ENOMEM;
> - goto out_err;
> + int i;
> + int ret;
> +
> + if (unlikely(disable_cat_l3))
This unlikely() is completely pointless. This is not a hotpath function. It
just makes the code harder to read.
> + cat_l3_enabled = false;
> + else if (cat_l3_supported(c))
> + cat_l3_enabled = true;
> + else if (rdt_opts.simulate_cat_l3 &&
> + get_cache_leaf(CACHE_LEVEL3, 0) >= 0)
> + cat_l3_enabled = true;
> + else
> + cat_l3_enabled = false;
Please move that into resource_alloc_enabled() and make it readable.
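Something like this, completely untested:

	static bool __init resource_alloc_enabled(struct cpuinfo_x86 *c)
	{
		if (disable_cat_l3)
			return false;
		if (cat_l3_supported(c))
			return true;
		return rdt_opts.simulate_cat_l3 &&
		       get_cache_leaf(CACHE_LEVEL3, 0) >= 0;
	}

and in intel_rdt_late_init():

	cat_l3_enabled = resource_alloc_enabled(c);
	if (!cat_l3_enabled)
		return -ENODEV;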
> + if (!resource_alloc_enabled())
> + return -ENODEV;
> +
> + if (rdt_opts.simulate_cat_l3) {
> + boot_cpu_data.x86_l3_max_closid = 16;
> + boot_cpu_data.x86_l3_max_cbm_len = 20;
> + }
> + for_each_online_cpu(i) {
> + rdt_cpumask_update(&rdt_l3_cpumask, i, CACHE_LEVEL3);
> }
>
> - size = BITS_TO_LONGS(maxid) * sizeof(long);
> - cconfig.closmap = kzalloc(size, GFP_KERNEL);
> - if (!cconfig.closmap) {
> - kfree(cctable);
> - err = -ENOMEM;
> - goto out_err;
> + maxid = 0;
> + if (cat_l3_enabled) {
> + maxid = boot_cpu_data.x86_l3_max_closid;
> + ret = cat_cache_init(CACHE_LEVEL3, maxid, &l3_cctable);
> + if (ret)
> + cat_l3_enabled = false;
> }
>
> - for_each_online_cpu(i)
> - rdt_cpumask_update(i);
> + if (!cat_l3_enabled)
> + return -ENOSPC;
Huh? How do you get here when cat_l3_enabled is false?
> +
> + ret = shared_domain_init();
> + if (ret)
> + return -ENODEV;
Leaks closmaps
> +
> + ret = cconfig_init(maxid);
> + if (ret)
> + return ret;
Leaks more stuff.
> ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> "AP_INTEL_RDT_ONLINE",
> intel_rdt_online_cpu, intel_rdt_offline_cpu);
I still have not figured out how that whole init scheme works such that you
can use nocalls() for the hotplug registration.
> - if (err < 0)
> - goto out_err;
> + if (ret < 0)
> + return ret;
And this as well.
Thanks,
tglx