Re: [PATCH v2 22/33] x86/intel_rdt.c: Extend RDT to per cache and per resources

From: Thomas Gleixner
Date: Thu Sep 08 2016 - 11:00:46 EST


On Thu, 8 Sep 2016, Fenghua Yu wrote:

> +#define __DCBM_TABLE_INDEX(x) (x << 1)
> +#define __ICBM_TABLE_INDEX(x) ((x << 1) + 1)

This macro mess is completely undocumented.

> +inline int get_dcbm_table_index(int x)

static inline ???

> +{
> + return DCBM_TABLE_INDEX(x);
> +}
> +
> +inline int get_icbm_table_index(int x)
> +{
> + return ICBM_TABLE_INDEX(x);
> +}

Why are you introducing these when they are not used at all?

> /*
> * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
> * as it does not have CPUID enumeration support for Cache allocation.
> @@ -98,41 +123,141 @@ static inline bool cache_alloc_hsw_probe(void)
>
> wrmsr_safe(MSR_IA32_PQR_ASSOC, l, h_old);
>
> - boot_cpu_data.x86_cache_max_closid = 4;
> - boot_cpu_data.x86_cache_max_cbm_len = 20;
> + boot_cpu_data.x86_l3_max_closid = 4;
> + boot_cpu_data.x86_l3_max_cbm_len = 20;

So if you actually change the name of the cpudata struct member, then it
would make sense to split that out into a separate patch. But hell, the
patch order of this stuff is an unholy mess anyway.

> min_bitmask_len = 2;
>
> return true;
> }
>
> +u32 max_cbm_len(int level)
> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return boot_cpu_data.x86_l3_max_cbm_len;
> + default:
> + break;
> + }
> +
> + return (u32)~0;
> +}
> +u64 max_cbm(int level)

More functions without users, and of course without a proper prefix
(intel_rdt_*) for something which is exposed globally. Documentation is not
available either.

> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return (1ULL << boot_cpu_data.x86_l3_max_cbm_len) - 1;

So the above max_cbm_len() returns:

cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
c->x86_l3_max_cbm_len = eax + 1;

i.e. the content of leaf 10:1 EAX plus 1.

According to the SDM:

EAX 4:0: Length of the capacity bit mask for the corresponding ResID.

So first of all: why is there a "+ 1"? Again, that's not documented in the
cpuid initialization code at all.

So now you take that magic number and return:

(2 ^ magic) - 1

Cute. To be honest, I'm too lazy to read through the SDM and figure it out
myself. This kind of stuff belongs in the code as comments. I seriously
doubt that the function names match the actual meaning.

> + default:
> + break;
> + }
> +
> + return (u64)~0;

What kind of return value is this?

> +static inline bool cat_l3_supported(struct cpuinfo_x86 *c)
> +{
> + if (cpu_has(c, X86_FEATURE_CAT_L3))
> + return true;
> +
> + /*
> + * Probe for Haswell server CPUs.
> + */
> + if (c->x86 == 0x6 && c->x86_model == 0x3f)
> + return cache_alloc_hsw_probe();

Ah now we have an actual user for that haswell probe thingy ....

> + return false;
> +}
> +
> void __intel_rdt_sched_in(void *dummy)

I still have not found a reasonable explanation for this dummy argument.

> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> + struct rdtgroup *rdtgrp;
> + int closid;
> + int domain;

Sigh.

> /*
> - * Currently closid is always 0. When user interface is added,
> - * closid will come from user interface.
> + * If this task is assigned to an rdtgroup, use it.
> + * Else use the group assigned to this cpu.
> */
> - if (state->closid == 0)
> + rdtgrp = current->rdtgroup;
> + if (!rdtgrp)
> + rdtgrp = this_cpu_read(cpu_rdtgroup);

This makes actually sense! Thanks for listening!

> +
> + domain = this_cpu_read(cpu_shared_domain);
> + closid = rdtgrp->resource.closid[domain];
> +
> + if (closid == state->closid)
> return;
>
> - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
> - state->closid = 0;
> + state->closid = closid;
> + /* Don't really write PQR register in simulation mode. */
> + if (unlikely(rdt_opts.simulate_cat_l3))
> + return;
> +
> + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
> }
>
> /*
> * When cdp mode is enabled, refcnt is maintained in the dcache_cbm entry.
> */
> -static inline void closid_get(u32 closid)
> +inline void closid_get(u32 closid, int domain)

s/static inline/inline/ What the heck?

Can you please explain what this is doing?

> {
> - struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> -
> lockdep_assert_held(&rdtgroup_mutex);
>
> - cct->clos_refcnt++;
> + if (cat_l3_enabled) {
> + int l3_domain;
> + int dindex;

int l3_domain, dindex;

> + l3_domain = shared_domain[domain].l3_domain;
> + dindex = DCBM_TABLE_INDEX(closid);
> + l3_cctable[l3_domain][dindex].clos_refcnt++;
> + if (cdp_enabled) {
> + int iindex = ICBM_TABLE_INDEX(closid);

And if you call that variable 'index' instead of 'dindex', then you don't
need the extra 'iindex'.
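
IOW, something like this (untested):

	if (cat_l3_enabled) {
		int l3_domain = shared_domain[domain].l3_domain;
		int index = DCBM_TABLE_INDEX(closid);

		l3_cctable[l3_domain][index].clos_refcnt++;
		if (cdp_enabled) {
			index = ICBM_TABLE_INDEX(closid);
			l3_cctable[l3_domain][index].clos_refcnt++;
		}
	}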

> +
> + l3_cctable[l3_domain][iindex].clos_refcnt++;

So now you have a separate refcount for the Icache part, but the comment
above the function still says otherwise.

> + }
> + }
> }
>
> -static int closid_alloc(u32 *closid)
> +int closid_alloc(u32 *closid, int domain)
> {
> u32 maxid;
> u32 id;
> @@ -140,105 +265,215 @@ static int closid_alloc(u32 *closid)
> lockdep_assert_held(&rdtgroup_mutex);
>
> maxid = cconfig.max_closid;
> - id = find_first_zero_bit(cconfig.closmap, maxid);
> + id = find_first_zero_bit((unsigned long *)cconfig.closmap[domain],

Why do you need a typecast here? Get your damned structs straight.
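
If cconfig.closmap[domain] were a plain 'unsigned long *' (i.e. closmap
declared as 'unsigned long **'), then

	id = find_first_zero_bit(cconfig.closmap[domain], maxid);

would work without any cast.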

> + maxid);
> +
> if (id == maxid)
> return -ENOSPC;
>
> - set_bit(id, cconfig.closmap);
> - closid_get(id);
> + set_bit(id, (unsigned long *)cconfig.closmap[domain]);
> + closid_get(id, domain);
> *closid = id;
> - cconfig.closids_used++;
>
> return 0;
> }
>
> -static inline void closid_free(u32 closid)
> +unsigned int get_domain_num(int level)
> {
> - clear_bit(closid, cconfig.closmap);
> - cctable[DCBM_TABLE_INDEX(closid)].cbm = 0;
> -
> - if (WARN_ON(!cconfig.closids_used))
> - return;
> + if (level == CACHE_LEVEL3)
> + return cpumask_weight(&rdt_l3_cpumask);

get_domain_num(level) suggests to me that it returns the domain number
corresponding to the level, but it actually returns the number of bits set
in the rdt_l3_cpumask. Very intuitive - NOT!

> + else
> + return -EINVAL;

Proper return value for a function returning 'unsigned int' ....
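
If the function really needs to signal an error, then let it return 'int',
e.g. (untested):

	int get_domain_num(int level)
	{
		switch (level) {
		case CACHE_LEVEL3:
			return cpumask_weight(&rdt_l3_cpumask);
		default:
			return -EINVAL;
		}
	}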

> +}
>
> - cconfig.closids_used--;
> +int level_to_leaf(int level)
> +{
> + switch (level) {
> + case CACHE_LEVEL3:
> + return 3;
> + default:
> + return -EINVAL;
> + }
> }
>
> -static void closid_put(u32 closid)
> +void closid_free(u32 closid, int domain, int level)
> {
> - struct clos_cbm_table *cct = &cctable[DCBM_TABLE_INDEX(closid)];
> + struct clos_cbm_table **cctable;
> + int leaf;
> + struct cpumask *mask;
> + int cpu;
> +
> + if (level == CACHE_LEVEL3)
> + cctable = l3_cctable;

Oh well. Why does this assignment need to happen here?

> +
> + clear_bit(closid, (unsigned long *)cconfig.closmap[domain]);
> +
> + if (level == CACHE_LEVEL3) {

And not here where it actually makes sense?

> + cctable[domain][closid].cbm = max_cbm(level);
> + leaf = level_to_leaf(level);
> + mask = &cache_domains[leaf].shared_cpu_map[domain];
> + cpu = cpumask_first(mask);
> + smp_call_function_single(cpu, cbm_update_l3_msr, &closid, 1);

A comment explaining that this must be done on one of the CPUs in @domain
would be helpful.
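
Something along the lines of:

	/*
	 * The CBM MSRs are per L3 cache domain, so this must run on a
	 * CPU which belongs to @domain.
	 */

right above the smp_call_function_single() would do.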

> + }
> +}
>
> +static void _closid_put(u32 closid, struct clos_cbm_table *cct,

Please use two underscores ('__closid_put') so it's obvious that this is an
internal helper.

> + int domain, int level)
> +{
> lockdep_assert_held(&rdtgroup_mutex);
> if (WARN_ON(!cct->clos_refcnt))
> return;
>
> if (!--cct->clos_refcnt)
> - closid_free(closid);
> + closid_free(closid, domain, level);
> }
>
> -static void msr_cpu_update(void *arg)
> +void closid_put(u32 closid, int domain)
> +{
> + struct clos_cbm_table *cct;
> +
> + if (cat_l3_enabled) {
> + int l3_domain = shared_domain[domain].l3_domain;
> +
> + cct = &l3_cctable[l3_domain][DCBM_TABLE_INDEX(closid)];
> + _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> + if (cdp_enabled) {
> + cct = &l3_cctable[l3_domain][ICBM_TABLE_INDEX(closid)];
> + _closid_put(closid, cct, l3_domain, CACHE_LEVEL3);
> + }
> + }
> +}
> +
> +void msr_cpu_update(void *arg)
> {
> struct rdt_remote_data *info = arg;
>
> + if (unlikely(rdt_opts.verbose))
> + pr_info("Write %lx to msr %x on cpu%d\n",
> + (unsigned long)info->val, info->msr,
> + smp_processor_id());

That's what DYNAMIC_DEBUG is for.
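
i.e.:

	pr_debug("Write %lx to msr %x on cpu%d\n",
		 (unsigned long)info->val, info->msr, smp_processor_id());

and the whole rdt_opts.verbose machinery can go away.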

> +
> + if (unlikely(rdt_opts.simulate_cat_l3))
> + return;

Do we really need all this simulation stuff?

> +
> wrmsrl(info->msr, info->val);
> }
>
> +static struct cpumask *rdt_cache_cpumask(int level)
> +{
> + return &rdt_l3_cpumask;

That's a really useful helper .... Your choice of checking 'level' five
times in a row in various helpers versus returning l3 unconditionally is at
least interesting.

> +int get_cache_leaf(int level, int cpu)
> +{
> + int index;
> + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);

this_cpu_ci is a complete misnomer as it suggests that it's actually the
cacheinfo for 'this cpu', i.e. the cpu on which the code is executing.

> + struct cacheinfo *this_leaf;
> + int num_leaves = this_cpu_ci->num_leaves;
> +
> + for (index = 0; index < num_leaves; index++) {
> + this_leaf = this_cpu_ci->info_list + index;
> + if (this_leaf->level == level)
> + return index;

The function name is a misnomer as well. It does not return the cache leaf,
it returns the leaf index .....

> + }

Why do you have a cacheinfo related function in this RDT code?

> +
> + return -EINVAL;
> +}
> +
> +static struct cpumask *get_shared_cpu_map(int cpu, int level)
> +{
> + int index;
> + struct cacheinfo *leaf;
> + struct cpu_cacheinfo *cpu_ci = get_cpu_cacheinfo(cpu);
> +
> + index = get_cache_leaf(level, cpu);
> + if (index < 0)
> + return 0;
> +
> + leaf = cpu_ci->info_list + index;

While here you actually get the leaf.

> +
> + return &leaf->shared_cpu_map;
> }

> static int clear_rdtgroup_cpumask(unsigned int cpu)
> @@ -293,63 +531,270 @@ static int clear_rdtgroup_cpumask(unsigned int cpu)
>
> static int intel_rdt_offline_cpu(unsigned int cpu)
> {
> - int i;
> + struct cpumask *shared_cpu_map;
> + int new_cpu;
> + int l3_domain;
> + int level;
> + int leaf;

Sigh. 1/3 of the line space is wasted for single variable declarations.

> mutex_lock(&rdtgroup_mutex);
> - if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
> - mutex_unlock(&rdtgroup_mutex);
> - return;
> - }
>
> - cpumask_and(&tmp_cpumask, topology_core_cpumask(cpu), cpu_online_mask);
> - cpumask_clear_cpu(cpu, &tmp_cpumask);
> - i = cpumask_any(&tmp_cpumask);
> + level = CACHE_LEVEL3;

I have a hard time understanding the value of that 'level' variable, but
well, that's the least of my worries with this code.

> +
> + l3_domain = per_cpu(cpu_l3_domain, cpu);
> + leaf = level_to_leaf(level);
> + shared_cpu_map = &cache_domains[leaf].shared_cpu_map[l3_domain];
>
> - if (i < nr_cpu_ids)
> - cpumask_set_cpu(i, &rdt_cpumask);
> + cpumask_clear_cpu(cpu, &rdt_l3_cpumask);
> + cpumask_clear_cpu(cpu, shared_cpu_map);
> + if (cpumask_empty(shared_cpu_map))
> + goto out;

So what clears @cpu in the rdtgroup cpumask?

> +
> + new_cpu = cpumask_first(shared_cpu_map);
> + rdt_cpumask_update(&rdt_l3_cpumask, new_cpu, level);
>
> clear_rdtgroup_cpumask(cpu);

Can you please use a consistent prefix and naming scheme?

rdt_cpumask_update()
clear_rdtgroup_cpumask()

WTF?

> +out:
> mutex_unlock(&rdtgroup_mutex);
> + return 0;
> +}
> +
> +/*
> + * Initialize per-cpu cpu_l3_domain.
> + *
> + * cpu_l3_domain numbers are consequtive integer starting from 0.
> + * Sets up 1:1 mapping of cpu id and cpu_l3_domain.
> + */
> +static int __init cpu_cache_domain_init(int level)
> +{
> + int i, j;
> + int max_cpu_cache_domain = 0;
> + int index;
> + struct cacheinfo *leaf;
> + int *domain;
> + struct cpu_cacheinfo *cpu_ci;

Eyes hurt.

> +
> + for_each_online_cpu(i) {
> + domain = &per_cpu(cpu_l3_domain, i);

per_cpu_ptr() exists for a reason.
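
i.e.:

	domain = per_cpu_ptr(&cpu_l3_domain, i);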

> + if (*domain == -1) {
> + index = get_cache_leaf(level, i);
> + if (index < 0)
> + return -EINVAL;
> +
> + cpu_ci = get_cpu_cacheinfo(i);
> + leaf = cpu_ci->info_list + index;
> + if (cpumask_empty(&leaf->shared_cpu_map)) {
> + WARN(1, "no shared cpu for L2\n");

So 'L2' is always the right string for every value of @level? And what the
heck is the value of that WARN? Nothing, because the call chain is already
known. What's worse is that it tells us nothing about @level, @index or @i
(which should be named @cpu).

> + return -EINVAL;
> + }
> +
> + for_each_cpu(j, &leaf->shared_cpu_map) {

So again independent of @level you fiddle with cpu_l3_domain. Interesting.

> + domain = &per_cpu(cpu_l3_domain, j);
> + *domain = max_cpu_cache_domain;
> + }
> + max_cpu_cache_domain++;

What's the actual meaning of max_cpu_cache_domain?

> + }
> + }
> +
> + return 0;
> +}
> +
> +static int __init rdt_setup(char *str)
> +{
> + char *tok;
> +
> + while ((tok = strsep(&str, ",")) != NULL) {
> + if (!*tok)
> + return -EINVAL;
> +
> + if (strcmp(tok, "simulate_cat_l3") == 0) {
> + pr_info("Simulate CAT L3\n");
> + rdt_opts.simulate_cat_l3 = true;

So this goes into rdt_opts

> + } else if (strcmp(tok, "disable_cat_l3") == 0) {
> + pr_info("CAT L3 is disabled\n");
> + disable_cat_l3 = true;

While this is a distinct control variable. Very consistent.

> + } else {
> + pr_info("Invalid rdt option\n");

Very helpful w/o printing the actual option ....

> + return -EINVAL;
> + }
> + }
> +
> + return 0;
> +}
> +__setup("resctrl=", rdt_setup);
> +
> +static inline bool resource_alloc_enabled(void)
> +{
> + return cat_l3_enabled;
> +}

Oh well.

> +
> +static int shared_domain_init(void)
> +{
> + int l3_domain_num = get_domain_num(CACHE_LEVEL3);
> + int size;
> + int domain;
> + struct cpumask *cpumask;
> + struct cpumask *shared_cpu_map;
> + int cpu;

More random variable declarations.

> + if (cat_l3_enabled) {
> + shared_domain_num = l3_domain_num;
> + cpumask = &rdt_l3_cpumask;
> + } else
> + return -EINVAL;

Missing curly braces.
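
If one branch of a conditional has braces, both get them:

	if (cat_l3_enabled) {
		shared_domain_num = l3_domain_num;
		cpumask = &rdt_l3_cpumask;
	} else {
		return -EINVAL;
	}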

> +
> + size = shared_domain_num * sizeof(struct shared_domain);
> + shared_domain = kzalloc(size, GFP_KERNEL);
> + if (!shared_domain)
> + return -EINVAL;
> +
> + domain = 0;
> + for_each_cpu(cpu, cpumask) {
> + if (cat_l3_enabled)
> + shared_domain[domain].l3_domain =
> + per_cpu(cpu_l3_domain, cpu);
> + else
> + shared_domain[domain].l3_domain = -1;
> +
> + shared_cpu_map = get_shared_cpu_map(cpu, CACHE_LEVEL3);
> +
> + cpumask_copy(&shared_domain[domain].cpumask, shared_cpu_map);

What's the point of updating the cpumask when CAT L3 is disabled? If there
is a reason, then it should be documented in a comment.

> + domain++;
> + }
> + for_each_online_cpu(cpu) {
> + if (cat_l3_enabled)
> + per_cpu(cpu_shared_domain, cpu) =
> + per_cpu(cpu_l3_domain, cpu);

More missing curly braces. And using an intermediate variable would get rid
of these hard-to-read line breaks.
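
E.g. (untested):

	for_each_online_cpu(cpu) {
		int l3_domain = per_cpu(cpu_l3_domain, cpu);

		if (cat_l3_enabled)
			per_cpu(cpu_shared_domain, cpu) = l3_domain;
	}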

> + }
> +
> + return 0;
> +}
> +
> +static int cconfig_init(int maxid)
> +{
> + int num;
> + int domain;
> + unsigned long *closmap_block;
> + int maxid_size;
> +
> + maxid_size = BITS_TO_LONGS(maxid);
> + num = maxid_size * shared_domain_num;
> + cconfig.closmap = kcalloc(maxid, sizeof(unsigned long *), GFP_KERNEL);

Really intuitive. You calculate 'num' right before allocating the closmap
pointers, and then only use it in the next allocation.

> + if (!cconfig.closmap)
> + goto out_free;
> +
> + closmap_block = kcalloc(num, sizeof(unsigned long), GFP_KERNEL);
> + if (!closmap_block)
> + goto out_free;
> +
> + for (domain = 0; domain < shared_domain_num; domain++)
> + cconfig.closmap[domain] = (unsigned long *)closmap_block +

More random type casting.

> + domain * maxid_size;

Why don't you allocate that whole mess in one go?

	unsigned int ptrsize, mapsize, size, d;
	void *p;

	ptrsize = maxid * sizeof(unsigned long *);
	mapsize = BITS_TO_LONGS(maxid) * sizeof(unsigned long);
	size = ptrsize + num_shared_domains * mapsize;

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	cconfig.closmap = p;
	cconfig.max_closid = maxid;

	p += ptrsize;
	for (d = 0; d < num_shared_domains; d++, p += mapsize)
		cconfig.closmap[d] = p;
	return 0;

Would be too simple. Once more.

> +
> + cconfig.max_closid = maxid;
> +
> + return 0;
> +out_free:
> + kfree(cconfig.closmap);
> + kfree(closmap_block);
> + return -ENOMEM;
> +}
> +
> +static int __init cat_cache_init(int level, int maxid,
> + struct clos_cbm_table ***cctable)
> +{
> + int domain_num;
> + int domain;
> + int size;
> + int ret = 0;
> + struct clos_cbm_table *p;
> +
> + domain_num = get_domain_num(level);
> + size = domain_num * sizeof(struct clos_cbm_table *);
> + *cctable = kzalloc(size, GFP_KERNEL);
> + if (!*cctable) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + size = maxid * domain_num * sizeof(struct clos_cbm_table);
> + p = kzalloc(size, GFP_KERNEL);
> + if (!p) {
> + kfree(*cctable);
> + ret = -ENOMEM;
> + goto out;
> + }
> + for (domain = 0; domain < domain_num; domain++)
> + (*cctable)[domain] = p + domain * maxid;

Same crap.

> +
> + ret = cpu_cache_domain_init(level);
> + if (ret) {
> + kfree(*cctable);
> + kfree(p);
> + }
> +out:
> + return ret;
> }
>
> static int __init intel_rdt_late_init(void)
> {
> struct cpuinfo_x86 *c = &boot_cpu_data;
> u32 maxid;
> - int err = 0, size, i;
> -
> - maxid = c->x86_cache_max_closid;
> -
> - size = maxid * sizeof(struct clos_cbm_table);
> - cctable = kzalloc(size, GFP_KERNEL);
> - if (!cctable) {
> - err = -ENOMEM;
> - goto out_err;
> + int i;
> + int ret;
> +
> + if (unlikely(disable_cat_l3))

This unlikely() is completely pointless. This is not a hotpath function. It
just makes the code harder to read.

> + cat_l3_enabled = false;
> + else if (cat_l3_supported(c))
> + cat_l3_enabled = true;
> + else if (rdt_opts.simulate_cat_l3 &&
> + get_cache_leaf(CACHE_LEVEL3, 0) >= 0)
> + cat_l3_enabled = true;
> + else
> + cat_l3_enabled = false;

Please move that into resource_alloc_enabled() and make it readable.
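
Something like this, untested, and the function name is just an example:

	static bool cat_l3_enable(struct cpuinfo_x86 *c)
	{
		if (disable_cat_l3)
			return false;
		if (cat_l3_supported(c))
			return true;
		return rdt_opts.simulate_cat_l3 &&
		       get_cache_leaf(CACHE_LEVEL3, 0) >= 0;
	}

and in intel_rdt_late_init():

	cat_l3_enabled = cat_l3_enable(c);
	if (!cat_l3_enabled)
		return -ENODEV;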

> + if (!resource_alloc_enabled())
> + return -ENODEV;
> +
> + if (rdt_opts.simulate_cat_l3) {
> + boot_cpu_data.x86_l3_max_closid = 16;
> + boot_cpu_data.x86_l3_max_cbm_len = 20;
> + }
> + for_each_online_cpu(i) {
> + rdt_cpumask_update(&rdt_l3_cpumask, i, CACHE_LEVEL3);
> }
>
> - size = BITS_TO_LONGS(maxid) * sizeof(long);
> - cconfig.closmap = kzalloc(size, GFP_KERNEL);
> - if (!cconfig.closmap) {
> - kfree(cctable);
> - err = -ENOMEM;
> - goto out_err;
> + maxid = 0;
> + if (cat_l3_enabled) {
> + maxid = boot_cpu_data.x86_l3_max_closid;
> + ret = cat_cache_init(CACHE_LEVEL3, maxid, &l3_cctable);
> + if (ret)
> + cat_l3_enabled = false;
> }
>
> - for_each_online_cpu(i)
> - rdt_cpumask_update(i);
> + if (!cat_l3_enabled)
> + return -ENOSPC;

Huh? How do you get here when cat_l3_enabled is false?

> +
> + ret = shared_domain_init();
> + if (ret)
> + return -ENODEV;

Leaks closmaps

> +
> + ret = cconfig_init(maxid);
> + if (ret)
> + return ret;

Leaks more stuff.

> ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> "AP_INTEL_RDT_ONLINE",
> intel_rdt_online_cpu, intel_rdt_offline_cpu);

I still have not figured out how this whole init scheme works such that
using nocalls() for the hotplug registration is correct.

> - if (err < 0)
> - goto out_err;
> + if (ret < 0)
> + return ret;

And this as well.

Thanks,

tglx