Re: [PATCH v2 24/33] x86/intel_rdt_rdtgroup.c: Create info directory

From: Thomas Gleixner
Date: Thu Sep 08 2016 - 12:07:26 EST


On Thu, 8 Sep 2016, Fenghua Yu wrote:
> +/*
> + * kernfs_root - find out the kernfs_root a kernfs_node belongs to
> + * @kn: kernfs_node of interest
> + *
> + * Return the kernfs_root @kn belongs to.
> + */
> +static inline struct kernfs_root *get_kernfs_root(struct kernfs_node *kn)
> +{
> + if (kn->parent)
> + kn = kn->parent;

So this is guaranteed to have a single nesting?

> + return kn->dir.root;
> +}
> +
> +/*
> + * rdtgroup_file_mode - deduce file mode of a control file
> + * @cft: the control file in question
> + *
> + * S_IRUGO for read, S_IWUSR for write.
> + */
> +static umode_t rdtgroup_file_mode(const struct rftype *rft)
> +{
> + umode_t mode = 0;
> +
> + if (rft->read_u64 || rft->read_s64 || rft->seq_show)
> + mode |= S_IRUGO;
> +
> + if (rft->write_u64 || rft->write_s64 || rft->write)
> + mode |= S_IWUSR;

Why don't you store the mode in rtftype instead of evaluating it at
runtime?

Aside of that [read|write]_[s|u]64 are nowhere used in this whole patch
series, but take plenty of storage and line space for nothing.

> +static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
> + const struct rftype *end)
> +{
> + struct rftype *rft;
> + int ret;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + for (rft = rfts; rft != end; rft++) {
> + ret = rdtgroup_add_file(kn, rft);
> + if (ret) {
> + pr_warn("%s: failed to add %s, err=%d\n",
> + __func__, rft->name, ret);
> + rdtgroup_rm_files(kn, rft, end);

So we remove the file which failed to be added along with those which we
have not been added yet.

rdtgroup_rm_files(kn, rfts, rft);

Might be more correct, but I might be wrong as usual.

> +/*
> + * Get resource type from name in kernfs_node. This can be extended to
> + * multi-resources (e.g. L2). Right now simply return RESOURCE_L3 because
> + * we only have L3 support.

That's crap. If you know that you have seperate types then spend the time
to implement the storage instead of documenting your lazy/sloppyness.

> + */
> +static enum resource_type get_kn_res_type(struct kernfs_node *kn)
> +{
> + return RESOURCE_L3;
> +}
> +
> +static int rdt_max_closid_show(struct seq_file *seq, void *v)
> +{
> + struct kernfs_open_file *of = seq->private;
> +
> + switch (get_kn_res_type(of->kn)) {
> + case RESOURCE_L3:
> + seq_printf(seq, "%d\n",
> + boot_cpu_data.x86_l3_max_closid);

x86_l3_max_closid is u16 ..... %u ?

And that line break is required because the line is

seq_printf(seq, "%d\n", boot_cpu_data.x86_l3_max_closid);

exactly 73 characters long ....

> + break;
> + default:
> + break;
> + }
> +
> + return 0;
> +}
> +
> +static int rdt_max_cbm_len_show(struct seq_file *seq, void *v)
> +{
> + struct kernfs_open_file *of = seq->private;
> +
> + switch (get_kn_res_type(of->kn)) {
> + case RESOURCE_L3:
> + seq_printf(seq, "%d\n",
> + boot_cpu_data.x86_l3_max_cbm_len);

Ditto

> + break;
> + default:
> + break;
> + }
> +
> + return 0;
> +}

> +static void rdt_info_show_cat(struct seq_file *seq, int level)
> +{
> + int domain;
> + int domain_num = get_domain_num(level);
> + int closid;
> + u64 cbm;
> + struct clos_cbm_table **cctable;
> + int maxid;
> + int shared_domain;
> + int cnt;

Soon you occupy half of the screen.

> + if (level == CACHE_LEVEL3)
> + cctable = l3_cctable;
> + else
> + return;
> +
> + maxid = cconfig.max_closid;
> + for (domain = 0; domain < domain_num; domain++) {
> + seq_printf(seq, "domain %d:\n", domain);
> + shared_domain = get_shared_domain(domain, level);
> + for (closid = 0; closid < maxid; closid++) {
> + int dindex, iindex;
> +
> + if (test_bit(closid,
> + (unsigned long *)cconfig.closmap[shared_domain])) {
> + dindex = get_dcbm_table_index(closid);
> + cbm = cctable[domain][dindex].cbm;
> + cnt = cctable[domain][dindex].clos_refcnt;
> + seq_printf(seq, "cbm[%d]=%lx, refcnt=%d\n",
> + dindex, (unsigned long)cbm, cnt);
> + if (cdp_enabled) {
> + iindex = get_icbm_table_index(closid);
> + cbm = cctable[domain][iindex].cbm;
> + cnt =
> + cctable[domain][iindex].clos_refcnt;
> + seq_printf(seq,
> + "cbm[%d]=%lx, refcnt=%d\n",
> + iindex, (unsigned long)cbm,
> + cnt);
> + }
> + } else {
> + cbm = max_cbm(level);
> + cnt = 0;
> + dindex = get_dcbm_table_index(closid);
> + seq_printf(seq, "cbm[%d]=%lx, refcnt=%d\n",
> + dindex, (unsigned long)cbm, cnt);
> + if (cdp_enabled) {
> + iindex = get_icbm_table_index(closid);
> + seq_printf(seq,
> + "cbm[%d]=%lx, refcnt=%d\n",
> + iindex, (unsigned long)cbm,
> + cnt);
> + }

This is completely unreadable. Split it into static functions ....

> + }
> + }
> + }
> +}
> +
> +static void show_shared_domain(struct seq_file *seq)
> +{
> + int domain;
> +
> + seq_puts(seq, "Shared domains:\n");
> +
> + for_each_cache_domain(domain, 0, shared_domain_num) {
> + struct shared_domain *sd;
> +
> + sd = &shared_domain[domain];
> + seq_printf(seq, "domain[%d]:", domain);
> + if (cat_enabled(CACHE_LEVEL3))
> + seq_printf(seq, "l3_domain=%d ", sd->l3_domain);
> + seq_printf(seq, "cpumask=%*pb\n",
> + cpumask_pr_args(&sd->cpumask));

What's the value of printing a cpu mask for something which is not enabled?

> + }
> +}
> +
> +static int rdt_info_show(struct seq_file *seq, void *v)
> +{
> + show_shared_domain(seq);
> +
> + if (cat_l3_enabled) {
> + if (rdt_opts.verbose)

Concatenate the conditionals into a single line please.

> + rdt_info_show_cat(seq, CACHE_LEVEL3);
> + }
> +
> + seq_puts(seq, "\n");
> +
> + return 0;
> +}
> +
> +static int res_type_to_level(enum resource_type res_type, int *level)
> +{
> + int ret = 0;
> +
> + switch (res_type) {
> + case RESOURCE_L3:
> + *level = CACHE_LEVEL3;
> + break;
> + case RESOURCE_NUM:
> + ret = -EINVAL;
> + break;
> + }
> +
> + return ret;

Groan. What's wrong with

static int res_type_to_level(type)
{
switch (type) {
case RESOURCE_L3: return CACHE_LEVEL3;
case RESOURCE_NUM: return -EINVAL;
}
}

and at the callsite you do:


> +}
> +
> +static int domain_to_cache_id_show(struct seq_file *seq, void *v)
> +{
> + struct kernfs_open_file *of = seq->private;
> + enum resource_type res_type;
> + int domain;
> + int leaf;
> + int level = 0;
> + int ret;
> +
> + res_type = (enum resource_type)of->kn->parent->priv;
> +
> + ret = res_type_to_level(res_type, &level);
> + if (ret)
> + return 0;

level = res_type_to_level(res_type);
if (level < 0)
return 0;

That gets rid of the initialization of level as well and becomes readable
source code. Hmm?

> +
> + leaf = get_cache_leaf(level, 0);

leafidx = cache_get_leaf_index(...);

I trip over this over and over and I can't get used to this misnomer.

> +
> + for (domain = 0; domain < get_domain_num(level); domain++) {
> + unsigned int cid;
> +
> + cid = cache_domains[leaf].shared_cache_id[domain];
> + seq_printf(seq, "%d:%d\n", domain, cid);

Proper print qualifiers are overrated....

> +static int info_populate_dir(struct kernfs_node *kn)
> +{
> + struct rftype *rfts;
> +
> + rfts = info_files;

struct rftype *rfts = info_files;

> + return rdtgroup_add_files(kn, rfts, rfts + ARRAY_SIZE(info_files));
> +}

> +static int rdtgroup_partition_populate_dir(struct kernfs_node *kn)

Has no user.

> +LIST_HEAD(rdtgroup_lists);

I told you before that globals or module static variable don't get defined
in the middle of the code and not being sticked to a function definition
w/o a space.

> +static void init_rdtgroup_root(struct rdtgroup_root *root)
> +{
> + struct rdtgroup *rdtgrp = &root->rdtgrp;
> +
> + INIT_LIST_HEAD(&rdtgrp->rdtgroup_list);
> + list_add_tail(&rdtgrp->rdtgroup_list, &rdtgroup_lists);
> + atomic_set(&root->nr_rdtgrps, 1);
> + rdtgrp->root = root;

Yuck.

grp = root->grp;
init(grp);
root->nr_grps = 1;
grp->root = root;

Confused.

> +}
> +
> +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops;
> +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
> +{
> + struct rdtgroup *rdtgrp;
> +
> + if (kernfs_type(kn) == KERNFS_DIR)
> + rdtgrp = kn->priv;
> + else
> + rdtgrp = kn->parent->priv;

So this again assumes that there is a single level of directories....

> + kernfs_break_active_protection(kn);
> +
> + mutex_lock(&rdtgroup_mutex);
> + /* Unlock if rdtgrp is dead. */
> + if (!rdtgrp)
> + rdtgroup_kn_unlock(kn);
> +
> + return rdtgrp;
> +}
> +
> +void rdtgroup_kn_unlock(struct kernfs_node *kn)
> +{
> + mutex_unlock(&rdtgroup_mutex);
> +
> + kernfs_unbreak_active_protection(kn);
> +}
> +
> +static char *res_info_dir_name(enum resource_type res_type, char *name)
> +{
> + switch (res_type) {
> + case RESOURCE_L3:
> + strncpy(name, "l3", RDTGROUP_FILE_NAME_LEN);
> + break;
> + default:
> + break;
> + }
> +
> + return name;

What's the purpose of this return value if its ignored at the call site?

> +}
> +
> +static int create_res_info(enum resource_type res_type,
> + struct kernfs_node *parent_kn)
> +{
> + struct kernfs_node *kn;
> + char name[RDTGROUP_FILE_NAME_LEN];
> + int ret;
> +
> + res_info_dir_name(res_type, name);

So name contains random crap if res_type is not handled in res_info_dir_name().

> + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, NULL);
> + if (IS_ERR(kn)) {
> + ret = PTR_ERR(kn);
> + goto out;
> + }
> +
> + /*
> + * This extra ref will be put in kernfs_remove() and guarantees
> + * that @rdtgrp->kn is always accessible.
> + */
> + kernfs_get(kn);
> +
> + ret = rdtgroup_kn_set_ugid(kn);
> + if (ret)
> + goto out_destroy;
> +
> + ret = res_info_populate_dir(kn);
> + if (ret)
> + goto out_destroy;
> +
> + kernfs_activate(kn);
> +
> + ret = 0;
> + goto out;

Hell no.

> +
> +out_destroy:
> + kernfs_remove(kn);
> +out:
> + return ret;
> +
> +}
> +
> +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn,
> + const char *name)
> +{
> + struct kernfs_node *kn;
> + int ret;
> +
> + if (parent_kn != root_rdtgrp->kn)
> + return -EPERM;
> +
> + /* create the directory */
> + kn = kernfs_create_dir(parent_kn, "info", parent_kn->mode, root_rdtgrp);
> + if (IS_ERR(kn)) {
> + ret = PTR_ERR(kn);
> + goto out;
> + }
> +
> + ret = info_populate_dir(kn);
> + if (ret)
> + goto out_destroy;
> +
> + if (cat_enabled(CACHE_LEVEL3))
> + create_res_info(RESOURCE_L3, kn);
> +
> + /*
> + * This extra ref will be put in kernfs_remove() and guarantees
> + * that @rdtgrp->kn is always accessible.
> + */
> + kernfs_get(kn);
> +
> + ret = rdtgroup_kn_set_ugid(kn);
> + if (ret)
> + goto out_destroy;
> +
> + kernfs_activate(kn);
> +
> + ret = 0;
> + goto out;

Copy and paste .... sucks.

> +out_destroy:
> + kernfs_remove(kn);
> +out:
> + return ret;
> +}
> +
> +static int rdtgroup_setup_root(struct rdtgroup_root *root,
> + unsigned long ss_mask)
> +{
> + int ret;
> +
> + root_rdtgrp = &root->rdtgrp;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + root->kf_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
> + KERNFS_ROOT_CREATE_DEACTIVATED,
> + root_rdtgrp);
> + if (IS_ERR(root->kf_root)) {
> + ret = PTR_ERR(root->kf_root);
> + goto out;
> + }
> + root_rdtgrp->kn = root->kf_root->kn;
> +
> + ret = rdtgroup_populate_dir(root->kf_root->kn);
> + if (ret)
> + goto destroy_root;
> +
> + rdtgroup_create_info_dir(root->kf_root->kn, "info_dir");
> +
> + /*
> + * Link the root rdtgroup in this hierarchy into all the css_set

css_set objects ? Again: Copy and paste sucks, when done without brain
involvement.

> + * objects.
> + */
> + WARN_ON(atomic_read(&root->nr_rdtgrps) != 1);
> +
> + kernfs_activate(root_rdtgrp->kn);
> + ret = 0;
> + goto out;
> +
> +destroy_root:
> + kernfs_destroy_root(root->kf_root);
> + root->kf_root = NULL;
> +out:
> + return ret;
> +}

> +static int get_shared_cache_id(int cpu, int level)
> +{
> + struct cpuinfo_x86 *c;
> + int index_msb;
> + struct cpu_cacheinfo *this_cpu_ci;
> + struct cacheinfo *this_leaf;
> +
> + this_cpu_ci = get_cpu_cacheinfo(cpu);

Once more. this_cpu_ci is actively misleading.

> +
> + this_leaf = this_cpu_ci->info_list + level_to_leaf(level);
> + return this_leaf->id;
> + return c->apicid >> index_msb;
> +}

> +static void init_cache_domain(int cpu, int leaf)
> +{
> + struct cpu_cacheinfo *this_cpu_ci;
> + struct cpumask *mask;
> + unsigned int level;
> + struct cacheinfo *this_leaf;
> + int domain;
> +
> + this_cpu_ci = get_cpu_cacheinfo(cpu);
> + this_leaf = this_cpu_ci->info_list + leaf;
> + cache_domains[leaf].level = this_leaf->level;
> + mask = &this_leaf->shared_cpu_map;
> + for (domain = 0; domain < MAX_CACHE_DOMAINS; domain++) {
> + if (cpumask_test_cpu(cpu,
> + &cache_domains[leaf].shared_cpu_map[domain]))
> + return;
> + }
> + if (domain == MAX_CACHE_DOMAINS) {
> + domain = cache_domains[leaf].max_cache_domains_num++;
> +
> + cache_domains[leaf].shared_cpu_map[domain] = *mask;
> +
> + level = cache_domains[leaf].level;
> + cache_domains[leaf].shared_cache_id[domain] =
> + get_shared_cache_id(cpu, level);

I've seen similar code in the other file. Why do we need two incarnations
of that? Can't we have a shared cache domain information storage where all
info is kept for both the control and the user space interface?

> + }
> +}
> +
> +static __init void init_cache_domains(void)
> +{
> + int cpu;
> + int leaf;
> +
> + for (leaf = 0; leaf < get_cpu_cacheinfo(0)->num_leaves; leaf++) {
> + for_each_online_cpu(cpu)
> + init_cache_domain(cpu, leaf);

What updates this stuff on hotplug?

> + }
> +}
> +
> +void rdtgroup_exit(struct task_struct *tsk)
> +{
> +
> + if (!list_empty(&tsk->rg_list)) {

I told you last time that rg_list is a misnomer ....

> + struct rdtgroup *rdtgrp = tsk->rdtgroup;
> +
> + list_del_init(&tsk->rg_list);
> + tsk->rdtgroup = NULL;
> + atomic_dec(&rdtgrp->refcount);

And there is still no sign of documentation on how that list is used and
protected.

> + }
> +}
> +
> +static void rdtgroup_destroy_locked(struct rdtgroup *rdtgrp)
> + __releases(&rdtgroup_mutex) __acquires(&rdtgroup_mutex)

Where?

> +{
> + int shared_domain;
> + int closid;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + /* free closid occupied by this rdtgroup. */
> + for_each_cache_domain(shared_domain, 0, shared_domain_num) {
> + closid = rdtgrp->resource.closid[shared_domain];
> + closid_put(closid, shared_domain);
> + }
> +
> + list_del_init(&rdtgrp->rdtgroup_list);
> +
> + /*
> + * Remove @rdtgrp directory along with the base files. @rdtgrp has an
> + * extra ref on its kn.
> + */
> + kernfs_remove(rdtgrp->kn);
> +}
> +
> +static int
> +rdtgroup_move_task_all(struct rdtgroup *src_rdtgrp, struct rdtgroup *dst_rdtgrp)
> +{
> + struct list_head *tasks;
> +
> + tasks = &src_rdtgrp->pset.tasks;
> + while (!list_empty(tasks)) {

list_for_each_entry_safe() ???

> + struct task_struct *tsk;
> + struct list_head *pos;
> + pid_t pid;
> + int ret;
> +
> + pos = tasks->next;
> + tsk = list_entry(pos, struct task_struct, rg_list);
> + pid = tsk->pid;
> + ret = rdtgroup_move_task(pid, dst_rdtgrp, false, NULL);
> + if (ret)
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * Forcibly remove all of subdirectories under root.
> + */
> +static void rmdir_all_sub(void)
> +{
> + struct rdtgroup *rdtgrp;
> + int cpu;
> + struct list_head *l;
> + struct task_struct *p;
> +
> + /* Move all tasks from sub rdtgroups to default */
> + rcu_read_lock();
> + for_each_process(p) {
> + if (p->rdtgroup)
> + p->rdtgroup = NULL;
> + }
> + rcu_read_unlock();

And how is that protected against concurrent forks?

> +
> + while (!list_is_last(&root_rdtgrp->rdtgroup_list, &rdtgroup_lists)) {
> + l = rdtgroup_lists.next;
> + if (l == &root_rdtgrp->rdtgroup_list)
> + l = l->next;
> +
> + rdtgrp = list_entry(l, struct rdtgroup, rdtgroup_list);
> + if (rdtgrp == root_rdtgrp)
> + continue;
> +
> + for_each_cpu(cpu, &rdtgrp->cpu_mask)
> + per_cpu(cpu_rdtgroup, cpu) = root_rdtgrp;
> +
> + rdtgroup_destroy_locked(rdtgrp);
> + }
> +}

> +static void *rdtgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
> +{
> + return seq_rft(seq)->seq_start(seq, ppos);
> +}
> +
> +static void *rdtgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
> +{
> + return seq_rft(seq)->seq_next(seq, v, ppos);
> +}
> +
> +static void rdtgroup_seqfile_stop(struct seq_file *seq, void *v)
> +{
> + seq_rft(seq)->seq_stop(seq, v);
> +}
> +
> +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
> +{
> + struct rftype *rft = seq_rft(m);
> +
> + if (rft->seq_show)
> + return rft->seq_show(m, arg);
> + return 0;
> +}
> +
> +static struct kernfs_ops rdtgroup_kf_ops = {
> + .atomic_write_len = PAGE_SIZE,
> + .write = rdtgroup_file_write,
> + .seq_start = rdtgroup_seqfile_start,
> + .seq_next = rdtgroup_seqfile_next,
> + .seq_stop = rdtgroup_seqfile_stop,
> + .seq_show = rdtgroup_seqfile_show,
> +};

And once more nothing uses this at all. So why is it there?

> +static struct kernfs_ops rdtgroup_kf_single_ops = {
> + .atomic_write_len = PAGE_SIZE,
> + .write = rdtgroup_file_write,
> + .seq_show = rdtgroup_seqfile_show,
> +};
> +
> +static void rdtgroup_exit_rftypes(struct rftype *rfts)
> +{
> + struct rftype *rft;
> +
> + for (rft = rfts; rft->name[0] != '\0'; rft++) {
> + /* free copy for custom atomic_write_len, see init_cftypes() */
> + if (rft->max_write_len && rft->max_write_len != PAGE_SIZE)
> + kfree(rft->kf_ops);
> + rft->kf_ops = NULL;
> +
> + /* revert flags set by rdtgroup core while adding @cfts */
> + rft->flags &= ~(__RFTYPE_ONLY_ON_DFL | __RFTYPE_NOT_ON_DFL);
> + }
> +}
> +
> +static int rdtgroup_init_rftypes(struct rftype *rfts)
> +{
> + struct rftype *rft;
> +
> + for (rft = rfts; rft->name[0] != '\0'; rft++) {
> + struct kernfs_ops *kf_ops;
> +
> + if (rft->seq_start)
> + kf_ops = &rdtgroup_kf_ops;
> + else
> + kf_ops = &rdtgroup_kf_single_ops;

Ditto.

> +
> + /*
> + * Ugh... if @cft wants a custom max_write_len, we need to
> + * make a copy of kf_ops to set its atomic_write_len.
> + */
> + if (rft->max_write_len && rft->max_write_len != PAGE_SIZE) {
> + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
> + if (!kf_ops) {
> + rdtgroup_exit_rftypes(rfts);
> + return -ENOMEM;
> + }
> + kf_ops->atomic_write_len = rft->max_write_len;

No user either. Copy and paste once more ?

> + }
> +
> + rft->kf_ops = kf_ops;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * rdtgroup_init - rdtgroup initialization
> + *
> + * Register rdtgroup filesystem, and initialize any subsystems that didn't
> + * request early init.
> + */
> +int __init rdtgroup_init(void)
> +{
> + int cpu;
> +
> + WARN_ON(rdtgroup_init_rftypes(rdtgroup_root_base_files));
> +
> + WARN_ON(rdtgroup_init_rftypes(res_info_files));
> + WARN_ON(rdtgroup_init_rftypes(info_files));
> +
> + WARN_ON(rdtgroup_init_rftypes(rdtgroup_partition_base_files));
> + mutex_lock(&rdtgroup_mutex);
> +
> + init_rdtgroup_root(&rdtgrp_dfl_root);
> + WARN_ON(rdtgroup_setup_root(&rdtgrp_dfl_root, 0));
> +
> + mutex_unlock(&rdtgroup_mutex);
> +
> + WARN_ON(sysfs_create_mount_point(fs_kobj, "resctrl"));
> + WARN_ON(register_filesystem(&rdt_fs_type));
> + init_cache_domains();
> +
> + INIT_LIST_HEAD(&rdtgroups);
> +
> + for_each_online_cpu(cpu)
> + per_cpu(cpu_rdtgroup, cpu) = root_rdtgrp;

Another more for each cpu loop. Where is the hotplug update happening?

Thanks,

tglx