Re: [PATCH 25/32] x86/intel_rdt_rdtgroup.c: User interface for RDT

From: Thomas Gleixner
Date: Thu Jul 14 2016 - 08:33:33 EST


On Tue, 12 Jul 2016, Fenghua Yu wrote:
>
> +extern struct cache_domain cache_domains[MAX_CACHE_LEAVES];
> +
> +

Why only two extra new lines?

> extern struct rdt_opts rdt_opts;
> +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> @@ -0,0 +1,881 @@
> +/*
> + * Resource Director Technology(RDT)
> + * - User interface for Resource Alloction in RDT.
> + *
> + * Copyright (C) 2016 Intel Corporation
> + *
> + * 2016 Written by
> + * Fenghua Yu <fenghua.yu@xxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * More information about RDT be found in the Intel (R) x86 Architecture
> + * Software Developer Manual.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/cred.h>
> +#include <linux/ctype.h>
> +#include <linux/errno.h>
> +#include <linux/init_task.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/magic.h>
> +#include <linux/mm.h>
> +#include <linux/mutex.h>
> +#include <linux/mount.h>
> +#include <linux/pagemap.h>
> +#include <linux/proc_fs.h>
> +#include <linux/rcupdate.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/percpu-rwsem.h>
> +#include <linux/string.h>
> +#include <linux/sort.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/idr.h>
> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */

You want to replace vmalloc with a more sophisticated array? Good luck with
that. Get rid of those nonsensical comments in random places and please do not
use tail comments at all.

> +#include <linux/kthread.h>
> +#include <linux/delay.h>
> +#include <linux/atomic.h>
> +#include <linux/cpumask.h>
> +#include <linux/cacheinfo.h>
> +#include <linux/cacheinfo.h>
> +#include <net/sock.h>
> +#include <asm/intel_rdt_rdtgroup.h>
> +#include <asm/intel_rdt.h>

Are you sure that you need all of these includes?

> +/**
> + * kernfs_root - find out the kernfs_root a kernfs_node belongs to
> + * @kn: kernfs_node of interest
> + *
> + * Return the kernfs_root @kn belongs to.
> + */
> +static inline struct kernfs_root *get_kernfs_root(struct kernfs_node *kn)
> +{
> + /* if parent exists, it's always a dir; otherwise, @sd is a dir */

Why? That's non-obvious for the casual reader ...

> + if (kn->parent)
> + kn = kn->parent;
> + return kn->dir.root;
> +}
> +
> +/*
> + * Protects rdtgroup_idr so that IDs can be released without grabbing
> + * rdtgroup_mutex.
> + */
> +static DEFINE_SPINLOCK(rdtgroup_idr_lock);
> +
> +struct percpu_rw_semaphore rdtgroup_threadgroup_rwsem;
> +
> +#define MAX_CPUMASK_CHAR_IN_HEX (NR_CPUS/4)

Oh well. This is used to size a buffer. At the usage site you do

char buf[MAX_CPUMASK_CHAR_IN_HEX + 1];

What's wrong with giving this thing an understandable name and doing the '+ 1'
right at the macro definition?

#define CPUMASK_BUF_LEN ((NR_CPUS / 4) + 1)

> +
> +static struct rftype rdtgroup_root_base_files[];
> +
> +#define RDTGROUP_FILE_NAME_MAX (MAX_RDTGROUP_TYPE_NAMELEN + \
> + MAX_RFTYPE_NAME + 2)

So that's the maximum number of rdt group file names or what?

s/MAX/LEN/ perhaps?
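
i.e. something like:

#define RDTGROUP_FILE_NAME_LEN	(MAX_RDTGROUP_TYPE_NAMELEN + \
				 MAX_RFTYPE_NAME + 2)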

> +static char *rdtgroup_file_name(const struct rftype *rft, char *buf)
> +{
> + strncpy(buf, rft->name, RDTGROUP_FILE_NAME_MAX);

Do we really need a wrapper around strncpy() with a completely useless
function name?

> + return buf;
> +}
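
IOW, just open code it at the call sites, using the LEN define suggested
above:

	char name[RDTGROUP_FILE_NAME_LEN];

	strncpy(name, rft->name, sizeof(name));
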
> +
> +/**
> + * rdtgroup_file_mode - deduce file mode of a control file
> + * @cft: the control file in question
> + *
> + * S_IRUGO for read, S_IWUSR for write.
> + */
> +static umode_t rdtgroup_file_mode(const struct rftype *rft)
> +{
> + umode_t mode = 0;
> +
> + if (rft->read_u64 || rft->read_s64 || rft->seq_show)
> + mode |= S_IRUGO;
> +
> + if (rft->write_u64 || rft->write_s64 || rft->write) {
> + if (rft->flags & RFTYPE_WORLD_WRITABLE)

Why would any of this be world writeable?

Can you please document the permission rules of this file system meticulously
and provide sensible arguments for your choices?

> + mode |= S_IWUGO;
> + else
> + mode |= S_IWUSR;
> + }
> +
> + return mode;
> +}
> +
> +struct rdtgroup *root_rdtgrp;

Can you please place global variables at the top of the file and not glue them
without a newline to a static function?

> +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
> +{
> + char name[RDTGROUP_FILE_NAME_MAX];
> + struct kernfs_node *kn;
> + struct lock_class_key *key = NULL;
> + int ret;
> +
> + kn = __kernfs_create_file(parent_kn, rdtgroup_file_name(rft, name),
> + rdtgroup_file_mode(rft), 0, rft->kf_ops, rft,
> + NULL, key);
> + if (IS_ERR(kn))
> + return PTR_ERR(kn);
> +
> + ret = rdtgroup_kn_set_ugid(kn);
> + if (ret) {
> + kernfs_remove(kn);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +static void rdtgroup_rm_file(struct kernfs_node *kn, const struct rftype *rft)
> +{
> + char name[RDTGROUP_FILE_NAME_MAX];
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + kernfs_remove_by_name(kn, rdtgroup_file_name(rft, name));
> +}
> +
> +static int rdtgroup_addrm_files(struct kernfs_node *kn, struct rftype rfts[],
> + bool is_add)
> +{
> + struct rftype *rft, *rft_end = NULL;
> + int ret;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> +restart:
> + for (rft = rfts; rft != rft_end && rft->name[0] != '\0'; rft++) {

That rft->name[0] condition is ugly.

> + if (is_add) {
> + ret = rdtgroup_add_file(kn, rft);
> + if (ret) {
> + pr_warn("%s: failed to add %s, err=%d\n",
> + __func__, rft->name, ret);
> + rft_end = rft;
> + is_add = false;
> + goto restart;
> + }
> + } else {
> + rdtgroup_rm_file(kn, rft);
> + }
> + }
> + return 0;
> +}

This is crack. Really. I had to read this 5 times to understand the convoluted
logic here. What are you saving by this is_add magic?

You trade readability for saving an extra for (...) loop.

What's wrong with:

static void rdtgroup_rm_files(struct kernfs_node *kn, const struct rftype *rft,
			      const struct rftype *end)
{
	for (; rft != end; rft++)
		rdtgroup_rm_file(kn, rft);
}

static int rdtgroup_add_files(struct kernfs_node *kn, const struct rftype *rft,
			      const struct rftype *end)
{
	const struct rftype *p;

	for (p = rft; p != end; p++) {
		int ret = rdtgroup_add_file(kn, p);

		if (ret) {
			rdtgroup_rm_files(kn, rft, p);
			return ret;
		}
	}
	return 0;
}

That's too easy to read and understand, right?

And the callsite simply does:

	rdtgroup_add_files(kn, rfts, rfts + ARRAY_SIZE(rfts));

> +static enum resource_type get_kn_res_type(struct kernfs_node *kn)
> +{
> + return RESOURCE_L3;

Errm. Can we please have a way to store the resource type right away? We
already know that we need it.

> +}
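
Completely untested sketch: these files already get the rftype as kn->priv,
so the resource type can simply be carried in struct rftype (the res_type
member below is made up):

struct rftype {
	...
	enum resource_type res_type;
};

static enum resource_type get_kn_res_type(struct kernfs_node *kn)
{
	struct rftype *rft = kn->priv;

	return rft->res_type;
}
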
> +
> +static int rdt_max_closid_show(struct seq_file *seq, void *v)
> +{
> + struct kernfs_open_file *of = seq->private;
> + enum resource_type res_type;
> +
> + res_type = get_kn_res_type(of->kn);
> +
> + switch (res_type) {

switch (get_kn_res_type(of->kn)) {

Perhaps?

> + case RESOURCE_L3:
> + seq_printf(seq, "%d\n",
> + boot_cpu_data.x86_l3_max_closid);
> + break;

> +static int get_shared_domain(int domain, int level)

Nothing is using this in this patch. So why is it introduced here? Darn,
this adding/modifying of random code without context makes review simply
impossible.

> +{
> + int sd;
> +
> + for_each_cache_domain(sd, 0, shared_domain_num) {
> + if (cat_l3_enabled && level == CACHE_LEVEL3) {
> + if (shared_domain[sd].l3_domain == domain)
> + return sd;
> + }
> + }
> +
> + return -1;
> +}


> +static int rdtgroup_procs_write_permission(struct task_struct *task,
> + struct kernfs_open_file *of)
> +{
> + const struct cred *cred = current_cred();
> + const struct cred *tcred = get_task_cred(task);
> + int ret = 0;
> +
> + /*
> + * even if we're attaching all tasks in the thread group, we only
> + * need to check permissions on one of them.
> + */
> + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
> + !uid_eq(cred->euid, tcred->uid) &&
> + !uid_eq(cred->euid, tcred->suid))
> + ret = -EACCES;

Why EACCES? EPERM is the canonical error code here.

> +
> + put_cred(tcred);
> + return ret;
> +}
> +
> +bool use_rdtgroup_tasks;

Another random global variable at some random place.

> +
> +static void init_rdtgroup_housekeeping(struct rdtgroup *rdtgrp)
> +{
> + init_waitqueue_head(&rdtgrp->offline_waitq);
> + rdtgrp->pset.self = rdtgrp;
> + INIT_LIST_HEAD(&rdtgrp->pset.task_iters);
> +}
> +
> +static LIST_HEAD(rdtgroup_lists);
> +static void init_rdtgroup_root(struct rdtgroup_root *root)

Sigh. This coding style sucks.

> +{
> + struct rdtgroup *rdtgrp = &root->rdtgrp;
> +
> + INIT_LIST_HEAD(&root->root_list);
> + INIT_LIST_HEAD(&rdtgrp->rdtgroup_list);
> + list_add_tail(&rdtgrp->rdtgroup_list, &rdtgroup_lists);
> + atomic_set(&root->nr_rdtgrps, 1);
> + rdtgrp->root = root;
> + init_rdtgroup_housekeeping(rdtgrp);
> + idr_init(&root->rdtgroup_idr);
> +}
> +
> +static DEFINE_IDR(rdtgroup_hierarchy_idr);
> +static int rdtgroup_init_root_id(struct rdtgroup_root *root)
> +{
> + int id;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + id = idr_alloc_cyclic(&rdtgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
> + if (id < 0)
> + return id;
> +
> + root->hierarchy_id = id;
> + return 0;
> +}
> +
> +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops;
> +/* IDR wrappers which synchronize using rdtgroup_idr_lock */
> +static int rdtgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
> + gfp_t gfp_mask)
> +{
> + int ret;
> +
> + idr_preload(gfp_mask);
> + spin_lock_bh(&rdtgroup_idr_lock);

Why is this lock_bh? Against which softirq is this protecting?

> + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
> + spin_unlock_bh(&rdtgroup_idr_lock);
> + idr_preload_end();
> + return ret;
> +}
> +
> +/* hierarchy ID allocation and mapping, protected by rdtgroup_mutex */
> +static void rdtgroup_exit_root_id(struct rdtgroup_root *root)
> +{
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + if (root->hierarchy_id) {
> + idr_remove(&rdtgroup_hierarchy_idr, root->hierarchy_id);
> + root->hierarchy_id = 0;
> + }
> +}
> +
> +static struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
> +{
> + struct rdtgroup *rdtgrp;
> +
> + if (kernfs_type(kn) == KERNFS_DIR)
> + rdtgrp = kn->priv;
> + else
> + rdtgrp = kn->parent->priv;
> +
> + kernfs_break_active_protection(kn);

So you drop the kernfs protection here. What makes sure that rdtgrp is still
valid? Documentation of locking and protection rules is not optional, really.

> +
> + mutex_lock(&rdtgroup_mutex);
> +
> + return rdtgrp;
> +}
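
If the group can go away once the active protection is dropped, then this
needs something like the completely untested sketch below. Note that
rdtgroup_is_removed() is made up here and needs to be backed by whatever
the actual life time rules of these groups are:

static struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
{
	struct rdtgroup *rdtgrp = ...;

	kernfs_break_active_protection(kn);
	mutex_lock(&rdtgroup_mutex);

	/* Recheck under the mutex that the group has not been removed */
	if (!rdtgroup_is_removed(rdtgrp))
		return rdtgrp;

	mutex_unlock(&rdtgroup_mutex);
	return NULL;
}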

> +static int rdtgroup_setup_root(struct rdtgroup_root *root,
> + unsigned long ss_mask)
> +{
> + int ret;
> +
> + root_rdtgrp = &root->rdtgrp;
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + ret = rdtgroup_idr_alloc(&root->rdtgroup_idr, root_rdtgrp,
> + 1, 2, GFP_KERNEL);
> + if (ret < 0)
> + goto out;
> +
> + root_rdtgrp->id = ret;
> + root_rdtgrp->ancestor_ids[0] = ret;
> +
> + ret = rdtgroup_init_root_id(root);
> + if (ret)
> + goto cancel_ref;

cancel_ref is empty. What's the point? And what is undoing the idr_alloc()?
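
If that label is needed at all, then it should actually undo the
allocation, e.g. (untested; whatever locking rdtgroup_idr_lock requires
applies here as well):

cancel_ref:
	idr_remove(&root->rdtgroup_idr, root_rdtgrp->id);
out:
	return ret;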

> +
> + root->kf_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
> + KERNFS_ROOT_CREATE_DEACTIVATED,
> + root_rdtgrp);
> + if (IS_ERR(root->kf_root)) {
> + ret = PTR_ERR(root->kf_root);
> + goto exit_root_id;
> + }
> + root_rdtgrp->kn = root->kf_root->kn;
> +
> + ret = rdtgroup_populate_dir(root->kf_root->kn);
> + if (ret)
> + goto destroy_root;
> +
> + /*
> + * Link the root rdtgroup in this hierarchy into all the css_set
> + * objects.
> + */
> + WARN_ON(atomic_read(&root->nr_rdtgrps) != 1);
> +
> + kernfs_activate(root_rdtgrp->kn);
> + ret = 0;
> + goto out;
> +
> +destroy_root:
> + kernfs_destroy_root(root->kf_root);
> + root->kf_root = NULL;
> +exit_root_id:
> + rdtgroup_exit_root_id(root);
> +cancel_ref:
> +out:
> + return ret;
> +}
> +
> +#define cache_leaves(cpu) (get_cpu_cacheinfo(cpu)->num_leaves)

Do we really need this? What's wrong with having it open coded at the only
usage site? That would make the code too obvious, right?
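
i.e.:

	for (leaf = 0; leaf < get_cpu_cacheinfo(0)->num_leaves; leaf++)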

> +struct cache_domain cache_domains[MAX_CACHE_LEAVES];
> +
> +static int get_shared_cache_id(int cpu, int level)
> +{
> + struct cpuinfo_x86 *c;
> + int index_msb;
> + struct cpu_cacheinfo *this_cpu_ci;
> + struct cacheinfo *this_leaf;
> +
> + this_cpu_ci = get_cpu_cacheinfo(cpu);
> +
> + this_leaf = this_cpu_ci->info_list + level_to_leaf(level);
> + return this_leaf->id;
> + return c->apicid >> index_msb;
> +}
> +
> +static __init void init_cache_domains(void)
> +{
> + int cpu, domain;
> + struct cpu_cacheinfo *this_cpu_ci;
> + struct cacheinfo *this_leaf;
> + int leaves;
> + char buf[MAX_CPUMASK_CHAR_IN_HEX + 1];

That's 2k of stack for CONFIG_MAXSMP=y, and if the SGI folks keep going it
will be more. Make this a static __initdata buffer ...
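
e.g., with the buffer length define suggested above:

static char buf[CPUMASK_BUF_LEN] __initdata;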

> + unsigned int level;
> +
> + for (leaves = 0; leaves < cache_leaves(0); leaves++) {

Why leaves? The iterator deals with a single leaf, right?

> + for_each_online_cpu(cpu) {

So this relies on the fact that user space is not yet up and running so the
cpu map cannot change, right?

> + struct cpumask *mask;
> +
> + this_cpu_ci = get_cpu_cacheinfo(cpu);
> + this_leaf = this_cpu_ci->info_list + leaves;
> + cache_domains[leaves].level = this_leaf->level;
> + mask = &this_leaf->shared_cpu_map;
> + cpumap_print_to_pagebuf(false, buf, mask);
> + for (domain = 0; domain < MAX_CACHE_DOMAINS; domain++) {
> + if (cpumask_test_cpu(cpu,
> + &cache_domains[leaves].shared_cpu_map[domain]))
> + break;
> + }

If you end up with completely silly line breaks, it's time to think about
splitting the function up.

for (leaf = 0; leaf < cache_leaves(0); leaf++) {
	for_each_online_cpu(cpu)
		init_cache_domain(cpu, leaf);
}

That gets rid of two indentation levels and makes the above and the below
readable. But it's more time-consuming to come up with unreadable line
breaks and therefore it must be better, right?
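
Completely untested sketch of such a helper, derived from the code above.
Note that the cpumap_print_to_pagebuf() result is not used for anything,
so it can go away entirely:

static __init void init_cache_domain(int cpu, int leaf)
{
	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
	struct cacheinfo *this_leaf = ci->info_list + leaf;
	struct cache_domain *cd = &cache_domains[leaf];
	int domain;

	cd->level = this_leaf->level;

	/* Is the CPU already covered by one of the existing domains? */
	for (domain = 0; domain < cd->max_cache_domains_num; domain++) {
		if (cpumask_test_cpu(cpu, &cd->shared_cpu_map[domain]))
			return;
	}

	if (WARN_ON(cd->max_cache_domains_num >= MAX_CACHE_DOMAINS))
		return;

	/* No. Set up a new domain */
	domain = cd->max_cache_domains_num++;
	cd->shared_cpu_map[domain] = this_leaf->shared_cpu_map;
	cd->shared_cache_id[domain] = get_shared_cache_id(cpu, cd->level);
}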

> + if (domain == MAX_CACHE_DOMAINS) {
> + domain =
> + cache_domains[leaves].max_cache_domains_num++;
> +
> + cache_domains[leaves].shared_cpu_map[domain] =
> + *mask;
> +
> + level = cache_domains[leaves].level;
> + cache_domains[leaves].shared_cache_id[domain] =
> + get_shared_cache_id(cpu, level);
> + }
> + }
> + }
> +}

I'm giving up for now.

tglx