Re: [PATCH 4/4] Allow cgroup hierarchies to be created with nobound subsystems

From: KAMEZAWA Hiroyuki
Date: Thu Jul 23 2009 - 04:21:16 EST


On Wed, 22 Jul 2009 12:50:45 -0700
Paul Menage <menage@xxxxxxxxxx> wrote:

> Allow cgroup hierarchies to be created with no bound subsystems
>
> This patch removes the restriction that a cgroup hierarchy must have
> at least one bound subsystem. The mount option "none" is treated as
> an explicit request for no bound subsystems.
>
> A hierarchy with no subsystems can be useful for plain task tracking,
> and is also a step towards the support for multiply-bindable
> subsystems.
>
> As part of this change, the hierarchy id is no longer calculated from
> the bitmask of subsystems in the hierarchy (since this is not
> guaranteed to be unique) but is allocated via an ida. Reference
> counts on cgroups from css_set objects are now taken explicitly one
> per hierarchy, rather than one per subsystem.
>
> Example usage:
>
> mount -t cgroup -o none,name=foo cgroup /mnt/cgroup
>
> Based on the "no-op"/"none" subsystem concept proposed by
> kamezawa.hiroyu@xxxxxxxxxxxxxx
>
> Signed-off-by: Paul Menage <menage@xxxxxxxxxx>
>
> ---
>
> kernel/cgroup.c | 158 ++++++++++++++++++++++++++++++++++---------------------
> 1 files changed, 99 insertions(+), 59 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3a970e0..3648331 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -49,6 +49,7 @@
> #include <linux/namei.h>
> #include <linux/smp_lock.h>
> #include <linux/pid_namespace.h>
> +#include <linux/idr.h>
>
> #include <asm/atomic.h>
>
> @@ -77,6 +78,9 @@ struct cgroupfs_root {
> */
> unsigned long subsys_bits;
>
> + /* Unique id for this hierarchy. */
> + int hierarchy_id;
> +
> /* The bitmask of subsystems currently attached to this hierarchy */
> unsigned long actual_subsys_bits;
>
> @@ -147,6 +151,10 @@ struct css_id {
> static LIST_HEAD(roots);
> static int root_count;
>
> +static DEFINE_IDA(hierarchy_ida);
> +static int next_hierarchy_id;
> +static DEFINE_SPINLOCK(hierarchy_id_lock);
> +
> /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
> #define dummytop (&rootnode.top_cgroup)
>
> @@ -264,42 +272,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
> * compiled into their kernel but not actually in use */
> static int use_task_css_set_links __read_mostly;
>
> -/* When we create or destroy a css_set, the operation simply
> - * takes/releases a reference count on all the cgroups referenced
> - * by subsystems in this css_set. This can end up multiple-counting
> - * some cgroups, but that's OK - the ref-count is just a
> - * busy/not-busy indicator; ensuring that we only count each cgroup
> - * once would require taking a global lock to ensure that no
> - * subsystems moved between hierarchies while we were doing so.
> - *
> - * Possible TODO: decide at boot time based on the number of
> - * registered subsystems and the number of CPUs or NUMA nodes whether
> - * it's better for performance to ref-count every subsystem, or to
> - * take a global lock and only add one ref count to each hierarchy.
> - */
> -
> -/*
> - * unlink a css_set from the list and free it
> - */
> -static void unlink_css_set(struct css_set *cg)
> +static void __put_css_set(struct css_set *cg, int taskexit)
> {
> struct cg_cgroup_link *link;
> struct cg_cgroup_link *saved_link;
> -
> - hlist_del(&cg->hlist);
> - css_set_count--;
> -
> - list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> - cg_link_list) {
> - list_del(&link->cg_link_list);
> - list_del(&link->cgrp_link_list);
> - kfree(link);
> - }
> -}
> -
> -static void __put_css_set(struct css_set *cg, int taskexit)
> -{
> - int i;
> /*
> * Ensure that the refcount doesn't hit zero while any readers
> * can see it. Similar to atomic_dec_and_lock(), but for an
> @@ -312,20 +288,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
> write_unlock(&css_set_lock);
> return;
> }
> - unlink_css_set(cg);
> - write_unlock(&css_set_lock);
>
> - rcu_read_lock();
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
> + /* This css_set is dead. unlink it and release cgroup refcounts */
> + hlist_del(&cg->hlist);
> + css_set_count--;
> +
> + list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> + cg_link_list) {
> + struct cgroup *cgrp = link->cgrp;
> + list_del(&link->cg_link_list);
> + list_del(&link->cgrp_link_list);
> if (atomic_dec_and_test(&cgrp->count) &&
> notify_on_release(cgrp)) {
> if (taskexit)
> set_bit(CGRP_RELEASABLE, &cgrp->flags);
> check_for_release(cgrp);
> }
> +
> + kfree(link);
> }
> - rcu_read_unlock();
> +
> + write_unlock(&css_set_lock);
> kfree(cg);
> }
>
> @@ -519,6 +502,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
> cgrp_link_list);
> link->cg = cg;
> link->cgrp = cgrp;
> + atomic_inc(&cgrp->count);
> list_move(&link->cgrp_link_list, &cgrp->css_sets);
> /*
> * Always add links to the tail of the list so that the list
> @@ -539,7 +523,6 @@ static struct css_set *find_css_set(
> {
> struct css_set *res;
> struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> - int i;
>
> struct list_head tmp_cg_links;
>
> @@ -578,10 +561,6 @@ static struct css_set *find_css_set(
>
> write_lock(&css_set_lock);
> /* Add reference counts and links from the new css_set. */
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> - struct cgroup *cgrp = res->subsys[i]->cgroup;
> - atomic_inc(&cgrp->count);
> - }
> list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
> struct cgroup *c = link->cgrp;
> if (c->root == cgrp->root)
> @@ -960,8 +939,11 @@ struct cgroup_sb_opts {
> unsigned long flags;
> char *release_agent;
> char *name;
> + /* User explicitly requested empty subsystem */
> + bool none;
>
> struct cgroupfs_root *new_root;
> +
> };
>
> /* Convert a hierarchy specifier into a bitmask of subsystems and
> @@ -990,6 +972,9 @@ static int parse_cgroupfs_options(char *data,
> if (!ss->disabled)
> opts->subsys_bits |= 1ul << i;
> }
> + } else if (!strcmp(token, "none")) {
> + /* Explicitly have no subsystems */
> + opts->none = true;
> } else if (!strcmp(token, "noprefix")) {
> set_bit(ROOT_NOPREFIX, &opts->flags);
> } else if (!strncmp(token, "release_agent=", 14)) {
> @@ -1039,6 +1024,8 @@ static int parse_cgroupfs_options(char *data,
> }
> }
>
> + /* Consistency checks */
> +
> /*
> * Option noprefix was introduced just for backward compatibility
> * with the old cpuset, so we allow noprefix only if mounting just
> @@ -1048,7 +1035,15 @@ static int parse_cgroupfs_options(char *data,
> (opts->subsys_bits & mask))
> return -EINVAL;
>
> - /* We can't have an empty hierarchy */
> +
> + /* Can't specify "none" and some subsystems */
> + if (opts->subsys_bits && opts->none)
> + return -EINVAL;

Is this case never happens ?

BUG_ON(!opts->subsys_bits && !opts->none)


> +
> + /*
> + * We either have to specify by name or by subsystems. (So all
> + * empty hierarchies must have a name).
> + */
> if (!opts->subsys_bits && !opts->name)
> return -EINVAL;
>
> @@ -1129,6 +1124,31 @@ static void init_cgroup_root(struct cgroupfs_root *root)
> init_cgroup_housekeeping(cgrp);
> }
>
> +static bool init_root_id(struct cgroupfs_root *root)
> +{
> + int ret = 0;
> +
> + do {
> + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
> + return false;
> + spin_lock(&hierarchy_id_lock);
> + /* Try to allocate the next unused ID */
> + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
> + &root->hierarchy_id);

Why new id should be greater than old one ?
Just for avoiding reuse-id-too-soon ?

> + if (ret == -ENOSPC)
> + /* Try again starting from 0 */
> + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
> + if (!ret) {
> + next_hierarchy_id = root->hierarchy_id + 1;
> + } else if (ret != -EAGAIN) {
> + /* Can only get here if the 31-bit IDR is full ... */
> + BUG_ON(ret);
> + }
> + spin_unlock(&hierarchy_id_lock);
> + } while (ret);
> + return true;
> +}
> +
> static int cgroup_test_super(struct super_block *sb, void *data)
> {
> struct cgroup_sb_opts *opts = data;
> @@ -1138,8 +1158,12 @@ static int cgroup_test_super(struct super_block *sb, void *data)
> if (opts->name && strcmp(opts->name, root->name))
> return 0;
>
> - /* If we asked for subsystems then they must match */
> - if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits))
> + /*
> + * If we asked for subsystems (or explicitly for no
> + * subsystems) then they must match
> + */
> + if ((opts->subsys_bits || opts->none)
> + && (opts->subsys_bits != root->subsys_bits))
> return 0;
>
> return 1;
> @@ -1149,15 +1173,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> {
> struct cgroupfs_root *root;
>
> - /* Empty hierarchies aren't supported */
> - if (!opts->subsys_bits)
> + if (!opts->subsys_bits && !opts->none)
> return NULL;
>
> root = kzalloc(sizeof(*root), GFP_KERNEL);
> if (!root)
> return ERR_PTR(-ENOMEM);
>
> + if (!init_root_id(root)) {
> + kfree(root);
> + return ERR_PTR(-ENOMEM);
> + }
> init_cgroup_root(root);
> +
> root->subsys_bits = opts->subsys_bits;
> root->flags = opts->flags;
> if (opts->release_agent)
> @@ -1167,6 +1195,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> return root;
> }
>
> +static void cgroup_drop_root(struct cgroupfs_root *root)
> +{
> + if (!root)
> + return;
> +
> + BUG_ON(!root->hierarchy_id);
> + spin_lock(&hierarchy_id_lock);
> + ida_remove(&hierarchy_ida, root->hierarchy_id);
> + spin_unlock(&hierarchy_id_lock);
> + kfree(root);
> +}
> +
> static int cgroup_set_super(struct super_block *sb, void *data)
> {
> int ret;
> @@ -1176,7 +1216,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
> if (!opts->new_root)
> return -EINVAL;
>
> - BUG_ON(!opts->subsys_bits);
> + BUG_ON(!opts->subsys_bits && !opts->none);
>
> ret = set_anon_super(sb, NULL);
> if (ret)
> @@ -1246,7 +1286,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
> if (IS_ERR(sb)) {
> ret = PTR_ERR(sb);
> - kfree(opts.new_root);
> + cgroup_drop_root(opts.new_root);
> goto out_err;
> }
>
> @@ -1340,7 +1380,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> * We re-used an existing hierarchy - the new root (if
> * any) is not needed
> */
> - kfree(opts.new_root);
> + cgroup_drop_root(opts.new_root);
> }
>
> simple_set_mnt(mnt, sb);
> @@ -1400,7 +1440,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
> mutex_unlock(&cgroup_mutex);
>
> kill_litter_super(sb);
> - kfree(root);
> + cgroup_drop_root(root);
> }
>
> static struct file_system_type cgroup_fs_type = {
> @@ -3090,7 +3130,7 @@ int __init cgroup_init(void)
> /* Add init_css_set to the hash table */
> hhead = css_set_hash(init_css_set.subsys);
> hlist_add_head(&init_css_set.hlist, hhead);
> -
> + BUG_ON(!init_root_id(&rootnode));
> err = register_filesystem(&cgroup_fs_type);
> if (err < 0)
> goto out;
> @@ -3145,7 +3185,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
> struct cgroup *cgrp;
> int count = 0;
>
> - seq_printf(m, "%lu:", root->subsys_bits);
> + seq_printf(m, "%d:", root->hierarchy_id);
> for_each_subsys(root, ss)
> seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
> if (strlen(root->name))
> @@ -3191,8 +3231,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
> mutex_lock(&cgroup_mutex);
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> - seq_printf(m, "%s\t%lu\t%d\t%d\n",
> - ss->name, ss->root->subsys_bits,
> + seq_printf(m, "%s\t%d\t%d\t%d\n",
> + ss->name, ss->root->hierarchy_id,
> ss->root->number_of_cgroups, !ss->disabled);
> }
> mutex_unlock(&cgroup_mutex);
> @@ -3910,8 +3950,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
> name = c->dentry->d_name.name;
> else
> name = "?";
> - seq_printf(seq, "Root %lu group %s\n",
> - c->root->subsys_bits, name);
> + seq_printf(seq, "Root %d group %s\n",
> + c->root->hierarchy_id, name);
> }

I'm not sure but this "id" is worth to be printed ?

Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/