Re: [PATCH 21/34] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #12]

From: Andrei Vagin
Date: Sun Nov 18 2018 - 23:23:54 EST


On Fri, Sep 21, 2018 at 05:33:01PM +0100, David Howells wrote:
> Make kernfs support superblock creation/mount/remount with fs_context.
>
> This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
> be made to support fs_context also.
>
> Notes:
>
> (1) A kernfs_fs_context struct is created to wrap fs_context and the
> kernfs mount parameters are moved in here (or are in fs_context).
>
> (2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
> namespace tag parameter is passed in the context if desired.
>
> (3) kernfs_free_fs_context() is provided as a destructor for the
> kernfs_fs_context struct, but for the moment it does nothing except
> get called in the right places.
>
> (4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
> pass, but possibly this should be done anyway in case someone wants to
> add a parameter in future.
>
> (5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
> the cgroup v1 and v2 mount parameters are all moved there.
>
> (6) cgroup1 parameter parsing error messages are now handled by invalf(),
> which allows userspace to collect them directly.
>
> (7) cgroup1 parameter cleanup is now done in the context destructor rather
> than in the mount/get_tree and remount functions.
>
> Weirdies:
>
> (*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
> but then uses the resulting pointer after dropping the locks. I'm
> told this is okay and needs commenting.
>
> (*) The cgroup refcount web. This really needs documenting.
>
> (*) cgroup2 only has one root?
>
> Add a suggestion from Thomas Gleixner in which the RDT enablement code is
> placed into its own function.
>
> Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
> cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> cc: Tejun Heo <tj@xxxxxxxxxx>
> cc: Li Zefan <lizefan@xxxxxxxxxx>
> cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> cc: cgroups@xxxxxxxxxxxxxxx
> cc: fenghua.yu@xxxxxxxxx
> ---
>
> arch/x86/kernel/cpu/intel_rdt.h | 15 +
> arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 183 ++++++++++------
> fs/kernfs/mount.c | 88 ++++----
> fs/sysfs/mount.c | 67 ++++--
> include/linux/cgroup.h | 3
> include/linux/kernfs.h | 39 ++-
> kernel/cgroup/cgroup-internal.h | 50 +++-
> kernel/cgroup/cgroup-v1.c | 345 ++++++++++++++++--------------
> kernel/cgroup/cgroup.c | 264 +++++++++++++++--------
> kernel/cgroup/cpuset.c | 4
> 10 files changed, 640 insertions(+), 418 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
> index 4e588f36228f..1461adc2c5e8 100644
> --- a/arch/x86/kernel/cpu/intel_rdt.h
> +++ b/arch/x86/kernel/cpu/intel_rdt.h
> @@ -33,6 +33,21 @@
> #define RMID_VAL_ERROR BIT_ULL(63)
> #define RMID_VAL_UNAVAIL BIT_ULL(62)
>
> +
> +struct rdt_fs_context {
> + struct kernfs_fs_context kfc;
> + bool enable_cdpl2;
> + bool enable_cdpl3;
> + bool enable_mba_mbps;
> +};
> +
> +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> +{
> + struct kernfs_fs_context *kfc = fc->fs_private;
> +
> + return container_of(kfc, struct rdt_fs_context, kfc);
> +}
> +
> DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
>
> /**
> diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> index d6cb04c3a28b..34733a221669 100644
> --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> @@ -24,6 +24,7 @@
> #include <linux/cpu.h>
> #include <linux/debugfs.h>
> #include <linux/fs.h>
> +#include <linux/fs_parser.h>
> #include <linux/sysfs.h>
> #include <linux/kernfs.h>
> #include <linux/seq_buf.h>
> @@ -1707,43 +1708,6 @@ static void cdp_disable_all(void)
> cdpl2_disable();
> }
>
> -static int parse_rdtgroupfs_options(char *data)
> -{
> - char *token, *o = data;
> - int ret = 0;
> -
> - while ((token = strsep(&o, ",")) != NULL) {
> - if (!*token) {
> - ret = -EINVAL;
> - goto out;
> - }
> -
> - if (!strcmp(token, "cdp")) {
> - ret = cdpl3_enable();
> - if (ret)
> - goto out;
> - } else if (!strcmp(token, "cdpl2")) {
> - ret = cdpl2_enable();
> - if (ret)
> - goto out;
> - } else if (!strcmp(token, "mba_MBps")) {
> - ret = set_mba_sc(true);
> - if (ret)
> - goto out;
> - } else {
> - ret = -EINVAL;
> - goto out;
> - }
> - }
> -
> - return 0;
> -
> -out:
> - pr_err("Invalid mount option \"%s\"\n", token);
> -
> - return ret;
> -}
> -
> /*
> * We don't allow rdtgroup directories to be created anywhere
> * except the root directory. Thus when looking for the rdtgroup
> @@ -1815,13 +1779,27 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
> struct rdtgroup *prgrp,
> struct kernfs_node **mon_data_kn);
>
> -static struct dentry *rdt_mount(struct file_system_type *fs_type,
> - int flags, const char *unused_dev_name,
> - void *data, size_t data_size)
> +static int rdt_enable_ctx(struct rdt_fs_context *ctx)
> +{
> + int ret = 0;
> +
> + if (ctx->enable_cdpl2)
> + ret = cdpl2_enable();
> +
> + if (!ret && ctx->enable_cdpl3)
> + ret = cdpl3_enable();
> +
> + if (!ret && ctx->enable_mba_mbps)
> + ret = set_mba_sc(true);
> +
> + return ret;
> +}
> +
> +static int rdt_get_tree(struct fs_context *fc)
> {
> + struct rdt_fs_context *ctx = rdt_fc2context(fc);
> struct rdt_domain *dom;
> struct rdt_resource *r;
> - struct dentry *dentry;
> int ret;
>
> cpus_read_lock();
> @@ -1830,53 +1808,42 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
> * resctrl file system can only be mounted once.
> */
> if (static_branch_unlikely(&rdt_enable_key)) {
> - dentry = ERR_PTR(-EBUSY);
> + ret = -EBUSY;
> goto out;
> }
>
> - ret = parse_rdtgroupfs_options(data);
> - if (ret) {
> - dentry = ERR_PTR(ret);
> + ret = rdt_enable_ctx(ctx);
> + if (ret < 0)
> goto out_cdp;
> - }
>
> closid_init();
>
> ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
> - if (ret) {
> - dentry = ERR_PTR(ret);
> - goto out_cdp;
> - }
> + if (ret < 0)
> + goto out_mba;
>
> if (rdt_mon_capable) {
> ret = mongroup_create_dir(rdtgroup_default.kn,
> NULL, "mon_groups",
> &kn_mongrp);
> - if (ret) {
> - dentry = ERR_PTR(ret);
> + if (ret < 0)
> goto out_info;
> - }
> kernfs_get(kn_mongrp);
>
> ret = mkdir_mondata_all(rdtgroup_default.kn,
> &rdtgroup_default, &kn_mondata);
> - if (ret) {
> - dentry = ERR_PTR(ret);
> + if (ret < 0)
> goto out_mongrp;
> - }
> kernfs_get(kn_mondata);
> rdtgroup_default.mon.mon_data_kn = kn_mondata;
> }
>
> ret = rdt_pseudo_lock_init();
> - if (ret) {
> - dentry = ERR_PTR(ret);
> + if (ret)
> goto out_mondata;
> - }
>
> - dentry = kernfs_mount(fs_type, flags, rdt_root,
> - RDTGROUP_SUPER_MAGIC, NULL);
> - if (IS_ERR(dentry))
> + ret = kernfs_get_tree(fc);
> + if (ret < 0)
> goto out_psl;
>
> if (rdt_alloc_capable)
> @@ -1905,14 +1872,97 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
> kernfs_remove(kn_mongrp);
> out_info:
> kernfs_remove(kn_info);
> +out_mba:
> + if (ctx->enable_mba_mbps)
> + set_mba_sc(false);
> out_cdp:
> cdp_disable_all();
> out:
> rdt_last_cmd_clear();
> mutex_unlock(&rdtgroup_mutex);
> cpus_read_unlock();
> + return ret;
> +}
> +
> +enum rdt_param {
> + Opt_cdp,
> + Opt_cdpl2,
> + Opt_mba_mpbs,
> + nr__rdt_params
> +};
> +
> +static const struct fs_parameter_spec rdt_param_specs[nr__rdt_params] = {
> + [Opt_cdp] = { fs_param_is_flag },
> + [Opt_cdpl2] = { fs_param_is_flag },
> + [Opt_mba_mpbs] = { fs_param_is_flag },
> +};
> +
> +static const char *const rdt_param_keys[nr__rdt_params] = {
> + [Opt_cdp] = "cdp",
> + [Opt_cdpl2] = "cdpl2",
> + [Opt_mba_mpbs] = "mba_mbps",
> +};
> +
> +static const struct fs_parameter_description rdt_fs_parameters = {
> + .name = "rdt",
> + .nr_params = nr__rdt_params,
> + .keys = rdt_param_keys,
> + .specs = rdt_param_specs,
> + .no_source = true,
> +};
> +
> +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> + struct rdt_fs_context *ctx = rdt_fc2context(fc);
> + struct fs_parse_result result;
> + int opt;
>
> - return dentry;
> + opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
> + if (opt < 0)
> + return opt;
> +
> + switch (opt) {
> + case Opt_cdp:
> + ctx->enable_cdpl3 = true;
> + return 0;
> + case Opt_cdpl2:
> + ctx->enable_cdpl2 = true;
> + return 0;
> + case Opt_mba_mpbs:
> + ctx->enable_mba_mbps = true;
> + return 0;
> + }
> +
> + return -EINVAL;
> +}
> +
> +static void rdt_fs_context_free(struct fs_context *fc)
> +{
> + struct rdt_fs_context *ctx = rdt_fc2context(fc);
> +
> + kernfs_free_fs_context(fc);
> + kfree(ctx);
> +}
> +
> +static const struct fs_context_operations rdt_fs_context_ops = {
> + .free = rdt_fs_context_free,
> + .parse_param = rdt_parse_param,
> + .get_tree = rdt_get_tree,
> +};
> +
> +static int rdt_init_fs_context(struct fs_context *fc, struct dentry *reference)
> +{
> + struct rdt_fs_context *ctx;
> +
> + ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
> + if (!ctx)
> + return -ENOMEM;
> +
> + ctx->kfc.root = rdt_root;
> + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
> + fc->fs_private = &ctx->kfc;
> + fc->ops = &rdt_fs_context_ops;
> + return 0;
> }
>
> static int reset_all_ctrls(struct rdt_resource *r)
> @@ -2085,9 +2135,10 @@ static void rdt_kill_sb(struct super_block *sb)
> }
>
> static struct file_system_type rdt_fs_type = {
> - .name = "resctrl",
> - .mount = rdt_mount,
> - .kill_sb = rdt_kill_sb,
> + .name = "resctrl",
> + .init_fs_context = rdt_init_fs_context,
> + .parameters = &rdt_fs_parameters,
> + .kill_sb = rdt_kill_sb,
> };
>
> static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index f70e0b69e714..56742632956c 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -22,14 +22,13 @@
>
> struct kmem_cache *kernfs_node_cache;
>
> -static int kernfs_sop_remount_fs(struct super_block *sb, int *flags,
> - char *data, size_t data_size)
> +int kernfs_reconfigure(struct fs_context *fc)
> {
> - struct kernfs_root *root = kernfs_info(sb)->root;
> + struct kernfs_root *root = kernfs_info(fc->root->d_sb)->root;
> struct kernfs_syscall_ops *scops = root->syscall_ops;
>
> - if (scops && scops->remount_fs)
> - return scops->remount_fs(root, flags, data);
> + if (scops && scops->reconfigure)
> + return scops->reconfigure(root, fc);
> return 0;
> }
>
> @@ -61,7 +60,6 @@ const struct super_operations kernfs_sops = {
> .drop_inode = generic_delete_inode,
> .evict_inode = kernfs_evict_inode,
>
> - .remount_fs = kernfs_sop_remount_fs,
> .show_options = kernfs_sop_show_options,
> .show_path = kernfs_sop_show_path,
> };
> @@ -219,7 +217,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
> } while (true);
> }
>
> -static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
> +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
> {
> struct kernfs_super_info *info = kernfs_info(sb);
> struct inode *inode;
> @@ -230,7 +228,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
> sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
> sb->s_blocksize = PAGE_SIZE;
> sb->s_blocksize_bits = PAGE_SHIFT;
> - sb->s_magic = magic;
> + sb->s_magic = kfc->magic;
> sb->s_op = &kernfs_sops;
> sb->s_xattr = kernfs_xattr_handlers;
> if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
> @@ -257,21 +255,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
> return 0;
> }
>
> -static int kernfs_test_super(struct super_block *sb, void *data)
> +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
> {
> struct kernfs_super_info *sb_info = kernfs_info(sb);
> - struct kernfs_super_info *info = data;
> + struct kernfs_super_info *info = fc->s_fs_info;
>
> return sb_info->root == info->root && sb_info->ns == info->ns;
> }
>
> -static int kernfs_set_super(struct super_block *sb, void *data)
> +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
> {
> - int error;
> - error = set_anon_super(sb, data);
> - if (!error)
> - sb->s_fs_info = data;
> - return error;
> + struct kernfs_fs_context *kfc = fc->fs_private;
> +
> + kfc->ns_tag = NULL;
> + return set_anon_super_fc(sb, fc);
> }
>
> /**
> @@ -288,63 +285,60 @@ const void *kernfs_super_ns(struct super_block *sb)
> }
>
> /**
> - * kernfs_mount_ns - kernfs mount helper
> - * @fs_type: file_system_type of the fs being mounted
> - * @flags: mount flags specified for the mount
> - * @root: kernfs_root of the hierarchy being mounted
> - * @magic: file system specific magic number
> - * @new_sb_created: tell the caller if we allocated a new superblock
> - * @ns: optional namespace tag of the mount
> - *
> - * This is to be called from each kernfs user's file_system_type->mount()
> - * implementation, which should pass through the specified @fs_type and
> - * @flags, and specify the hierarchy and namespace tag to mount via @root
> - * and @ns, respectively.
> + * kernfs_get_tree - kernfs filesystem access/retrieval helper
> + * @fc: The filesystem context.
> *
> - * The return value can be passed to the vfs layer verbatim.
> + * This is to be called from each kernfs user's fs_context->ops->get_tree()
> + * implementation, which should set the specified ->@fs_type and ->@flags, and
> + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
> + * respectively.
> */
> -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
> - struct kernfs_root *root, unsigned long magic,
> - bool *new_sb_created, const void *ns)
> +int kernfs_get_tree(struct fs_context *fc)
> {
> + struct kernfs_fs_context *kfc = fc->fs_private;
> struct super_block *sb;
> struct kernfs_super_info *info;
> int error;
>
> info = kzalloc(sizeof(*info), GFP_KERNEL);
> if (!info)
> - return ERR_PTR(-ENOMEM);
> + return -ENOMEM;
>
> - info->root = root;
> - info->ns = ns;
> + info->root = kfc->root;
> + info->ns = kfc->ns_tag;
> INIT_LIST_HEAD(&info->node);
>
> - sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
> - &init_user_ns, info);
> - if (IS_ERR(sb) || sb->s_fs_info != info)
> - kfree(info);
> + fc->s_fs_info = info;
> + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
> if (IS_ERR(sb))
> - return ERR_CAST(sb);
> -
> - if (new_sb_created)
> - *new_sb_created = !sb->s_root;
> + return PTR_ERR(sb);
>
> if (!sb->s_root) {
> struct kernfs_super_info *info = kernfs_info(sb);
>
> - error = kernfs_fill_super(sb, magic);
> + kfc->new_sb_created = true;
> +
> + error = kernfs_fill_super(sb, kfc);
> if (error) {
> deactivate_locked_super(sb);
> - return ERR_PTR(error);
> + return error;
> }
> sb->s_flags |= SB_ACTIVE;
>
> mutex_lock(&kernfs_mutex);
> - list_add(&info->node, &root->supers);
> + list_add(&info->node, &info->root->supers);
> mutex_unlock(&kernfs_mutex);
> }
>
> - return dget(sb->s_root);
> + fc->root = dget(sb->s_root);
> + return 0;
> +}
> +
> +void kernfs_free_fs_context(struct fs_context *fc)
> +{
> + /* Note that we don't deal with kfc->ns_tag here. */
> + kfree(fc->s_fs_info);
> + fc->s_fs_info = NULL;
> }
>
> /**
> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> index 77302c35b0ff..1e1c0ccc6a36 100644
> --- a/fs/sysfs/mount.c
> +++ b/fs/sysfs/mount.c
> @@ -13,6 +13,7 @@
> #include <linux/magic.h>
> #include <linux/mount.h>
> #include <linux/init.h>
> +#include <linux/slab.h>
> #include <linux/user_namespace.h>
>
> #include "sysfs.h"
> @@ -20,27 +21,55 @@
> static struct kernfs_root *sysfs_root;
> struct kernfs_node *sysfs_root_kn;
>
> -static struct dentry *sysfs_mount(struct file_system_type *fs_type,
> - int flags, const char *dev_name, void *data, size_t data_size)
> +static int sysfs_get_tree(struct fs_context *fc)
> {
> - struct dentry *root;
> - void *ns;
> - bool new_sb = false;
> + struct kernfs_fs_context *kfc = fc->fs_private;
> + int ret;
>
> - if (!(flags & SB_KERNMOUNT)) {
> + ret = kernfs_get_tree(fc);
> + if (ret)
> + return ret;
> +
> + if (kfc->new_sb_created)
> + fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
> + return 0;
> +}
> +
> +static void sysfs_fs_context_free(struct fs_context *fc)
> +{
> + struct kernfs_fs_context *kfc = fc->fs_private;
> +
> + if (kfc->ns_tag)
> + kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
> + kernfs_free_fs_context(fc);
> + kfree(kfc);
> +}
> +
> +static const struct fs_context_operations sysfs_fs_context_ops = {
> + .free = sysfs_fs_context_free,
> + .get_tree = sysfs_get_tree,
> +};
> +
> +static int sysfs_init_fs_context(struct fs_context *fc,
> + struct dentry *reference)
> +{
> + struct kernfs_fs_context *kfc;
> +
> + if (!(fc->sb_flags & SB_KERNMOUNT)) {
> if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
> - return ERR_PTR(-EPERM);
> + return -EPERM;
> }
>
> - ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> - root = kernfs_mount_ns(fs_type, flags, sysfs_root,
> - SYSFS_MAGIC, &new_sb, ns);
> - if (!new_sb)
> - kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
> - else if (!IS_ERR(root))
> - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
> + kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
> + if (!kfc)
> + return -ENOMEM;
>
> - return root;
> + kfc->ns_tag = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> + kfc->root = sysfs_root;
> + kfc->magic = SYSFS_MAGIC;
> + fc->fs_private = kfc;
> + fc->ops = &sysfs_fs_context_ops;
> + return 0;
> }
>
> static void sysfs_kill_sb(struct super_block *sb)
> @@ -52,10 +81,10 @@ static void sysfs_kill_sb(struct super_block *sb)
> }
>
> static struct file_system_type sysfs_fs_type = {
> - .name = "sysfs",
> - .mount = sysfs_mount,
> - .kill_sb = sysfs_kill_sb,
> - .fs_flags = FS_USERNS_MOUNT,
> + .name = "sysfs",
> + .init_fs_context = sysfs_init_fs_context,
> + .kill_sb = sysfs_kill_sb,
> + .fs_flags = FS_USERNS_MOUNT,
> };
>
> int __init sysfs_init(void)
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 32c553556bbd..13b6379648ec 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -859,10 +859,11 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
>
> #endif /* !CONFIG_CGROUPS */
>
> -static inline void get_cgroup_ns(struct cgroup_namespace *ns)
> +static inline struct cgroup_namespace *get_cgroup_ns(struct cgroup_namespace *ns)
> {
> if (ns)
> refcount_inc(&ns->count);
> + return ns;
> }
>
> static inline void put_cgroup_ns(struct cgroup_namespace *ns)
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index 0f6bb8e1bc83..051709212f55 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -17,6 +17,7 @@
> #include <linux/atomic.h>
> #include <linux/uidgid.h>
> #include <linux/wait.h>
> +#include <linux/fs_context.h>
>
> struct file;
> struct dentry;
> @@ -27,6 +28,7 @@ struct super_block;
> struct file_system_type;
> struct fs_context;
>
> +struct kernfs_fs_context;
> struct kernfs_open_node;
> struct kernfs_iattrs;
>
> @@ -168,7 +170,7 @@ struct kernfs_node {
> * kernfs_node parameter.
> */
> struct kernfs_syscall_ops {
> - int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
> + int (*reconfigure)(struct kernfs_root *root, struct fs_context *fc);
> int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
>
> int (*mkdir)(struct kernfs_node *parent, const char *name,
> @@ -269,6 +271,18 @@ struct kernfs_ops {
> #endif
> };
>
> +/*
> + * The kernfs superblock creation/mount parameter context.
> + */
> +struct kernfs_fs_context {
> + struct kernfs_root *root; /* Root of the hierarchy being mounted */
> + void *ns_tag; /* Namespace tag of the mount (or NULL) */
> + unsigned long magic; /* File system specific magic number */
> +
> + /* The following are set/used by kernfs_get_tree() */
> + bool new_sb_created; /* Set to T if we allocated a new sb */
> +};
> +
> #ifdef CONFIG_KERNFS
>
> static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
> @@ -354,9 +368,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
> void kernfs_notify(struct kernfs_node *kn);
>
> const void *kernfs_super_ns(struct super_block *sb);
> -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
> - struct kernfs_root *root, unsigned long magic,
> - bool *new_sb_created, const void *ns);
> +int kernfs_get_tree(struct fs_context *fc);
> +void kernfs_free_fs_context(struct fs_context *fc);
> void kernfs_kill_sb(struct super_block *sb);
> struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
> int kernfs_reconfigure(struct fs_context *fc);
> @@ -461,11 +474,10 @@ static inline void kernfs_notify(struct kernfs_node *kn) { }
> static inline const void *kernfs_super_ns(struct super_block *sb)
> { return NULL; }
>
> -static inline struct dentry *
> -kernfs_mount_ns(struct file_system_type *fs_type, int flags,
> - struct kernfs_root *root, unsigned long magic,
> - bool *new_sb_created, const void *ns)
> -{ return ERR_PTR(-ENOSYS); }
> +static inline int kernfs_get_tree(struct fs_context *fc)
> +{ return -ENOSYS; }
> +
> +static inline void kernfs_free_fs_context(struct fs_context *fc) { }
>
> static inline void kernfs_kill_sb(struct super_block *sb) { }
>
> @@ -547,13 +559,4 @@ static inline int kernfs_rename(struct kernfs_node *kn,
> return kernfs_rename_ns(kn, new_parent, new_name, NULL);
> }
>
> -static inline struct dentry *
> -kernfs_mount(struct file_system_type *fs_type, int flags,
> - struct kernfs_root *root, unsigned long magic,
> - bool *new_sb_created)
> -{
> - return kernfs_mount_ns(fs_type, flags, root,
> - magic, new_sb_created, NULL);
> -}
> -
> #endif /* __LINUX_KERNFS_H */
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 75568fcf2180..35012d2aca97 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -34,6 +34,33 @@ extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
> } \
> } while (0)
>
> +/*
> + * The cgroup filesystem superblock creation/mount context.
> + */
> +struct cgroup_fs_context {
> + struct kernfs_fs_context kfc;
> + struct cgroup_root *root;
> + struct cgroup_namespace *ns;
> + u8 version; /* cgroups version */
> + unsigned int flags; /* CGRP_ROOT_* flags */
> +
> + /* cgroup1 bits */
> + bool cpuset_clone_children;
> + bool none; /* User explicitly requested empty subsystem */
> + bool all_ss; /* Seen 'all' option */
> + bool one_ss; /* Seen a subsystem name option */
> + u16 subsys_mask; /* Selected subsystems */
> + char *name; /* Hierarchy name */
> + char *release_agent; /* Path for release notifications */
> +};
> +
> +static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
> +{
> + struct kernfs_fs_context *kfc = fc->fs_private;
> +
> + return container_of(kfc, struct cgroup_fs_context, kfc);
> +}
> +
> /*
> * A cgroup can be associated with multiple css_sets as different tasks may
> * belong to different cgroups on different hierarchies. In the other
> @@ -115,16 +142,6 @@ struct cgroup_mgctx {
> #define DEFINE_CGROUP_MGCTX(name) \
> struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
>
> -struct cgroup_sb_opts {
> - u16 subsys_mask;
> - unsigned int flags;
> - char *release_agent;
> - bool cpuset_clone_children;
> - char *name;
> - /* User explicitly requested empty subsystem */
> - bool none;
> -};
> -
> extern struct mutex cgroup_mutex;
> extern spinlock_t css_set_lock;
> extern struct cgroup_subsys *cgroup_subsys[];
> @@ -195,12 +212,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
> struct cgroup_namespace *ns);
>
> void cgroup_free_root(struct cgroup_root *root);
> -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
> +void init_cgroup_root(struct cgroup_fs_context *ctx);
> int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
> int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
> -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
> - struct cgroup_root *root, unsigned long magic,
> - struct cgroup_namespace *ns);
> +int cgroup_do_get_tree(struct fs_context *fc);
>
> int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
> void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
> @@ -244,14 +259,15 @@ extern const struct proc_ns_operations cgroupns_operations;
> */
> extern struct cftype cgroup1_base_files[];
> extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
> +extern const struct fs_parameter_description cgroup1_fs_parameters;
>
> int proc_cgroupstats_show(struct seq_file *m, void *v);
> bool cgroup1_ssid_disabled(int ssid);
> void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
> void cgroup1_release_agent(struct work_struct *work);
> void cgroup1_check_for_release(struct cgroup *cgrp);
> -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> - void *data, unsigned long magic,
> - struct cgroup_namespace *ns);
> +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
> +int cgroup1_validate(struct fs_context *fc);
> +int cgroup1_get_tree(struct fs_context *fc);
>
> #endif /* __CGROUP_INTERNAL_H */
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 51063e7a93c2..d8b325c3c2eb 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -13,9 +13,12 @@
> #include <linux/delayacct.h>
> #include <linux/pid_namespace.h>
> #include <linux/cgroupstats.h>
> +#include <linux/fs_parser.h>
>
> #include <trace/events/cgroup.h>
>
> +#define cg_invalf(fc, fmt, ...) ({ pr_err(fmt, ## __VA_ARGS__); -EINVAL; })
> +
> /*
> * pidlists linger the following amount before being destroyed. The goal
> * is avoiding frequent destruction in the middle of consecutive read calls
> @@ -903,92 +906,61 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
> return 0;
> }
>
> -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
> -{
> - char *token, *o = data;
> - bool all_ss = false, one_ss = false;
> - u16 mask = U16_MAX;
> - struct cgroup_subsys *ss;
> - int nr_opts = 0;
> - int i;
> -
> -#ifdef CONFIG_CPUSETS
> - mask = ~((u16)1 << cpuset_cgrp_id);
> -#endif
> +enum cgroup1_param {
> + Opt_all,
> + Opt_clone_children,
> + Opt_cpuset_v2_mode,
> + Opt_name,
> + Opt_none,
> + Opt_noprefix,
> + Opt_release_agent,
> + Opt_xattr,
> + nr__cgroup1_params
> +};
>
> - memset(opts, 0, sizeof(*opts));
> +static const struct fs_parameter_spec cgroup1_param_specs[nr__cgroup1_params] = {
> + [Opt_all] = { fs_param_is_flag },
> + [Opt_clone_children] = { fs_param_is_flag },
> + [Opt_cpuset_v2_mode] = { fs_param_is_flag },
> + [Opt_name] = { fs_param_is_string },
> + [Opt_none] = { fs_param_is_flag },
> + [Opt_noprefix] = { fs_param_is_flag },
> + [Opt_release_agent] = { fs_param_is_string },
> + [Opt_xattr] = { fs_param_is_flag },
> +};
>
> - while ((token = strsep(&o, ",")) != NULL) {
> - nr_opts++;
> +static const char *const cgroup1_param_keys[nr__cgroup1_params] = {
> + [Opt_all] = "all",
> + [Opt_clone_children] = "clone_children",
> + [Opt_cpuset_v2_mode] = "cpuset_v2_mode",
> + [Opt_name] = "name",
> + [Opt_none] = "none",
> + [Opt_noprefix] = "noprefix",
> + [Opt_release_agent] = "release_agent",
> + [Opt_xattr] = "xattr",
> +};
>
> - if (!*token)
> - return -EINVAL;
> - if (!strcmp(token, "none")) {
> - /* Explicitly have no subsystems */
> - opts->none = true;
> - continue;
> - }
> - if (!strcmp(token, "all")) {
> - /* Mutually exclusive option 'all' + subsystem name */
> - if (one_ss)
> - return -EINVAL;
> - all_ss = true;
> - continue;
> - }
> - if (!strcmp(token, "noprefix")) {
> - opts->flags |= CGRP_ROOT_NOPREFIX;
> - continue;
> - }
> - if (!strcmp(token, "clone_children")) {
> - opts->cpuset_clone_children = true;
> - continue;
> - }
> - if (!strcmp(token, "cpuset_v2_mode")) {
> - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
> - continue;
> - }
> - if (!strcmp(token, "xattr")) {
> - opts->flags |= CGRP_ROOT_XATTR;
> - continue;
> - }
> - if (!strncmp(token, "release_agent=", 14)) {
> - /* Specifying two release agents is forbidden */
> - if (opts->release_agent)
> - return -EINVAL;
> - opts->release_agent =
> - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
> - if (!opts->release_agent)
> - return -ENOMEM;
> - continue;
> - }
> - if (!strncmp(token, "name=", 5)) {
> - const char *name = token + 5;
> - /* Can't specify an empty name */
> - if (!strlen(name))
> - return -EINVAL;
> - /* Must match [\w.-]+ */
> - for (i = 0; i < strlen(name); i++) {
> - char c = name[i];
> - if (isalnum(c))
> - continue;
> - if ((c == '.') || (c == '-') || (c == '_'))
> - continue;
> - return -EINVAL;
> - }
> - /* Specifying two names is forbidden */
> - if (opts->name)
> - return -EINVAL;
> - opts->name = kstrndup(name,
> - MAX_CGROUP_ROOT_NAMELEN - 1,
> - GFP_KERNEL);
> - if (!opts->name)
> - return -ENOMEM;
> +const struct fs_parameter_description cgroup1_fs_parameters = {
> + .name = "cgroup1",
> + .nr_params = nr__cgroup1_params,
> + .keys = cgroup1_param_keys,
> + .specs = cgroup1_param_specs,
> + .no_source = true,
> +};
>
> - continue;
> - }
> +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> + struct cgroup_subsys *ss;
> + struct fs_parse_result result;
> + int opt, i;
>
> + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
> + if (opt == -ENOPARAM) {
> + if (strcmp(param->key, "source") == 0)
> + return 0;
> for_each_subsys(ss, i) {
> - if (strcmp(token, ss->legacy_name))
> + if (strcmp(param->key, ss->legacy_name) != 0)
> continue;
> if (!cgroup_ssid_enabled(i))
> continue;
> @@ -996,75 +968,144 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
> continue;
>
> /* Mutually exclusive option 'all' + subsystem name */
> - if (all_ss)
> - return -EINVAL;
> - opts->subsys_mask |= (1 << i);
> - one_ss = true;
> + if (ctx->all_ss)
> + return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
> + ctx->subsys_mask |= (1 << i);
> + ctx->one_ss = true;
> + return 0;
> + }
>
> - break;
> + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
> + }
> + if (opt < 0)
> + return opt;
> +
> + switch (opt) {
> + case Opt_none:
> + /* Explicitly have no subsystems */
> + ctx->none = true;
> + return 0;
> + case Opt_all:
> + /* Mutually exclusive option 'all' + subsystem name */
> + if (ctx->one_ss)
> + return cg_invalf(fc, "cgroup1: all conflicts with subsys name");
> + ctx->all_ss = true;
> + return 0;
> + case Opt_noprefix:
> + ctx->flags |= CGRP_ROOT_NOPREFIX;
> + return 0;
> + case Opt_clone_children:
> + ctx->cpuset_clone_children = true;
> + return 0;
> + case Opt_cpuset_v2_mode:
> + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
> + return 0;
> + case Opt_xattr:
> + ctx->flags |= CGRP_ROOT_XATTR;
> + return 0;
> + case Opt_release_agent:
> + /* Specifying two release agents is forbidden */
> + if (ctx->release_agent)
> + return cg_invalf(fc, "cgroup1: release_agent respecified");
> + ctx->release_agent = param->string;
> + param->string = NULL;
> + if (!ctx->release_agent)
> + return -ENOMEM;
> + return 0;
> +
> + case Opt_name:
> + /* Can't specify an empty name */
> + if (!param->size)
> + return cg_invalf(fc, "cgroup1: Empty name");
> + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
> + return cg_invalf(fc, "cgroup1: Name too long");
> + /* Must match [\w.-]+ */
> + for (i = 0; i < param->size; i++) {
> + char c = param->string[i];
> + if (isalnum(c))
> + continue;
> + if ((c == '.') || (c == '-') || (c == '_'))
> + continue;
> + return cg_invalf(fc, "cgroup1: Invalid name");
> }
> - if (i == CGROUP_SUBSYS_COUNT)
> - return -ENOENT;
> + /* Specifying two names is forbidden */
> + if (ctx->name)
> + return cg_invalf(fc, "cgroup1: name respecified");
> + ctx->name = param->string;
> + param->string = NULL;
> + return 0;
> }
>
> + return 0;
> +}
> +
> +/*
> + * Validate the options that have been parsed.
> + */
> +int cgroup1_validate(struct fs_context *fc)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> + struct cgroup_subsys *ss;
> + u16 mask = U16_MAX;
> + int i;
> +
> +#ifdef CONFIG_CPUSETS
> + mask = ~((u16)1 << cpuset_cgrp_id);
> +#endif
> +
> /*
> * If the 'all' option was specified select all the subsystems,
> * otherwise if 'none', 'name=' and a subsystem name options were
> * not specified, let's default to 'all'
> */
> - if (all_ss || (!one_ss && !opts->none && !opts->name))
> + if (ctx->all_ss || (!ctx->one_ss && !ctx->none && !ctx->name))
> for_each_subsys(ss, i)
> if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
> - opts->subsys_mask |= (1 << i);
> + ctx->subsys_mask |= (1 << i);
>
> /*
> * We either have to specify by name or by subsystems. (So all
> * empty hierarchies must have a name).
> */
> - if (!opts->subsys_mask && !opts->name)
> - return -EINVAL;
> + if (!ctx->subsys_mask && !ctx->name)
> + return cg_invalf(fc, "cgroup1: Need name or subsystem set");
>
> /*
> * Option noprefix was introduced just for backward compatibility
> * with the old cpuset, so we allow noprefix only if mounting just
> * the cpuset subsystem.
> */
> - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
> - return -EINVAL;
> + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
> + return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
>
> /* Can't specify "none" and some subsystems */
> - if (opts->subsys_mask && opts->none)
> - return -EINVAL;
> + if (ctx->subsys_mask && ctx->none)
> + return cg_invalf(fc, "cgroup1: none used incorrectly");
>
> return 0;
> }
>
> -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
> +static int cgroup1_reconfigure(struct kernfs_root *kf_root, struct fs_context *fc)
> {
> - int ret = 0;
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> - struct cgroup_sb_opts opts;
> u16 added_mask, removed_mask;
> + int ret = 0;
>
> cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
>
> - /* See what subsystems are wanted */
> - ret = parse_cgroupfs_options(data, &opts);
> - if (ret)
> - goto out_unlock;
> -
> - if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
> + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
> pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
> task_tgid_nr(current), current->comm);
>
> - added_mask = opts.subsys_mask & ~root->subsys_mask;
> - removed_mask = root->subsys_mask & ~opts.subsys_mask;
> + added_mask = ctx->subsys_mask & ~root->subsys_mask;
> + removed_mask = root->subsys_mask & ~ctx->subsys_mask;
>
> /* Don't allow flags or name to change at remount */
> - if ((opts.flags ^ root->flags) ||
> - (opts.name && strcmp(opts.name, root->name))) {
> - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
> - opts.flags, opts.name ?: "", root->flags, root->name);
> + if ((ctx->flags ^ root->flags) ||
> + (ctx->name && strcmp(ctx->name, root->name))) {
> + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
> + ctx->flags, ctx->name ?: "", root->flags, root->name);
> ret = -EINVAL;
> goto out_unlock;
> }
> @@ -1081,17 +1122,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
>
> WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
>
> - if (opts.release_agent) {
> + if (ctx->release_agent) {
> spin_lock(&release_agent_path_lock);
> - strcpy(root->release_agent_path, opts.release_agent);
> + strcpy(root->release_agent_path, ctx->release_agent);
> spin_unlock(&release_agent_path_lock);
> }
>
> trace_cgroup_remount(root);
>
> out_unlock:
> - kfree(opts.release_agent);
> - kfree(opts.name);
> mutex_unlock(&cgroup_mutex);
> return ret;
> }
> @@ -1099,31 +1138,26 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
> struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
> .rename = cgroup1_rename,
> .show_options = cgroup1_show_options,
> - .remount_fs = cgroup1_remount,
> + .reconfigure = cgroup1_reconfigure,
> .mkdir = cgroup_mkdir,
> .rmdir = cgroup_rmdir,
> .show_path = cgroup_show_path,
> };
>
> -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> - void *data, unsigned long magic,
> - struct cgroup_namespace *ns)
> +/*
> + * Find or create a v1 cgroups superblock.
> + */
> +int cgroup1_get_tree(struct fs_context *fc)
> {
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> struct super_block *pinned_sb = NULL;
> - struct cgroup_sb_opts opts;
> struct cgroup_root *root;
> struct cgroup_subsys *ss;
> - struct dentry *dentry;
> int i, ret;
> bool new_root = false;
>
> cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
>
> - /* First find the desired set of subsystems */
> - ret = parse_cgroupfs_options(data, &opts);
> - if (ret)
> - goto out_unlock;
> -
> /*
> * Destruction of cgroup root is asynchronous, so subsystems may
> * still be dying after the previous unmount. Let's drain the
> @@ -1132,15 +1166,13 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> * starting. Testing ref liveliness is good enough.
> */
> for_each_subsys(ss, i) {
> - if (!(opts.subsys_mask & (1 << i)) ||
> + if (!(ctx->subsys_mask & (1 << i)) ||
> ss->root == &cgrp_dfl_root)
> continue;
>
> if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
> mutex_unlock(&cgroup_mutex);
> - msleep(10);
> - ret = restart_syscall();
> - goto out_free;
> + goto err_restart;
> }
> cgroup_put(&ss->root->cgrp);
> }
> @@ -1156,8 +1188,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> * name matches but sybsys_mask doesn't, we should fail.
> * Remember whether name matched.
> */
> - if (opts.name) {
> - if (strcmp(opts.name, root->name))
> + if (ctx->name) {
> + if (strcmp(ctx->name, root->name))
> continue;
> name_match = true;
> }
> @@ -1166,15 +1198,15 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> * If we asked for subsystems (or explicitly for no
> * subsystems) then they must match.
> */
> - if ((opts.subsys_mask || opts.none) &&
> - (opts.subsys_mask != root->subsys_mask)) {
> + if ((ctx->subsys_mask || ctx->none) &&
> + (ctx->subsys_mask != root->subsys_mask)) {
> if (!name_match)
> continue;
> ret = -EBUSY;
> - goto out_unlock;
> + goto err_unlock;
> }
>
> - if (root->flags ^ opts.flags)
> + if (root->flags ^ ctx->flags)
> pr_warn("new mount options do not match the existing superblock, will be ignored\n");
>
> /*
> @@ -1195,11 +1227,10 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> mutex_unlock(&cgroup_mutex);
> if (!IS_ERR_OR_NULL(pinned_sb))
> deactivate_super(pinned_sb);
> - msleep(10);
> - ret = restart_syscall();
> - goto out_free;
> + goto err_restart;
> }
>
> + ctx->root = root;
> ret = 0;
> goto out_unlock;
> }
> @@ -1209,41 +1240,35 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> * specification is allowed for already existing hierarchies but we
> * can't create new one without subsys specification.
> */
> - if (!opts.subsys_mask && !opts.none) {
> - ret = -EINVAL;
> - goto out_unlock;
> + if (!ctx->subsys_mask && !ctx->none) {
> + ret = cg_invalf(fc, "cgroup1: No subsys list or none specified");
> + goto err_unlock;
> }
>
> /* Hierarchies may only be created in the initial cgroup namespace. */
> - if (ns != &init_cgroup_ns) {
> + if (ctx->ns != &init_cgroup_ns) {
> ret = -EPERM;
> - goto out_unlock;
> + goto err_unlock;
> }
>
> root = kzalloc(sizeof(*root), GFP_KERNEL);
> if (!root) {
> ret = -ENOMEM;
> - goto out_unlock;
> + goto err_unlock;
> }
> new_root = true;
> + ctx->root = root;
>
> - init_cgroup_root(root, &opts);
> + init_cgroup_root(ctx);
>
> - ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
> + ret = cgroup_setup_root(root, ctx->subsys_mask, PERCPU_REF_INIT_DEAD);
> if (ret)
> - cgroup_free_root(root);
> + goto err_unlock;
>
> out_unlock:
> mutex_unlock(&cgroup_mutex);
> -out_free:
> - kfree(opts.release_agent);
> - kfree(opts.name);
> -
> - if (ret)
> - return ERR_PTR(ret);
>
> - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
> - CGROUP_SUPER_MAGIC, ns);
> + ret = cgroup_do_get_tree(fc);
>
> /*
> * There's a race window after we release cgroup_mutex and before
> @@ -1256,6 +1281,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> percpu_ref_reinit(&root->cgrp.self.refcnt);
> mutex_unlock(&cgroup_mutex);
> }
> + cgroup_get(&root->cgrp);
>
> /*
> * If @pinned_sb, we're reusing an existing root and holding an
> @@ -1264,7 +1290,14 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
> if (pinned_sb)
> deactivate_super(pinned_sb);
>
> - return dentry;
> + return ret;
> +
> +err_restart:
> + msleep(10);
> + return restart_syscall();
> +err_unlock:
> + mutex_unlock(&cgroup_mutex);
> + return ret;
> }
>
> static int __init cgroup1_wq_init(void)
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 48dbf249bec5..3c3c40cad257 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -54,6 +54,7 @@
> #include <linux/proc_ns.h>
> #include <linux/nsproxy.h>
> #include <linux/file.h>
> +#include <linux/fs_parser.h>
> #include <linux/sched/cputime.h>
> #include <net/sock.h>
>
> @@ -1737,25 +1738,51 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
> return len;
> }
>
> -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
> -{
> - char *token;
> +enum cgroup2_param {
> + Opt_nsdelegate,
> + nr__cgroup2_params
> +};
>
> - *root_flags = 0;
> +static const struct fs_parameter_spec cgroup2_param_specs[nr__cgroup2_params] = {
> + [Opt_nsdelegate] = { fs_param_is_flag },
> +};
>
> - if (!data)
> - return 0;
> +static const char *const cgroup2_param_keys[nr__cgroup2_params] = {
> + [Opt_nsdelegate] = "nsdelegate",
> +};
>
> - while ((token = strsep(&data, ",")) != NULL) {
> - if (!strcmp(token, "nsdelegate")) {
> - *root_flags |= CGRP_ROOT_NS_DELEGATE;
> - continue;
> - }
> +static const struct fs_parameter_description cgroup2_fs_parameters = {
> + .name = "cgroup2",
> + .nr_params = nr__cgroup2_params,
> + .keys = cgroup2_param_keys,
> + .specs = cgroup2_param_specs,
> + .no_source = true,
> +};
>
> - pr_err("cgroup2: unknown option \"%s\"\n", token);
> - return -EINVAL;
> +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> + struct fs_parse_result result;
> + int opt;
> +
> + opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
> + if (opt < 0)
> + return opt;
> +
> + switch (opt) {
> + case Opt_nsdelegate:
> + ctx->flags |= CGRP_ROOT_NS_DELEGATE;
> + return 0;
> }
>
> + return -EINVAL;
> +}
> +
> +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
> +{
> + if (current->nsproxy->cgroup_ns == &init_cgroup_ns &&
> + cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
> + seq_puts(seq, ",nsdelegate");
> return 0;
> }
>
> @@ -1769,23 +1796,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
> }
> }
>
> -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
> -{
> - if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
> - seq_puts(seq, ",nsdelegate");
> - return 0;
> -}
> -
> -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
> +static int cgroup_reconfigure(struct kernfs_root *kf_root, struct fs_context *fc)
> {
> - unsigned int root_flags;
> - int ret;
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
>
> - ret = parse_cgroup_root_flags(data, &root_flags);
> - if (ret)
> - return ret;
> -
> - apply_cgroup_root_flags(root_flags);
> + apply_cgroup_root_flags(ctx->flags);
> return 0;
> }
>
> @@ -1873,8 +1888,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
> }
>
> -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> +void init_cgroup_root(struct cgroup_fs_context *ctx)
> {
> + struct cgroup_root *root = ctx->root;
> struct cgroup *cgrp = &root->cgrp;
>
> INIT_LIST_HEAD(&root->root_list);
> @@ -1883,12 +1899,12 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> init_cgroup_housekeeping(cgrp);
> idr_init(&root->cgroup_idr);
>
> - root->flags = opts->flags;
> - if (opts->release_agent)
> - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
> - if (opts->name)
> - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
> - if (opts->cpuset_clone_children)
> + root->flags = ctx->flags;
> + if (ctx->release_agent)
> + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
> + if (ctx->name)
> + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
> + if (ctx->cpuset_clone_children)
> set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
> }
>
> @@ -1993,57 +2009,53 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
> return ret;
> }
>
> -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
> - struct cgroup_root *root, unsigned long magic,
> - struct cgroup_namespace *ns)
> +int cgroup_do_get_tree(struct fs_context *fc)
> {
> - struct dentry *dentry;
> - bool new_sb;
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> + int ret;
>
> - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
> + ctx->kfc.root = ctx->root->kf_root;
> +
> + ret = kernfs_get_tree(fc);
> + if (ret < 0)
> + goto out_cgrp;
>
> /*
> * In non-init cgroup namespace, instead of root cgroup's dentry,
> * we return the dentry corresponding to the cgroupns->root_cgrp.
> */
> - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
> + if (ctx->ns != &init_cgroup_ns) {
> struct dentry *nsdentry;
> struct cgroup *cgrp;
>
> mutex_lock(&cgroup_mutex);
> spin_lock_irq(&css_set_lock);
>
> - cgrp = cset_cgroup_from_root(ns->root_cset, root);
> + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
>
> spin_unlock_irq(&css_set_lock);
> mutex_unlock(&cgroup_mutex);
>
> - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
> - dput(dentry);
> - dentry = nsdentry;
> + nsdentry = kernfs_node_dentry(cgrp->kn, fc->root->d_sb);
> + if (IS_ERR(nsdentry))
> + return PTR_ERR(nsdentry);
> + dput(fc->root);
> + fc->root = nsdentry;
> }
>
> - if (IS_ERR(dentry) || !new_sb)
> - cgroup_put(&root->cgrp);

I don't see where this cgroup_put() has been moved.

With this patch, the following script works only once; on the second attempt
it hangs while mounting a cgroup file system.

This is the only suspicious place that I have found in this patch.

[root@fc24 ~]# cat fs-vs-cg
d=$(mktemp -d /tmp/cg.XXXXXX)
mkdir $d/a
mkdir $d/b
mount -t cgroup -o none,name=xxxx xxx $d/a
mount -t cgroup -o none,name=xxxx xxx $d/b
umount $d/a
umount $d/b

[root@fc24 ~]# unshare -m --propagation private bash -x fs-vs-cg
++ mktemp -d /tmp/cg.XXXXXX
+ d=/tmp/cg.yUfagS
+ mkdir /tmp/cg.yUfagS/a
+ mkdir /tmp/cg.yUfagS/b
+ mount -t cgroup -o none,name=xxxx xxx /tmp/cg.yUfagS/a
+ mount -t cgroup -o none,name=xxxx xxx /tmp/cg.yUfagS/b
+ umount /tmp/cg.yUfagS/a
+ umount /tmp/cg.yUfagS/b
[root@fc24 ~]# unshare -m --propagation private bash -x fs-vs-cg
++ mktemp -d /tmp/cg.XXXXXX
+ d=/tmp/cg.ippWUn
+ mkdir /tmp/cg.ippWUn/a
+ mkdir /tmp/cg.ippWUn/b
+ mount -t cgroup -o none,name=xxxx xxx /tmp/cg.ippWUn/a
^Z
[1]+ Stopped unshare -m --propagation private bash -x fs-vs-cg

[root@fc24 ~]# ps
PID TTY TIME CMD
556 pts/0 00:00:00 bash
591 pts/0 00:00:00 bash
595 pts/0 00:00:00 mount
596 pts/0 00:00:00 ps

[root@fc24 ~]# bg
[1]+ unshare -m --propagation private bash -x fs-vs-cg &

[root@fc24 ~]# cat /proc/595/stack
[<0>] msleep+0x38/0x40
[<0>] cgroup1_get_tree+0x4e1/0x72c
[<0>] vfs_get_tree+0x5e/0x140
[<0>] do_mount+0x326/0xc70
[<0>] ksys_mount+0xba/0xd0
[<0>] __x64_sys_mount+0x21/0x30
[<0>] do_syscall_64+0x60/0x210
[<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[<0>] 0xffffffffffffffff

> + ret = 0;
> + if (ctx->kfc.new_sb_created)
> + goto out_cgrp;
> + apply_cgroup_root_flags(ctx->flags);
> + return 0;
>
> - return dentry;
> +out_cgrp:
> + return ret;
> }
>
> -static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> - int flags, const char *unused_dev_name,
> - void *data, size_t data_size)
> +static int cgroup_get_tree(struct fs_context *fc)
> {
> - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
> - struct dentry *dentry;
> - int ret;
> -
> - get_cgroup_ns(ns);
> -
> - /* Check if the caller has permission to mount. */
> - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
> - put_cgroup_ns(ns);
> - return ERR_PTR(-EPERM);
> - }
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
>
> /*
> * The first time anyone tries to mount a cgroup, enable the list
> @@ -2052,29 +2064,96 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> if (!use_task_css_set_links)
> cgroup_enable_task_cg_lists();
>
> - if (fs_type == &cgroup2_fs_type) {
> - unsigned int root_flags;
> -
> - ret = parse_cgroup_root_flags(data, &root_flags);
> - if (ret) {
> - put_cgroup_ns(ns);
> - return ERR_PTR(ret);
> - }
> + switch (ctx->version) {
> + case 1:
> + return cgroup1_get_tree(fc);
>
> + case 2:
> cgrp_dfl_visible = true;
> cgroup_get_live(&cgrp_dfl_root.cgrp);
>
> - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
> - CGROUP2_SUPER_MAGIC, ns);
> - if (!IS_ERR(dentry))
> - apply_cgroup_root_flags(root_flags);
> - } else {
> - dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
> - CGROUP_SUPER_MAGIC, ns);
> + ctx->root = &cgrp_dfl_root;
> + return cgroup_do_get_tree(fc);
> +
> + default:
> + BUG();
> + }
> +}
> +
> +static int cgroup_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> +
> + if (ctx->version == 1)
> + return cgroup1_parse_param(fc, param);
> +
> + return cgroup2_parse_param(fc, param);
> +}
> +
> +static int cgroup_validate(struct fs_context *fc)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> +
> + if (ctx->version == 1)
> + return cgroup1_validate(fc);
> + return 0;
> +}
> +
> +/*
> + * Destroy a cgroup filesystem context.
> + */
> +static void cgroup_fs_context_free(struct fs_context *fc)
> +{
> + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> +
> + kfree(ctx->name);
> + kfree(ctx->release_agent);
> + if (ctx->root)
> + cgroup_put(&ctx->root->cgrp);
> + put_cgroup_ns(ctx->ns);
> + kernfs_free_fs_context(fc);
> + kfree(ctx);
> +}
> +
> +static const struct fs_context_operations cgroup_fs_context_ops = {
> + .free = cgroup_fs_context_free,
> + .parse_param = cgroup_parse_param,
> + .validate = cgroup_validate,
> + .get_tree = cgroup_get_tree,
> + .reconfigure = kernfs_reconfigure,
> +};
> +
> +/*
> + * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
> + * we select the namespace we're going to use.
> + */
> +static int cgroup_init_fs_context(struct fs_context *fc, struct dentry *reference)
> +{
> + struct cgroup_fs_context *ctx;
> + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
> +
> + switch (fc->purpose) {
> + case FS_CONTEXT_FOR_UMOUNT:
> + case FS_CONTEXT_FOR_EMERGENCY_RO:
> + return -EOPNOTSUPP;
> + default:
> + break;
> }
>
> - put_cgroup_ns(ns);
> - return dentry;
> + /* Check if the caller has permission to mount. */
> + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
> + if (!ctx)
> + return -ENOMEM;
> +
> + ctx->ns = get_cgroup_ns(ns);
> + ctx->version = (fc->fs_type == &cgroup2_fs_type) ? 2 : 1;
> + ctx->kfc.magic = (ctx->version == 2) ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC;
> + fc->fs_private = &ctx->kfc;
> + fc->ops = &cgroup_fs_context_ops;
> + return 0;
> }
>
> static void cgroup_kill_sb(struct super_block *sb)
> @@ -2099,17 +2178,19 @@ static void cgroup_kill_sb(struct super_block *sb)
> }
>
> struct file_system_type cgroup_fs_type = {
> - .name = "cgroup",
> - .mount = cgroup_mount,
> - .kill_sb = cgroup_kill_sb,
> - .fs_flags = FS_USERNS_MOUNT,
> + .name = "cgroup",
> + .init_fs_context = cgroup_init_fs_context,
> + .parameters = &cgroup1_fs_parameters,
> + .kill_sb = cgroup_kill_sb,
> + .fs_flags = FS_USERNS_MOUNT,
> };
>
> static struct file_system_type cgroup2_fs_type = {
> - .name = "cgroup2",
> - .mount = cgroup_mount,
> - .kill_sb = cgroup_kill_sb,
> - .fs_flags = FS_USERNS_MOUNT,
> + .name = "cgroup2",
> + .init_fs_context = cgroup_init_fs_context,
> + .parameters = &cgroup2_fs_parameters,
> + .kill_sb = cgroup_kill_sb,
> + .fs_flags = FS_USERNS_MOUNT,
> };
>
> int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
> @@ -5179,7 +5260,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
>
> static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
> .show_options = cgroup_show_options,
> - .remount_fs = cgroup_remount,
> + .reconfigure = cgroup_reconfigure,
> .mkdir = cgroup_mkdir,
> .rmdir = cgroup_rmdir,
> .show_path = cgroup_show_path,
> @@ -5246,11 +5327,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
> */
> int __init cgroup_init_early(void)
> {
> - static struct cgroup_sb_opts __initdata opts;
> + static struct cgroup_fs_context __initdata ctx;
> struct cgroup_subsys *ss;
> int i;
>
> - init_cgroup_root(&cgrp_dfl_root, &opts);
> + ctx.root = &cgrp_dfl_root;
> + init_cgroup_root(&ctx);
> cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
>
> RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index df78e166028c..b4ad1a52f006 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -324,10 +324,8 @@ static int cpuset_get_tree(struct fs_context *fc)
> int ret = -ENODEV;
>
> cgroup_fs = get_fs_type("cgroup");
> - if (cgroup_fs) {
> - ret = PTR_ERR(cgroup_fs);
> + if (!cgroup_fs)
> goto out;
> - }
>
> cg_fc = vfs_new_fs_context(cgroup_fs, NULL, fc->sb_flags, fc->sb_flags,
> fc->purpose);