Re: [PATCHv1 5/8] cgroup: introduce cgroup namespaces

From: Serge E. Hallyn
Date: Thu Oct 16 2014 - 12:37:18 EST


Quoting Aditya Kali (adityakali@xxxxxxxxxx):
> Introduce the ability to create new cgroup namespace. The newly created
> cgroup namespace remembers the 'struct cgroup *root_cgrp' at the point
> of creation of the cgroup namespace. The task that creates the new
> cgroup namespace and all its future children will now be restricted only
> to the cgroup hierarchy under this root_cgrp.
> The main purpose of cgroup namespace is to virtualize the contents
> of /proc/self/cgroup file. Processes inside a cgroup namespace
> are only able to see paths relative to their namespace root.
> This allows container-tools (like libcontainer, lxc, lmctfy, etc.)
> to create completely virtualized containers without leaking system
> level cgroup hierarchy to the task.
> This patch only implements the 'unshare' part of the cgroupns.
>
> Signed-off-by: Aditya Kali <adityakali@xxxxxxxxxx>

I'm not sure that the CONFIG_CGROUP_NS is worthwhile. If you already
have cgroups in the kernel this won't add much in the way of memory
usage, right? And I think the 'experimental' argument has long since
been squashed. So I'd argue for simplifying this patch by removing
CONFIG_CGROUP_NS.

(more below)

> ---
> fs/proc/namespaces.c | 3 +
> include/linux/cgroup.h | 18 +++++-
> include/linux/cgroup_namespace.h | 62 +++++++++++++++++++
> include/linux/nsproxy.h | 2 +
> include/linux/proc_ns.h | 4 ++
> init/Kconfig | 9 +++
> kernel/Makefile | 1 +
> kernel/cgroup.c | 11 ++++
> kernel/cgroup_namespace.c | 128 +++++++++++++++++++++++++++++++++++++++
> kernel/fork.c | 2 +-
> kernel/nsproxy.c | 19 +++++-
> 11 files changed, 255 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index 8902609..e04ed4b 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -32,6 +32,9 @@ static const struct proc_ns_operations *ns_entries[] = {
> &userns_operations,
> #endif
> &mntns_operations,
> +#ifdef CONFIG_CGROUP_NS
> + &cgroupns_operations,
> +#endif
> };
>
> static const struct file_operations ns_file_operations = {
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 4a0eb2d..aa86495 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -22,6 +22,8 @@
> #include <linux/seq_file.h>
> #include <linux/kernfs.h>
> #include <linux/wait.h>
> +#include <linux/nsproxy.h>
> +#include <linux/types.h>
>
> #ifdef CONFIG_CGROUPS
>
> @@ -460,6 +462,13 @@ struct cftype {
> #endif
> };
>
> +struct cgroup_namespace {
> + atomic_t count;
> + unsigned int proc_inum;
> + struct user_namespace *user_ns;
> + struct cgroup *root_cgrp;
> +};
> +
> extern struct cgroup_root cgrp_dfl_root;
> extern struct css_set init_css_set;
>
> @@ -584,10 +593,17 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
> return kernfs_name(cgrp->kn, buf, buflen);
> }
>
> +static inline char * __must_check cgroup_path_ns(struct cgroup_namespace *ns,
> + struct cgroup *cgrp, char *buf,
> + size_t buflen)
> +{
> + return kernfs_path_from_node(ns->root_cgrp->kn, cgrp->kn, buf, buflen);
> +}
> +
> static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
> size_t buflen)
> {
> - return kernfs_path(cgrp->kn, buf, buflen);
> + return cgroup_path_ns(current->nsproxy->cgroup_ns, cgrp, buf, buflen);
> }
>
> static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
> diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
> new file mode 100644
> index 0000000..9f637fe
> --- /dev/null
> +++ b/include/linux/cgroup_namespace.h
> @@ -0,0 +1,62 @@
> +#ifndef _LINUX_CGROUP_NAMESPACE_H
> +#define _LINUX_CGROUP_NAMESPACE_H
> +
> +#include <linux/nsproxy.h>
> +#include <linux/cgroup.h>
> +#include <linux/types.h>
> +#include <linux/user_namespace.h>
> +
> +extern struct cgroup_namespace init_cgroup_ns;
> +
> +static inline struct cgroup *task_cgroupns_root(struct task_struct *tsk)
> +{
> + return tsk->nsproxy->cgroup_ns->root_cgrp;

Per the rules in nsproxy.h, you should be taking the task_lock here.

(If you are making assumptions about tsk then you need to state them
here - I only looked quickly enough that you pass in 'leader')

> +}
> +
> +#ifdef CONFIG_CGROUP_NS
> +
> +extern void free_cgroup_ns(struct cgroup_namespace *ns);
> +
> +static inline struct cgroup_namespace *get_cgroup_ns(
> + struct cgroup_namespace *ns)
> +{
> + if (ns)
> + atomic_inc(&ns->count);
> + return ns;
> +}
> +
> +static inline void put_cgroup_ns(struct cgroup_namespace *ns)
> +{
> + if (ns && atomic_dec_and_test(&ns->count))
> + free_cgroup_ns(ns);
> +}
> +
> +extern struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
> + struct user_namespace *user_ns,
> + struct cgroup_namespace *old_ns);
> +
> +#else /* CONFIG_CGROUP_NS */
> +
> +static inline struct cgroup_namespace *get_cgroup_ns(
> + struct cgroup_namespace *ns)
> +{
> + return &init_cgroup_ns;
> +}
> +
> +static inline void put_cgroup_ns(struct cgroup_namespace *ns)
> +{
> +}
> +
> +static inline struct cgroup_namespace *copy_cgroup_ns(
> + unsigned long flags,
> + struct user_namespace *user_ns,
> + struct cgroup_namespace *old_ns) {
> + if (flags & CLONE_NEWCGROUP)
> + return ERR_PTR(-EINVAL);
> +
> + return old_ns;
> +}
> +
> +#endif /* CONFIG_CGROUP_NS */
> +
> +#endif /* _LINUX_CGROUP_NAMESPACE_H */
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 35fa08f..ac0d65b 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -8,6 +8,7 @@ struct mnt_namespace;
> struct uts_namespace;
> struct ipc_namespace;
> struct pid_namespace;
> +struct cgroup_namespace;
> struct fs_struct;
>
> /*
> @@ -33,6 +34,7 @@ struct nsproxy {
> struct mnt_namespace *mnt_ns;
> struct pid_namespace *pid_ns_for_children;
> struct net *net_ns;
> + struct cgroup_namespace *cgroup_ns;
> };
> extern struct nsproxy init_nsproxy;
>
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index 34a1e10..e56dd73 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -6,6 +6,8 @@
>
> struct pid_namespace;
> struct nsproxy;
> +struct task_struct;
> +struct inode;
>
> struct proc_ns_operations {
> const char *name;
> @@ -27,6 +29,7 @@ extern const struct proc_ns_operations ipcns_operations;
> extern const struct proc_ns_operations pidns_operations;
> extern const struct proc_ns_operations userns_operations;
> extern const struct proc_ns_operations mntns_operations;
> +extern const struct proc_ns_operations cgroupns_operations;
>
> /*
> * We always define these enumerators
> @@ -37,6 +40,7 @@ enum {
> PROC_UTS_INIT_INO = 0xEFFFFFFEU,
> PROC_USER_INIT_INO = 0xEFFFFFFDU,
> PROC_PID_INIT_INO = 0xEFFFFFFCU,
> + PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
> };
>
> #ifdef CONFIG_PROC_FS
> diff --git a/init/Kconfig b/init/Kconfig
> index e84c642..c3be001 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1144,6 +1144,15 @@ config DEBUG_BLK_CGROUP
> Enable some debugging help. Currently it exports additional stat
> files in a cgroup which can be useful for debugging.
>
> +config CGROUP_NS
> + bool "CGroup Namespaces"
> + default n
> + help
> + This options enables CGroup Namespaces which can be used to isolate
> + cgroup paths. This feature is only useful when unified cgroup
> + hierarchy is in use (i.e. cgroups are mounted with sane_behavior
> + option).
> +
> endif # CGROUPS
>
> config CHECKPOINT_RESTORE
> diff --git a/kernel/Makefile b/kernel/Makefile
> index dc5c775..75334f8 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
> obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
> obj-$(CONFIG_COMPAT) += compat.o
> obj-$(CONFIG_CGROUPS) += cgroup.o
> +obj-$(CONFIG_CGROUP_NS) += cgroup_namespace.o
> obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
> obj-$(CONFIG_CPUSETS) += cpuset.o
> obj-$(CONFIG_UTS_NS) += utsname.o
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 2b3e9f9..f8099b4 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -57,6 +57,8 @@
> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
> #include <linux/kthread.h>
> #include <linux/delay.h>
> +#include <linux/proc_ns.h>
> +#include <linux/cgroup_namespace.h>
>
> #include <linux/atomic.h>
>
> @@ -195,6 +197,15 @@ static void kill_css(struct cgroup_subsys_state *css);
> static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
> bool is_add);
>
> +struct cgroup_namespace init_cgroup_ns = {
> + .count = {
> + .counter = 1,
> + },
> + .proc_inum = PROC_CGROUP_INIT_INO,
> + .user_ns = &init_user_ns,

This might mean that you should bump the init_user_ns refcount.

> + .root_cgrp = &cgrp_dfl_root.cgrp,
> +};
> +
> /* IDR wrappers which synchronize using cgroup_idr_lock */
> static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
> gfp_t gfp_mask)
> diff --git a/kernel/cgroup_namespace.c b/kernel/cgroup_namespace.c
> new file mode 100644
> index 0000000..c16604f
> --- /dev/null
> +++ b/kernel/cgroup_namespace.c
> @@ -0,0 +1,128 @@
> +
> +#include <linux/cgroup.h>
> +#include <linux/cgroup_namespace.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/nsproxy.h>
> +#include <linux/proc_ns.h>
> +
> +static struct cgroup_namespace *alloc_cgroup_ns(void)
> +{
> + struct cgroup_namespace *new_ns;
> +
> + new_ns = kmalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
> + if (new_ns)
> + atomic_set(&new_ns->count, 1);
> + return new_ns;
> +}
> +
> +void free_cgroup_ns(struct cgroup_namespace *ns)
> +{
> + cgroup_put(ns->root_cgrp);
> + put_user_ns(ns->user_ns);

This is a problem on error patch in copy_cgroup_ns. The
alloc_cgroup_ns() doesn't initialize these values, so if
you should fail in proc_alloc_inum() you'll show up here
with fandom values in ns->*.

> + proc_free_inum(ns->proc_inum);
> +}
> +EXPORT_SYMBOL(free_cgroup_ns);
> +
> +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
> + struct user_namespace *user_ns,
> + struct cgroup_namespace *old_ns)
> +{
> + struct cgroup_namespace *new_ns = NULL;
> + struct cgroup *cgrp = NULL;
> + int err;
> +
> + BUG_ON(!old_ns);
> +
> + if (!(flags & CLONE_NEWCGROUP))
> + return get_cgroup_ns(old_ns);
> +
> + /* Allow only sysadmin to create cgroup namespace. */
> + err = -EPERM;
> + if (!ns_capable(user_ns, CAP_SYS_ADMIN))
> + goto err_out;
> +
> + /* Prevent cgroup changes for this task. */
> + threadgroup_lock(current);
> +
> + cgrp = get_task_cgroup(current);
> +
> + /* Creating new CGROUPNS is supported only when unified hierarchy is in
> + * use. */

Oh, drat. Well, I'll take, it, but under protest :)

> + err = -EINVAL;
> + if (!cgroup_on_dfl(cgrp))
> + goto err_out_unlock;
> +
> + err = -ENOMEM;
> + new_ns = alloc_cgroup_ns();
> + if (!new_ns)
> + goto err_out_unlock;
> +
> + err = proc_alloc_inum(&new_ns->proc_inum);
> + if (err)
> + goto err_out_unlock;
> +
> + new_ns->user_ns = get_user_ns(user_ns);
> + new_ns->root_cgrp = cgrp;
> +
> + threadgroup_unlock(current);
> +
> + return new_ns;
> +
> +err_out_unlock:
> + threadgroup_unlock(current);
> +err_out:
> + if (cgrp)
> + cgroup_put(cgrp);
> + kfree(new_ns);
> + return ERR_PTR(err);
> +}
> +
> +static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
> +{
> + pr_info("setns not supported for cgroup namespace");
> + return -EINVAL;
> +}
> +
> +static void *cgroupns_get(struct task_struct *task)
> +{
> + struct cgroup_namespace *ns = NULL;
> + struct nsproxy *nsproxy;
> +
> + rcu_read_lock();
> + nsproxy = task->nsproxy;
> + if (nsproxy) {
> + ns = nsproxy->cgroup_ns;
> + get_cgroup_ns(ns);
> + }
> + rcu_read_unlock();
> +
> + return ns;
> +}
> +
> +static void cgroupns_put(void *ns)
> +{
> + put_cgroup_ns(ns);
> +}
> +
> +static unsigned int cgroupns_inum(void *ns)
> +{
> + struct cgroup_namespace *cgroup_ns = ns;
> +
> + return cgroup_ns->proc_inum;
> +}
> +
> +const struct proc_ns_operations cgroupns_operations = {
> + .name = "cgroup",
> + .type = CLONE_NEWCGROUP,
> + .get = cgroupns_get,
> + .put = cgroupns_put,
> + .install = cgroupns_install,
> + .inum = cgroupns_inum,
> +};
> +
> +static __init int cgroup_namespaces_init(void)
> +{
> + return 0;
> +}
> +subsys_initcall(cgroup_namespaces_init);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0cf9cdb..cc06851 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1790,7 +1790,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
> if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
> CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
> CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
> - CLONE_NEWUSER|CLONE_NEWPID))
> + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
> return -EINVAL;
> /*
> * Not implemented, but pretend it works if there is nothing to
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index ef42d0a..a8b1970 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -25,6 +25,7 @@
> #include <linux/proc_ns.h>
> #include <linux/file.h>
> #include <linux/syscalls.h>
> +#include <linux/cgroup_namespace.h>
>
> static struct kmem_cache *nsproxy_cachep;
>
> @@ -39,6 +40,7 @@ struct nsproxy init_nsproxy = {
> #ifdef CONFIG_NET
> .net_ns = &init_net,
> #endif
> + .cgroup_ns = &init_cgroup_ns,
> };
>
> static inline struct nsproxy *create_nsproxy(void)
> @@ -92,6 +94,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
> goto out_pid;
> }
>
> + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
> + tsk->nsproxy->cgroup_ns);
> + if (IS_ERR(new_nsp->cgroup_ns)) {
> + err = PTR_ERR(new_nsp->cgroup_ns);
> + goto out_cgroup;
> + }
> +
> new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
> if (IS_ERR(new_nsp->net_ns)) {
> err = PTR_ERR(new_nsp->net_ns);
> @@ -101,6 +110,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
> return new_nsp;
>
> out_net:
> + if (new_nsp->cgroup_ns)
> + put_cgroup_ns(new_nsp->cgroup_ns);
> +out_cgroup:
> if (new_nsp->pid_ns_for_children)
> put_pid_ns(new_nsp->pid_ns_for_children);
> out_pid:
> @@ -128,7 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> struct nsproxy *new_ns;
>
> if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> - CLONE_NEWPID | CLONE_NEWNET)))) {
> + CLONE_NEWPID | CLONE_NEWNET |
> + CLONE_NEWCGROUP)))) {
> get_nsproxy(old_ns);
> return 0;
> }
> @@ -165,6 +178,8 @@ void free_nsproxy(struct nsproxy *ns)
> put_ipc_ns(ns->ipc_ns);
> if (ns->pid_ns_for_children)
> put_pid_ns(ns->pid_ns_for_children);
> + if (ns->cgroup_ns)
> + put_cgroup_ns(ns->cgroup_ns);
> put_net(ns->net_ns);
> kmem_cache_free(nsproxy_cachep, ns);
> }
> @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
> int err = 0;
>
> if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> - CLONE_NEWNET | CLONE_NEWPID)))
> + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
> return 0;
>
> user_ns = new_cred ? new_cred->user_ns : current_user_ns();
> --
> 2.1.0.rc2.206.gedb03e5
>
> _______________________________________________
> Containers mailing list
> Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linuxfoundation.org/mailman/listinfo/containers
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/