[PATCH v4 2/2] cgroup: implement subtree creation on copy_cgroup_ns()

From: Aleksa Sarai
Date: Fri May 13 2016 - 23:20:35 EST


Allow unprivileged processes to control subtrees of the cgroups they are
associated with, a necessary feature if a rootless container wishes to
take advantage of cgroups for its own processes.

As cgroups are hierarchical, having the ability to set limits in a
subtree does not grant the ability to modify or exceed the limits imposed
by parent cgroups. In addition, in the default hierarchy, a process that
migrates a task must have write access to the cgroup.procs file of the
common ancestor of the two (source and destination) cgroups. Together,
these properties make this change safe against cgroup escape.

There isn't a way to disable this at the moment.

Signed-off-by: Aleksa Sarai <asarai@xxxxxxx>
---
kernel/cgroup.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 120 insertions(+), 23 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f1c798b69561..f455488dc899 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
+#include <linux/time.h>
#include <net/sock.h>

/*
@@ -5269,34 +5270,40 @@ out_destroy:
return ERR_PTR(ret);
}

-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+/**
+ * cgroup_create_subtree - creates a new subtree of a cgroup
+ * @parent: the parent cgroup to create the subtree under
+ * @name: the name of the cgroup in kernfs
+ * @mode: the mode of the cgroup in kernfs
+ *
+ * Creates a new cgroup under the given @parent, with the given @name and @mode.
+ * The caller must hold cgroup_mutex, and must not be under active protection of
+ * kernfs.
+ */
+static struct cgroup *cgroup_create_subtree(struct cgroup *parent,
+ const char *name, umode_t mode)
{
- struct cgroup *parent, *cgrp;
+ struct cgroup *child;
struct kernfs_node *kn;
- int ret;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);

/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
if (strchr(name, '\n'))
- return -EINVAL;
-
- parent = cgroup_kn_lock_live(parent_kn, false);
- if (!parent)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);

- cgrp = cgroup_create(parent);
- if (IS_ERR(cgrp)) {
- ret = PTR_ERR(cgrp);
- goto out_unlock;
- }
+ child = cgroup_create(parent);
+ if (IS_ERR(child))
+ return child;

/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir(parent->kn, name, mode, child);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_destroy;
}
- cgrp->kn = kn;
+ child->kn = kn;

/*
* This extra ref will be put in cgroup_free_fn() and guarantees
@@ -5308,22 +5315,51 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;

- ret = css_populate_dir(&cgrp->self);
+ ret = css_populate_dir(&child->self);
if (ret)
goto out_destroy;

- ret = cgroup_apply_control_enable(cgrp);
+ ret = cgroup_apply_control_enable(child);
if (ret)
goto out_destroy;

/* let's create and online css's */
kernfs_activate(kn);

- ret = 0;
- goto out_unlock;
+ return child;

out_destroy:
- cgroup_destroy_locked(cgrp);
+ cgroup_destroy_locked(child);
+ return ERR_PTR(ret);
+}
+
+/*
+ * cgroup directories starting with this prefix are forbidden from being created
+ * from userspace. This prefix is used internally to ensure that there are no
+ * conflicts with userspace when creating cgroups inside copy_cgroup_ns().
+ */
+#define CGROUPNS_INTERNAL_PREFIX ".__cgroupns_subtree:"
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ int ret = 0;
+
+ if (strncmp(CGROUPNS_INTERNAL_PREFIX, name,
+ strlen(CGROUPNS_INTERNAL_PREFIX)) == 0)
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn, false);
+ if (!parent)
+ return -ENODEV;
+
+ cgrp = cgroup_create_subtree(parent, name, mode);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
@@ -6298,7 +6334,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
+ struct cgroup_root *root;
struct css_set *cset;
+ char id[16], id_string[1+2*ARRAY_SIZE(id)] = {0};

BUG_ON(!old_ns);

@@ -6311,12 +6349,71 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);

+ /*
+ * In order to make sure that the dirname we create is unique, we use a
+ * random ID for the subtrees. The same ID is used in every hierarchy to
+ * reduce confusion when reading /proc/<pid>/cgroup.
+ */
+ get_random_bytes(id, ARRAY_SIZE(id));
+ bin2hex(id_string, id, ARRAY_SIZE(id));
+
+ /*
+ * Create a new subtree in every cgroup the task is associated with.
+ * The cgroup is owned by the task uid and gid, to allow for management
+ * of subtrees in cgroup namespaces. This is safe because:
+ *
+ * 1. cgroups are hierarchical, so having the ability to set limits in
+ * a subtree does not preclude the ability to modify the limits
+ * imposed by parent cgroups.
+ *
+ * 2. cgroup_procs_write_permission() does checks to ensure that a
+ * task cannot move other tasks into its cgroup unless they are both
+ * running as the same user (or the task moving the process has
+ * CAP_SYS_ADMIN in the user namespace of the process being moved).
+ * This means that a misbehaving process can't start messing around
+ * with other processes' cgroup associations.
+ *
+ * 3. On the default hierarchy, you cannot migrate a process to a
+ * non-descendant cgroup unless you have write access to the
+ * cgroup.procs file in the common ancestor of the two cgroups. This
+ * means that two cooperative processes in the default hierarchy
+ * can't move processes between their cgroups (if the admin
+ * disallows it). Unfortunately, this functionality doesn't exist in
+ * the other hierarchies (for backwards compatibility reasons).
+ * However, this requirement isn't as important as the previous two.
+ */
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ for_each_root(root) {
+ struct cgroup *parent, *child;
+ char namebuf[CGROUP_FILE_NAME_MAX];
+ bool is_dfl = cgroup_on_dfl(&root->cgrp);
+
+ spin_lock_bh(&css_set_lock);
+ parent = task_cgroup_from_root(current, root);
+ spin_unlock_bh(&css_set_lock);
+
+ snprintf(namebuf, CGROUP_FILE_NAME_MAX,
+ CGROUPNS_INTERNAL_PREFIX "%s", id_string);
+
+ /* This should not fail, since we're under &cgroup_mutex. */
+ child = cgroup_create_subtree(parent, namebuf, 0755);
+ if (WARN_ON(IS_ERR(child)))
+ continue;

+ /*
+ * Move the task to the new cgroup, which is owned by the user.
+ * Should never fail, since we're under &cgroup_mutex here.
+ */
+ rcu_read_lock();
+ if (WARN_ON(cgroup_attach_task(child, current, is_dfl)))
+ cgroup_destroy_locked(child);
+ rcu_read_unlock();
+
+ }
+
+ spin_lock_bh(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
-
spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);

--
2.8.2