[PATCH v2 3/6] cgroups: Allow to unbind subsystem from a cgroup hierarchy

From: Li Zefan
Date: Wed Dec 15 2010 - 04:35:44 EST


This allows us to unbind a cgroup subsystem from a hierarchy
which has sub-cgroups in it.

If a subsystem is to support unbinding, when pinning a cgroup
via css refcnt, it should use __css_tryget() instead of css_get().

Usage:

# mount -t cgroup -o cpuset,cpuacct xxx /mnt
# mkdir /mnt/tmp
# echo $$ > /mnt/tmp/tasks

(remove cpuacct from the hierarchy)
# mount -o remount,cpuset xxx /mnt

Changelog v2:

- Allow a cgroup subsystem to use css refcnt.
- Add more code comments.
- Use rcu_assign_pointer() in hierarchy_update_css_sets().
- Split can_bind flag to bindable and unbindable flags.

Signed-off-by: Li Zefan <lizf@xxxxxxxxxxxxxx>
---
include/linux/cgroup.h | 17 ++++++
kernel/cgroup.c | 139 +++++++++++++++++++++++++++++++++++++++++------
2 files changed, 138 insertions(+), 18 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d8c4e22..17579b2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -110,6 +110,18 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
}

/*
+ * Take a css reference for a subsystem that supports unbinding; fails if
+ * refcnt is already 0. Called with rcu_read_lock or cgroup_mutex held.
+ */
+
+static inline bool __css_tryget(struct cgroup_subsys_state *css)
+{
+ if (test_bit(CSS_ROOT, &css->flags))
+ return true;
+ return atomic_inc_not_zero(&css->refcnt);
+}
+
+/*
* Call css_tryget() to take a reference on a css if your existing
* (known-valid) reference isn't already ref-counted. Returns false if
* the css has been destroyed.
@@ -495,6 +507,11 @@ struct cgroup_subsys {
* which has child cgroups.
*/
bool bindable:1;
+ /*
+ * Indicate if this subsystem can be removed from a cgroup hierarchy
+ * which has child cgroups.
+ */
+ bool unbindable:1;

#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index caac80f..463575d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1055,12 +1055,61 @@ static int hierarchy_attach_css(struct cgroup *cgrp, void *data)
}

/*
- * After attaching new css objects to the cgroup, we need to entangle
- * them into the existing css_sets.
+ * Reset those css objects whose refcnts are cleared.
*/
-static int hierarchy_update_css_sets(struct cgroup *cgrp, void *data)
+static int hierarchy_reset_css_refs(struct cgroup *cgrp, void *data)
+{
+ unsigned long removed_bits = (unsigned long)data;
+ int i;
+
+ for_each_set_bit(i, &removed_bits, CGROUP_SUBSYS_COUNT) {
+ /* a zero refcnt means it was cleared for unbinding; restore it */
+ atomic_cmpxchg(&cgrp->subsys[i]->refcnt, 0, 1);
+ }
+ return 0;
+}
+
+/*
+ * Clear each css refcnt from 1 to 0. If any refcnt is not exactly 1,
+ * reset the zeroed refcnts on this cgroup and return -EBUSY.
+ */
+static int hierarchy_clear_css_refs(struct cgroup *cgrp, void *data)
+{
+ unsigned long removed_bits = (unsigned long)data;
+ int i;
+
+ for_each_set_bit(i, &removed_bits, CGROUP_SUBSYS_COUNT) {
+ struct cgroup_subsys_state *css = cgrp->subsys[i];
+
+ if (atomic_cmpxchg(&css->refcnt, 1, 0) != 1)
+ goto failed;
+ }
+ return 0;
+failed:
+ hierarchy_reset_css_refs(cgrp, data);
+ return -EBUSY;
+}
+
+/*
+ * We're removing some subsystems from the cgroup hierarchy: for each one,
+ * call its ->destroy() on this cgroup and clear the cgroup's css pointer.
+ */
+static int hierarchy_remove_css(struct cgroup *cgrp, void *data)
+{
+ unsigned long removed_bits = (unsigned long)data;
+ int i;
+
+ for_each_set_bit(i, &removed_bits, CGROUP_SUBSYS_COUNT) {
+ subsys[i]->destroy(subsys[i], cgrp);
+ cgrp->subsys[i] = NULL;
+ }
+
+ return 0;
+}
+
+static int hierarchy_update_css_sets(struct cgroup *cgrp,
+ unsigned long bits, bool add)
{
- unsigned long added_bits = (unsigned long)data;
int i;
struct cg_cgroup_link *link;

@@ -1069,8 +1118,14 @@ static int hierarchy_update_css_sets(struct cgroup *cgrp, void *data)
struct css_set *cg = link->cg;
struct hlist_head *hhead;

- for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT)
- rcu_assign_pointer(cg->subsys[i], cgrp->subsys[i]);
+ for_each_set_bit(i, &bits, CGROUP_SUBSYS_COUNT) {
+ if (add)
+ rcu_assign_pointer(cg->subsys[i],
+ cgrp->subsys[i]);
+ else
+ rcu_assign_pointer(cg->subsys[i],
+ dummytop->subsys[i]);
+ }

/* rehash */
hlist_del(&cg->hlist);
@@ -1083,6 +1138,30 @@ static int hierarchy_update_css_sets(struct cgroup *cgrp, void *data)
}

/*
+ * cgroup_walk_hierarchy() callback: after new css objects are attached to
+ * the cgroup, entangle them into the existing css_sets.
+ */
+static int hierarchy_add_to_css_sets(struct cgroup *cgrp, void *data)
+{
+ unsigned long added_bits = (unsigned long)data;
+
+ hierarchy_update_css_sets(cgrp, added_bits, true);
+ return 0;
+}
+
+/*
+ * Before detaching and destroying css objects from the cgroup, we
+ * should detangle them from the existing css_sets.
+ */
+static int hierarchy_remove_from_css_sets(struct cgroup *cgrp, void *data)
+{
+ unsigned long removed_bits = (unsigned long)data;
+
+ hierarchy_update_css_sets(cgrp, removed_bits, false);
+ return 0;
+}
+
+/*
* Re-populate each cgroup directory.
*
* Note root cgroup's inode mutex is held.
@@ -1127,18 +1206,17 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}

- /* Removing will be supported later */
- if (root->number_of_cgroups > 1 && removed_bits)
- return -EBUSY;
-
/*
* For non-trivial hierarchy, check that added subsystems
- * are all bindable
+ * are all bindable and removed subsystems are all unbindable
*/
if (root->number_of_cgroups > 1) {
for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT)
if (!subsys[i]->bindable)
return -EBUSY;
+ for_each_set_bit(i, &removed_bits, CGROUP_SUBSYS_COUNT)
+ if (!subsys[i]->unbindable)
+ return -EBUSY;
}

/* Attach css objects to the top cgroup */
@@ -1154,9 +1232,14 @@ static int rebind_subsystems(struct cgroupfs_root *root,
err = cgroup_walk_hierarchy(hierarchy_attach_css,
(void *)added_bits, cgrp);
if (err)
- goto failed;
+ goto out;
+
+ err = cgroup_walk_hierarchy(hierarchy_clear_css_refs,
+ (void *)removed_bits, cgrp);
+ if (err)
+ goto out_remove_css;

- cgroup_walk_hierarchy(hierarchy_update_css_sets,
+ cgroup_walk_hierarchy(hierarchy_add_to_css_sets,
(void *)added_bits, cgrp);

/* Process each subsystem */
@@ -1176,11 +1259,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(ss == NULL);
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
if (ss->bind)
ss->bind(ss, dummytop);
subsys[i]->root = &rootnode;
@@ -1206,11 +1285,35 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}
root->subsys_bits = root->actual_subsys_bits = final_bits;
+
+ for_each_set_bit(i, &removed_bits, CGROUP_SUBSYS_COUNT) {
+ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+ BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
+ dummytop->subsys[i]->cgroup = dummytop;
+ cgrp->subsys[i] = NULL;
+ }
+
+ cgroup_walk_hierarchy(hierarchy_remove_from_css_sets,
+ (void *)removed_bits, cgrp);
+
+ /*
+ * RCU readers might still hold pointers to the cgroup_subsys_state
+ * objects that we are going to destroy.
+ */
+ synchronize_rcu();
+
+ cgroup_walk_hierarchy(hierarchy_remove_css,
+ (void *)removed_bits, cgrp);
+
synchronize_rcu();

return 0;

-failed:
+out_remove_css:
+ cgroup_walk_hierarchy(hierarchy_remove_css,
+ (void *)added_bits, cgrp);
+out:
for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT)
cgrp->subsys[i] = NULL;

--
1.6.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/