Re: [PATCH] sched: fix another race when reading /proc/sched_debug

From: Paul Menage
Date: Tue Dec 16 2008 - 07:42:43 EST


On Tue, Dec 16, 2008 at 1:41 AM, Paul Menage <menage@xxxxxxxxxx> wrote:
>
> It should be OK to rcu-free cgrp and have it still safe to call
> cgroup_path() on it - as long as we're confident that the dentry (and
> all its parents) won't be released until after the RCU section as
> well, since that's where we get the path elements from. And that does
> seem to be the case from looking at dcache.c
>
> It's certainly nicer than having two calls to synchronize_rcu() in
> quick succession in cgroup_diput().

How about something like this?

Paul

include/linux/cgroup.h | 9 +++++++++
kernel/cgroup.c | 49 ++++++++++++++++++++++++++++---------------------
2 files changed, 37 insertions(+), 21 deletions(-)

Index: cgroup-rcu-mmotm-2008-12-09/include/linux/cgroup.h
===================================================================
--- cgroup-rcu-mmotm-2008-12-09.orig/include/linux/cgroup.h
+++ cgroup-rcu-mmotm-2008-12-09/include/linux/cgroup.h
@@ -145,6 +145,9 @@ struct cgroup {
int pids_use_count;
/* Length of the current tasks_pids array */
int pids_length;
+
+ /* For RCU-protected deletion */
+ struct rcu_head rcu_head;
};

/* A css_set is a structure holding pointers to a set of
@@ -305,6 +308,12 @@ int cgroup_add_files(struct cgroup *cgrp

int cgroup_is_removed(const struct cgroup *cgrp);

+/*
+ * cgroup_path() fills in a filesystem-like path for the given cgroup
+ * into "buf", up to "buflen" characters. Should be called with
+ * cgroup_mutex held, or else in an RCU section with an RCU-protected
+ * cgroup reference
+ */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);

int cgroup_task_count(const struct cgroup *cgrp);
Index: cgroup-rcu-mmotm-2008-12-09/kernel/cgroup.c
===================================================================
--- cgroup-rcu-mmotm-2008-12-09.orig/kernel/cgroup.c
+++ cgroup-rcu-mmotm-2008-12-09/kernel/cgroup.c
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set

rcu_read_lock();
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = cg->subsys[i]->cgroup;
+ struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
@@ -595,6 +595,18 @@ static void cgroup_call_pre_destroy(stru
return;
}

+static void __free_cgroup_rcu(struct rcu_head *obj)
+{
+ struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
+ deactivate_super(cgrp->root->sb);
+
+ kfree(cgrp);
+}
+
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -602,13 +614,6 @@ static void cgroup_diput(struct dentry *
struct cgroup *cgrp = dentry->d_fsdata;
struct cgroup_subsys *ss;
BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();

mutex_lock(&cgroup_mutex);
/*
@@ -620,11 +625,7 @@ static void cgroup_diput(struct dentry *
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);

- /* Drop the active superblock reference that we took when we
- * created the cgroup */
- deactivate_super(cgrp->root->sb);
-
- kfree(cgrp);
+ call_rcu(&cgrp->rcu_head, __free_cgroup_rcu);
}
iput(inode);
}
@@ -1134,8 +1135,9 @@ static inline struct cftype *__d_cft(str
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference. Writes path of cgroup into buf. Returns 0 on success,
+ * -errno on error.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
@@ -2440,12 +2442,16 @@ static int cgroup_has_css_refs(struct cg
if (ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
+ /*
+ * When called from check_for_release() it's possible
+ * that by this point the cgroup has been removed and
+ * the css deleted (since css objects are not
+ * necessarily RCU-protected). But a false-positive
+ * doesn't matter, since it can only happen if the
+ * cgroup (which *is* RCU-protected) has been removed
+ * and hence no longer needs the release agent to be
+ * called anyway.
+ */
if (css && atomic_read(&css->refcnt))
return 1;
}
@@ -2487,6 +2493,7 @@ static int cgroup_rmdir(struct inode *un
return -EBUSY;
}

+ /* Synchronize with check_for_release() */
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/