[RFC][PATCH] cgroup: fix race between fork and cgroup freezing
From: Li Zefan
Date: Thu Mar 08 2012 - 03:42:19 EST
A similar bug exists in cpuset, and those are long-standing bugs.
As reported by Frederic:
> When a user freezes a cgroup, the freezer sets the subsystem state
> to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
>
> But there is a possible race here, although unlikely, if a task
> forks and the parent is preempted between write_unlock(tasklist_lock)
> and cgroup_post_fork(). If we freeze the cgroup while the parent
> is sleeping and the parent wakes up thereafter, its child will
> be missing from the set of tasks to freeze because:
>
> - The child was not yet linked to its css_set->tasks, as is done
> from cgroup_post_fork(). cgroup_iter_start() has thus missed it.
>
> - The cgroup freezer's fork callback can handle that child but
> cgroup_fork_callbacks() has been called already.
I try to fix it by using seqcount. We read the counter before calling
cgroup_fork_callbacks(), and we check the counter after cgroup_post_fork().
If the seq numbers don't match, we know the forking task's cgroup
has been/is being frozen, so we freeze the child task.
cpuset can be fixed accordingly.
Reported-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Signed-off-by: Li Zefan <lizf@xxxxxxxxxxxxxx>
---
include/linux/cgroup.h | 5 +++--
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 1 +
kernel/cgroup.c | 18 ++++++++++++++++--
kernel/cgroup_freezer.c | 22 ++++++++++++++++++++++
kernel/fork.c | 5 +++--
6 files changed, 53 insertions(+), 6 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9b6021..38c82b2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,8 +32,8 @@ extern int cgroup_lock_is_held(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p, int *seq);
+extern void cgroup_post_fork(struct task_struct *p, int seq);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
@@ -495,6 +495,7 @@ struct cgroup_subsys {
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup_taskset *tset);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
+ void (*post_fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9c66b1a..91665ce 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -125,6 +125,13 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#ifdef CONFIG_CGROUPS
+# define INIT_CGROUPS(tsk) \
+ .fork_seq = SEQCNT_ZERO
+#else
+# define INIT_CGROUPS(tsk)
+#endif
+
#define INIT_TASK_COMM "swapper"
/*
@@ -192,6 +199,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_CGROUPS(tsk) \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d379a6..d487777 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1507,6 +1507,7 @@ struct task_struct {
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
+ seqcount_t fork_seq;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b53..77435ec 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/seqlock.h>
#include <linux/atomic.h>
@@ -4558,6 +4559,7 @@ void cgroup_fork(struct task_struct *child)
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
INIT_LIST_HEAD(&child->cg_list);
+ seqcount_init(&child->fork_seq);
}
/**
@@ -4568,8 +4570,10 @@ void cgroup_fork(struct task_struct *child)
* tasklist. No need to take any locks since no-one can
* be operating on this task.
*/
-void cgroup_fork_callbacks(struct task_struct *child)
+void cgroup_fork_callbacks(struct task_struct *child, int *seq)
{
+ *seq = read_seqcount_begin(¤t->fork_seq);
+
if (need_forkexit_callback) {
int i;
/*
@@ -4594,7 +4598,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
* with the first call to cgroup_iter_start() - to guarantee that the
* new task ends up on its list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child, int seq)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
@@ -4612,6 +4616,16 @@ void cgroup_post_fork(struct task_struct *child)
list_add(&child->cg_list, &child->cgroups->tasks);
}
write_unlock(&css_set_lock);
+
+ if (read_seqcount_retry(¤t->fork_seq, seq)) {
+ int i;
+
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->post_fork)
+ ss->post_fork(ss, child);
+ }
+ }
}
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b..88f210e 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -216,6 +216,25 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
spin_unlock_irq(&freezer->lock);
}
+static void freezer_post_fork(struct cgroup_subsys *ss,
+ struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ cgroup_lock();
+
+ freezer = task_freezer(task);
+ if (!freezer->css.cgroup->parent)
+ goto out;
+
+ spin_lock_irq(&freezer->lock);
+ if (freezer->state != CGROUP_THAWED)
+ freeze_task(task);
+ spin_unlock_irq(&freezer->lock);
+out:
+ cgroup_unlock();
+}
+
/*
* caller must hold freezer->lock
*/
@@ -286,6 +305,8 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
continue;
if (!freezing(task) && !freezer_should_skip(task))
num_cant_freeze_now++;
+
+ write_seqcount_barrier(&task->fork_seq);
}
cgroup_iter_end(cgroup, &it);
@@ -381,4 +402,5 @@ struct cgroup_subsys freezer_subsys = {
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
.fork = freezer_fork,
+ .post_fork = freezer_post_fork,
};
diff --git a/kernel/fork.c b/kernel/fork.c
index e2cd3e2..73abc25 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1077,6 +1077,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
int retval;
struct task_struct *p;
int cgroup_callbacks_done = 0;
+ int seq;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1331,7 +1332,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
* on the tasklist. */
- cgroup_fork_callbacks(p);
+ cgroup_fork_callbacks(p, &seq);
cgroup_callbacks_done = 1;
/* Need tasklist lock for parent etc handling! */
@@ -1395,7 +1396,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, seq);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/