[PATCH] new cgroup controller "fork"

From: Max Kellermann
Date: Thu Nov 03 2011 - 12:31:29 EST


Add a cgroup controller that can limit the number of fork()/clone()
calls in a cgroup. It is useful as a safeguard against fork bombs.

Signed-off-by: Max Kellermann <mk@xxxxxxxxxx>
---
Documentation/cgroups/fork.txt | 30 ++++++
include/linux/cgroup_fork.h | 26 +++++
include/linux/cgroup_subsys.h | 6 +
init/Kconfig | 6 +
kernel/Makefile | 1
kernel/cgroup_fork.c | 197 ++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 5 +
7 files changed, 271 insertions(+), 0 deletions(-)
create mode 100644 Documentation/cgroups/fork.txt
create mode 100644 include/linux/cgroup_fork.h
create mode 100644 kernel/cgroup_fork.c

diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
new file mode 100644
index 0000000..dfbf291
--- /dev/null
+++ b/Documentation/cgroups/fork.txt
@@ -0,0 +1,30 @@
+The "fork" Controller
+---------------------
+
+The "fork" controller limits the number of times a new child process
+or thread can be created. It maintains a per-group counter which gets
+decremented on each fork() / clone(). When the counter reaches zero,
+no process in the cgroup is allowed to create new child
+processes/threads, even if existing ones quit.
+
+This has proven useful in a shared hosting environment. A new
+temporary cgroup is created for each CGI process, and the maximum fork
+count is configured to a sensible value. Since CGIs are expected to
+run for only a short time with predictable resource usage, this may be
+an appropriate tool to limit the damage that a runaway CGI can do.
+
+Initially, the counter is set to -1, which is a magic value for
+"disabled" - no limits are imposed on the processes in the group. To
+set a new value, type (in the working directory of that control
+group):
+
+ echo 16 > fork.remaining
+
+This example allows 16 forks in the control group. 0 means no
+further forks are allowed. The limit may be lowered, raised, or
+even disabled (by writing -1) at any time by a process with write
+permission to the attribute.
+
+To check if a fork is allowed, the controller walks up the cgroup
+hierarchy and checks the counter of every ancestor. After a fork,
+the counters of the cgroup and of all its ancestors are decremented.
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
new file mode 100644
index 0000000..4ac66b3
--- /dev/null
+++ b/include/linux/cgroup_fork.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_CGROUP_FORK_H
+#define _LINUX_CGROUP_FORK_H
+
+#ifdef CONFIG_CGROUP_FORK
+
+/**
+ * Checks whether the current task is allowed to fork. Call this
+ * before creating a new child process or thread. Without
+ * CONFIG_CGROUP_FORK this is an inline stub that always returns 0.
+ *
+ * @return 0 if the fork may proceed, a negative errno value if
+ * forking should be denied
+ */
+int
+cgroup_fork_pre_fork(void);
+
+#else /* !CONFIG_CGROUP_FORK */
+
+static inline int
+cgroup_fork_pre_fork(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CGROUP_FORK */
+
+#endif /* _LINUX_CGROUP_FORK_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..e2dbd65 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -64,3 +64,9 @@ SUBSYS(perf)
#endif

/* */
+
+#ifdef CONFIG_CGROUP_FORK
+SUBSYS(fork)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 31ba0fd..7a2fe2e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -603,6 +603,12 @@ config CGROUP_FREEZER
Provides a way to freeze and unfreeze all tasks in a
cgroup.

+config CGROUP_FORK
+ bool "fork controller for cgroups"
+ help
+ Limits the number of fork() calls in a cgroup. An application
+ for this is to make a cgroup safe against fork bombs.
+
config CGROUP_DEVICE
bool "Device controller for cgroups"
help
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b..2aab192 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
new file mode 100644
index 0000000..e9aa650
--- /dev/null
+++ b/kernel/cgroup_fork.c
@@ -0,0 +1,197 @@
+/*
+ * A cgroup implementation which limits the number of fork() calls.
+ * See Documentation/cgroups/fork.txt for more information.
+ *
+ * Copyright 2011 Content Management AG
+ * Author: Max Kellermann <mk@xxxxxxxxxx>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
+#include <linux/slab.h>
+
+struct cgroup_fork {
+ struct cgroup_subsys_state css;
+
+ /** protects the "remaining" field */
+ spinlock_t lock;
+
+ /**
+ * The number of forks this group may still perform. -1 is the
+ * magic value for "unlimited"; 0 means further forks are
+ * denied. Only positive values are ever decremented.
+ */
+ int remaining;
+};
+
+/**
+ * Map a #cgroup to the fork-controller state attached to it.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_group(struct cgroup *cgroup)
+{
+	struct cgroup_subsys_state *state =
+		cgroup_subsys_state(cgroup, fork_subsys_id);
+
+	return container_of(state, struct cgroup_fork, css);
+}
+
+/**
+ * Map a task to the fork-controller state of the cgroup it lives in.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_task(struct task_struct *task)
+{
+	struct cgroup_subsys_state *state =
+		task_subsys_state(task, fork_subsys_id);
+
+	return container_of(state, struct cgroup_fork, css);
+}
+
+/**
+ * Shorthand for the fork-controller state of the calling task
+ * ("current").
+ */
+static inline struct cgroup_fork *
+cgroup_fork_current(void)
+{
+	struct task_struct *self = current;
+
+	return cgroup_fork_task(self);
+}
+
+/**
+ * Read the "remaining" counter of the given group while holding its
+ * lock.
+ *
+ * Deliberately not marked __pure: the function takes a spinlock and
+ * reads a value that other CPUs modify concurrently, so the compiler
+ * must not merge or elide calls.
+ *
+ * @return the number of forks still allowed, or -1 for "unlimited"
+ */
+static int
+cgroup_fork_lock_get_remaining(struct cgroup_fork *t)
+{
+	/* signed like the field, so the -1 sentinel is carried through
+	   without a signed/unsigned round trip */
+	int remaining;
+
+	spin_lock(&t->lock);
+	remaining = t->remaining;
+	spin_unlock(&t->lock);
+
+	return remaining;
+}
+
+/**
+ * "create" callback of the fork subsystem: allocates the per-group
+ * state for a new cgroup.
+ *
+ * @return the embedded css on success, ERR_PTR(-ENOMEM) if the
+ * allocation fails
+ */
+static struct cgroup_subsys_state *
+cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *group;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (group == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* a fresh group starts out without a limit */
+	group->remaining = -1;
+	spin_lock_init(&group->lock);
+
+	return &group->css;
+}
+
+/**
+ * "destroy" callback of the fork subsystem: releases the per-group
+ * state that cgroup_fork_create() allocated.
+ */
+static void
+cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	kfree(cgroup_fork_group(cgroup));
+}
+
+/**
+ * "fork" callback of the fork subsystem: charges one fork to the
+ * cgroup of the current task and to each of its ancestors.
+ */
+static void
+cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct cgroup_fork *group;
+
+	rcu_read_lock();
+
+	/* walk from the current task's cgroup towards the root; the
+	   root cgroup itself (parent == NULL) is never charged */
+	for (group = cgroup_fork_current();
+	     group->css.cgroup->parent != NULL;
+	     group = cgroup_fork_group(group->css.cgroup->parent)) {
+		spin_lock(&group->lock);
+		/* 0 (exhausted) and -1 (unlimited) are left untouched */
+		if (group->remaining > 0)
+			group->remaining--;
+		spin_unlock(&group->lock);
+	}
+
+	rcu_read_unlock();
+}
+
+/**
+ * Read handler for the "remaining" control file.
+ *
+ * @return the group's current counter, -1 meaning "unlimited"
+ */
+static s64
+cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	return cgroup_fork_lock_get_remaining(cgroup_fork_group(cgroup));
+}
+
+/**
+ * Write handler for the "remaining" control file.
+ *
+ * Accepts values from -1 ("unlimited") up to and including 2^30.
+ *
+ * @return 0 on success, -EINVAL if the value is out of range
+ */
+static int
+cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
+			    s64 value)
+{
+	struct cgroup_fork *group;
+
+	if (!(value >= -1 && value <= (1L << 30)))
+		return -EINVAL;
+
+	group = cgroup_fork_group(cgroup);
+
+	spin_lock(&group->lock);
+	group->remaining = (int)value;
+	spin_unlock(&group->lock);
+
+	return 0;
+}
+
+/* control files of the fork subsystem; only "remaining" exists */
+static const struct cftype cgroup_fork_files[] = {
+	{
+		.name		= "remaining",
+		.write_s64	= cgroup_fork_remaining_write,
+		.read_s64	= cgroup_fork_remaining_read,
+	},
+};
+
+/**
+ * "populate" callback of the fork subsystem: adds the control files
+ * to a cgroup directory.
+ *
+ * @return 0 for the root cgroup, otherwise the result of
+ * cgroup_add_files()
+ */
+static int
+cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	/* the root cgroup cannot be limited, so it gets no files */
+	if (cgroup->parent != NULL)
+		return cgroup_add_files(cgroup, ss, cgroup_fork_files,
+					ARRAY_SIZE(cgroup_fork_files));
+
+	return 0;
+}
+
+/* subsystem descriptor that wires the callbacks above into cgroups */
+struct cgroup_subsys fork_subsys = {
+	.name		= "fork",
+	.subsys_id	= fork_subsys_id,
+	.create		= cgroup_fork_create,
+	.destroy	= cgroup_fork_destroy,
+	.populate	= cgroup_fork_populate,
+	.fork		= cgroup_fork_fork,
+};
+
+/**
+ * Checks whether the current task may fork: the fork is denied as
+ * soon as the task's cgroup or one of its ancestors (the root cgroup
+ * excluded) has a "remaining" counter of exactly 0.
+ *
+ * This only inspects the counters; it does not modify them.
+ *
+ * @return 0 if the fork may proceed, -EPERM if it is denied
+ */
+int
+cgroup_fork_pre_fork(void)
+{
+	struct cgroup_fork *group;
+	int err = 0;
+
+	/* forks issued by the kernel while booting are always let
+	   through: INIT_TASK() does not initialize the cgroup
+	   subsystem, so there is nothing to look at yet */
+	if (unlikely(current == &init_task))
+		return 0;
+
+	BUG_ON(current->cgroups == NULL);
+
+	rcu_read_lock();
+
+	for (group = cgroup_fork_current();
+	     group->css.cgroup->parent != NULL;
+	     group = cgroup_fork_group(group->css.cgroup->parent)) {
+		if (unlikely(cgroup_fork_lock_get_remaining(group) == 0)) {
+			err = -EPERM;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d7619..c8cba7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
@@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);

+ retval = cgroup_fork_pre_fork();
+ if (retval)
+ goto fork_out;
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/