[PATCH v5 6/6] seccomp: add SECCOMP_EXT_ACT_TSYNC and SECCOMP_FILTER_TSYNC
From: Kees Cook
Date: Thu May 22 2014 - 19:16:57 EST
Applying restrictive seccomp filter programs to large or diverse
codebases often requires handling threads which may be started early in
the process lifetime (e.g., by code that is linked in). While it is
possible to apply permissive programs prior to process start up, it is
difficult to further restrict the kernel ABI to those threads after that
point.
This change adds a new seccomp extension action for synchronizing thread
group seccomp filters and a prctl() for accessing that functionality,
as well as a flag for SECCOMP_EXT_ACT_FILTER to perform sync at filter
installation time.
When calling prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT, SECCOMP_EXT_ACT_FILTER,
flags, filter) with flags containing SECCOMP_FILTER_TSYNC, or when calling
prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT, SECCOMP_EXT_ACT_TSYNC, 0, 0), it
will attempt to synchronize all threads in current's threadgroup to its
seccomp filter program. This is possible iff all threads are using a filter
that is an ancestor to the filter current is attempting to synchronize to.
NULL filters (where the task is running as SECCOMP_MODE_DISABLED) are also
treated as ancestors allowing threads to be transitioned into
SECCOMP_MODE_FILTER. If prctl(PR_SET_NO_NEW_PRIVS, ...) has been set on the
calling thread, no_new_privs will be set for all synchronized threads too.
On success, 0 is returned. On failure, the pid of one of the failing threads
will be returned, with as many filters installed as possible.
The race conditions are against another thread calling TSYNC, another
thread performing a clone, and another thread changing its filter. The
seccomp write lock is sufficient for these cases, though the clone
case is assisted by the tasklist_lock so that a new thread must have a
duplicate of its parent's seccomp state when it appears on the tasklist.
Based on patches by Will Drewry.
Suggested-by: Julien Tinnes <jln@xxxxxxxxxxxx>
Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
---
Documentation/prctl/seccomp_filter.txt | 22 ++++++-
include/uapi/linux/seccomp.h | 4 ++
kernel/seccomp.c | 106 ++++++++++++++++++++++++++++++--
3 files changed, 126 insertions(+), 6 deletions(-)
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
index a5e47753b32d..2924a3450870 100644
--- a/Documentation/prctl/seccomp_filter.txt
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -235,5 +235,23 @@ prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT, SECCOMP_EXT_ACT_FILTER, flags, prog):
Attach filter, with flags.
This is the same as prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog)
- except with the addition of optional "flags" argument. No flags
- are currently recognized.
+ except with the addition of optional "flags" argument:
+
+ SECCOMP_FILTER_TSYNC:
+ After installing filter, perform threadgroup sync, as
+ described below for SECCOMP_EXT_ACT_TSYNC.
+
+prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT, SECCOMP_EXT_ACT_TSYNC, 0, 0):
+ Thread synchronization.
+
+ The current thread requests to synchronize all threads in current's
+ threadgroup to its seccomp filter program. This is possible iff all
+ threads are using a filter that is an ancestor to the filter current
+ is attempting to synchronize to, or the thread has not yet entered
+ seccomp. If prctl(PR_SET_NO_NEW_PRIVS, ...) has been set on the
+ calling thread, no_new_privs will be set for all synchronized threads
+ too.
+
+ On success, 0 is returned. On failure, all synchronizable threads
+ will have been synchronized, and the pid of one of the failing
+ threads will be returned.
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index d7ad626c684d..7f4431b90fd4 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -15,6 +15,10 @@
/* Valid extension actions as arg3 to prctl(PR_SECCOMP_EXT, SECCOMP_EXT_ACT) */
#define SECCOMP_EXT_ACT_FILTER 1 /* apply seccomp-bpf filter with flags */
+#define SECCOMP_EXT_ACT_TSYNC 2 /* synchronize threadgroup filters */
+
+/* Flags for prctl arg4 when calling SECCOMP_EXT_ACT_FILTER */
+#define SECCOMP_FILTER_TSYNC 1 /* synchronize threadgroup to filter */
/*
* All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 088244b2c765..d39c0dad9655 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -25,6 +25,7 @@
#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
@@ -198,6 +199,93 @@ static u32 seccomp_run_filters(int syscall)
return ret;
}
+/* Returns 1 if @candidate is @child or lies on its ->prev chain, else 0. */
+static int is_ancestor(struct seccomp_filter *candidate,
+		       struct seccomp_filter *child)
+{
+	struct seccomp_filter *node = child;
+
+	/* A NULL candidate (no filter attached) is the root of every chain. */
+	while (candidate && node && node != candidate)
+		node = node->prev;
+	/* Match if no candidate was given or the walk stopped on it. */
+	return !candidate || node == candidate;
+}
+
+/*
+ * Switch @thread to @caller's filter; locking and sync suitability must
+ * have been done already.
+ */
+static void seccomp_sync_thread(struct task_struct *caller,
+				struct task_struct *thread)
+{
+	/* Get a task reference for the new leaf node. */
+	get_seccomp_filter(caller);
+	/*
+	 * Drop the task reference to the shared ancestor since current's path
+	 * will hold one. (This also allows a put before the assignment.)
+	 */
+	put_seccomp_filter(thread);
+	thread->seccomp.filter = caller->seccomp.filter;
+	/*
+	 * Applies to every synced thread: don't let an unprivileged task work
+	 * around the no_new_privs restriction by creating a thread that sets
+	 * it up, enters seccomp, then dies.
+	 */
+	if (task_no_new_privs(caller))
+		task_set_no_new_privs(thread);
+	/*
+	 * Opt the other thread into seccomp if needed. Threads are trust-realm
+	 * equivalent (see ptrace_may_access), so one may transition another.
+	 */
+	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+		thread->seccomp.mode = SECCOMP_MODE_FILTER;
+		set_tsk_thread_flag(thread, TIF_SECCOMP);
+	}
+}
+
+/**
+ * seccomp_act_sync_threads: sets all threads to use current's filter
+ *
+ * Returns 0 when every sibling thread was synchronized, -EACCES when current
+ * is not in SECCOMP_MODE_FILTER, and otherwise the pid of the last sibling
+ * that was neither outside seccomp nor running an ancestral filter (or
+ * -ESRCH when that pid cannot be resolved).
+ */
+static pid_t seccomp_act_sync_threads(void)
+{
+	struct task_struct *caller = current, *thread = caller;
+	unsigned long tflags;
+	pid_t failed = 0;
+
+	if (caller->seccomp.mode != SECCOMP_MODE_FILTER)
+		return -EACCES;
+
+	write_lock_irqsave(&tasklist_lock, tflags);
+	while_each_thread(caller, thread) {
+		unsigned long irqflags;
+		int syncable;
+
+		seccomp_lock(thread, &irqflags);
+		/* Unfiltered threads and threads on an ancestor filter qualify. */
+		syncable = (thread->seccomp.mode == SECCOMP_MODE_DISABLED);
+		if (thread->seccomp.mode == SECCOMP_MODE_FILTER)
+			syncable = is_ancestor(thread->seccomp.filter,
+					       caller->seccomp.filter);
+		if (syncable) {
+			seccomp_sync_thread(caller, thread);
+		} else {
+			/* Report the last failing sibling, or -ESRCH if unresolvable. */
+			failed = task_pid_vnr(thread);
+			if (!failed)
+				failed = -ESRCH;
+		}
+		seccomp_unlock(thread, irqflags);
+	}
+	write_unlock_irqrestore(&tasklist_lock, tflags);
+	return failed;
+}
+
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
@@ -340,19 +428,24 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
* @flags: flags from SECCOMP_FILTER_* to change behavior
* @filter: struct sock_fprog for use with SECCOMP_MODE_FILTER
*
- * Return 0 on success, -ve on error.
+ * Return 0 on success, -ve on error, or thread pid that caused failures.
*/
static long seccomp_act_filter(unsigned long flags, char * __user filter)
{
long ret;
- /* No flags currently recognized. */
- if (flags != 0)
+ /* Only SECCOMP_FILTER_TSYNC is recognized. */
+ if ((flags & ~(SECCOMP_FILTER_TSYNC)) != 0)
return -EINVAL;
ret = seccomp_set_mode(SECCOMP_MODE_FILTER, filter);
+ if (ret)
+ return ret;
- return ret;
+ if (flags & SECCOMP_FILTER_TSYNC)
+ return seccomp_act_sync_threads();
+
+ return 0;
}
/**
@@ -368,6 +461,11 @@ static long seccomp_extended_action(int action, unsigned long arg1,
switch (action) {
case SECCOMP_EXT_ACT_FILTER:
return seccomp_act_filter(arg1, (char * __user)arg2);
+ case SECCOMP_EXT_ACT_TSYNC:
+ /* arg1 and arg2 are currently unused. */
+ if (arg1 || arg2)
+ return -EINVAL;
+ return seccomp_act_sync_threads();
default:
break;
}
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/