Re: [PATCH v10 09/12] arch/x86: enable task isolation functionality

From: Chris Metcalf
Date: Tue Mar 08 2016 - 15:40:57 EST


On 03/07/2016 03:55 PM, Andy Lutomirski wrote:
Let task isolation users who want to detect when they screw up and do
>>a syscall do it with seccomp.

>Can you give me more details on what you're imagining here? Remember
>that a key use case is that these applications can remove the syscall
>prohibition voluntarily; it's only there to prevent unintended uses
>(by third party libraries or just straight-up programming bugs).
>As far as I can tell, seccomp does not allow you to go from "less
>permissive" to "more permissive" settings at all, which means that as
>it exists, it's not a good solution for this use case.
>
>Or were you thinking about a new seccomp API that allows this?
I was. This is at least the second time I've wanted a way to ask
seccomp to allow a layer to be removed.

Andy,

Please take a look at this draft patch that intends to enable seccomp
as something that task isolation can use.

The basic idea is to add a notion of "removable" seccomp filters.
You can tag a filter that way when installing it (using the new
SECCOMP_FILTER_FLAG_REMOVABLE flag bit for SECCOMP_SET_MODE_FILTER),
and if the most recently-added filter is marked as removable, you can
remove it with the new SECCOMP_POP_FILTER operation. It is currently
implemented to be incompatible with SECCOMP_FILTER_FLAG_TSYNC, which
is plausible since the obvious use is for thread-local push and pop,
but the API allows for future implementation by including a flag word
with the pop_filter operation (now always zero).

I did not make this supported via the prctl() since the "removable"
flag requires seccomp(), so making pop work with prctl() seemed silly.

One interesting result of this is that now it is no longer true
that once current->seccomp.mode becomes non-zero, it may not be
changed, since it can now be changed back to DISABLED when you push a
removable filter and later pop it.

My preference would be not to have to require all task-isolation users
to also figure out all the complexities of creating BPF programs, so
my intention is to have task isolation automatically generate a BPF
program (just allowing prctl/exit/exit_group and failing everything
else with SIGSYS). To support having it work this way, I open up
the seccomp stuff a little so that kernel clients can effectively
push/pop a BPF program into seccomp:

long seccomp_push_filter(unsigned int flags, struct bpf_prog *fp)
long seccomp_pop_filter(unsigned int flags);

We mark filters from this API with a new "extern_prog" boolean in the
seccomp_filter struct so the BPF program isn't freed when the
seccomp_filter itself is freed. Note that doing it this way avoids
having to go through the substantial overhead of creating a brand-new
BPF filter every time we enter task isolation mode.

Not shown here is the additional code needed in task isolation to
create a suitable BPF program and then push and pop it as we go in and
out of task isolation mode.

For what it's worth, I'm a little dubious about the tradeoff of adding
a substantial chunk of code to seccomp to handle what the v10 task
isolation code did with a single extra TIF flag test and a dozen lines
of code that got called. But given that you said there were other
potential users for the "filter pop" idea, it may indeed make sense.

This is still all untested, but I wanted to get your sense of whether
this was even going in the right direction before spending more time
on it.

Thanks!

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 2296e6b2f690..feeba7a23d20 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,13 +3,15 @@
#include <uapi/linux/seccomp.h>
-#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC)
+#define SECCOMP_FILTER_FLAG_MASK \
+ (SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_REMOVABLE)
#ifdef CONFIG_SECCOMP
#include <linux/thread_info.h>
#include <asm/seccomp.h>
+struct bpf_prog;
struct seccomp_filter;
/**
* struct seccomp - the state of a seccomp'ed process
@@ -41,6 +43,8 @@ static inline int secure_computing(void)
extern u32 seccomp_phase1(struct seccomp_data *sd);
int seccomp_phase2(u32 phase1_result);
+long seccomp_push_filter(unsigned int flags, struct bpf_prog *fp);
+long seccomp_pop_filter(unsigned int flags);
#else
extern void secure_computing_strict(int this_syscall);
#endif
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0f238a43ff1e..6e65ac2a7262 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -13,9 +13,11 @@
/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT 0
#define SECCOMP_SET_MODE_FILTER 1
+#define SECCOMP_POP_FILTER 2
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC 1
+#define SECCOMP_FILTER_FLAG_REMOVABLE 2
/*
* All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15a1795bbba1..c22eb3a56556 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,8 +41,9 @@
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
- * @len: the number of instructions in the program
- * @insnsi: the BPF program instructions to evaluate
+ * @prog: the BPF program to evaluate
+ * @removable: if this filter is removable with seccomp_pop_filter()
+ * @extern_prog: if @prog should not be freed in seccomp_free_filter()
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -58,6 +59,8 @@ struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
struct bpf_prog *prog;
+ bool removable;
+ bool extern_prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -470,7 +473,8 @@ void get_seccomp_filter(struct task_struct *tsk)
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
if (filter) {
- bpf_prog_destroy(filter->prog);
+ if (!filter->extern_prog)
+ bpf_prog_destroy(filter->prog);
kfree(filter);
}
}
@@ -722,6 +726,7 @@ long prctl_get_seccomp(void)
* seccomp_set_mode_strict: internal function for setting strict seccomp
*
* Once current->seccomp.mode is non-zero, it may not be changed.
+ * (other than to reset to DISABLED after removing the last removable filter).
*
* Returns 0 on success or -EINVAL on failure.
*/
@@ -749,33 +754,34 @@ out:
#ifdef CONFIG_SECCOMP_FILTER
/**
- * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * do_push_filter: internal function for setting seccomp filter
* @flags: flags to change filter behavior
- * @filter: struct sock_fprog containing filter
+ * @prepared: struct seccomp_filter to install
*
* This function may be called repeatedly to install additional filters.
* Every filter successfully installed will be evaluated (in reverse order)
* for each system call the task makes.
*
- * Once current->seccomp.mode is non-zero, it may not be changed.
+ * Once current->seccomp.mode is non-zero, it may not be changed
+ * (other than to reset to DISABLED after removing the last removable filter).
*
* Returns 0 on success or -EINVAL on failure.
*/
-static long seccomp_set_mode_filter(unsigned int flags,
- const char __user *filter)
+long do_push_filter(unsigned int flags, struct seccomp_filter *prepared)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
- struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
/* Validate flags. */
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
- /* Prepare the new filter before holding any locks. */
- prepared = seccomp_prepare_user_filter(filter);
- if (IS_ERR(prepared))
- return PTR_ERR(prepared);
+ if (flags & SECCOMP_FILTER_FLAG_REMOVABLE) {
+ /* The intended use case is for thread-local push/pop. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ goto out_free;
+ prepared->removable = true;
+ }
/*
* Make sure we cannot change seccomp or nnp state via TSYNC
@@ -805,12 +811,87 @@ out_free:
seccomp_filter_free(prepared);
return ret;
}
+
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ struct seccomp_filter *prepared;
+
+ /* Prepare the new filter before holding any locks. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+ return seccomp_push_filter(flags, prepared);
+}
+
+long seccomp_push_filter(unsigned int flags, struct bpf_prog *fp)
+{
+ struct seccomp_filter *sfilter;
+
+ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL);
+ if (!sfilter)
+ return ERR_PTR(-ENOMEM);
+
+ sfilter->prog = fp;
+ sfilter->extern_prog = true;
+ atomic_set(&sfilter->usage, 1);
+
+ return do_push_filter(flags, sfilter);
+}
+
+/**
+ * seccomp_pop_filter: internal function for removing filter
+ * @flags: flags to change pop behavior
+ *
+ * This function removes the most recently installed filter, if it was
+ * installed with the SECCOMP_FILTER_FLAG_REMOVABLE flag. Any previously
+ * installed filters are left intact.
+ *
+ * If the last filter is removed, the seccomp state reverts to DISABLED.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long seccomp_pop_filter(unsigned int flags)
+{
+ struct seccomp_filter *filter;
+
+ /* The intended use case is for temporary thread-local push/pop. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (current->seccomp.mode != SECCOMP_MODE_FILTER)
+ goto out;
+
+ filter = current->seccomp.filter;
+ if (unlikely(WARN_ON(filter == NULL)) || !filter->removable)
+ goto out;
+
+ if (filter->prev == NULL) {
+ clear_tsk_thread_flag(current, TIF_SECCOMP);
+ current->seccomp.mode = SECCOMP_MODE_DISABLED;
+ }
+
+ current->seccomp.filter = filter->prev;
+
+ spin_unlock_irq(&current->sighand->siglock);
+ seccomp_filter_free(filter);
+ return 0;
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EINVAL;
+}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
return -EINVAL;
}
+static inline long seccomp_pop_filter(unsigned int flags)
+{
+ return -EINVAL;
+}
#endif
/* Common entry point for both prctl and syscall. */
@@ -824,6 +905,8 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER:
return seccomp_set_mode_filter(flags, uargs);
+ case SECCOMP_POP_FILTER:
+ return seccomp_pop_filter(flags);
default:
return -EINVAL;
}

--
Chris Metcalf, Mellanox Technologies
http://www.mellanox.com