Re: eBPF / seccomp globals?

From: Michael Tirado
Date: Tue Oct 06 2015 - 12:00:50 EST


On Tue, Sep 29, 2015 at 11:44 PM, Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> On Thu, Sep 10, 2015 at 2:55 PM, Michael Tirado <mtirado418@xxxxxxxxx> wrote:
>> On Fri, Sep 4, 2015 at 8:37 PM, Kees Cook <keescook@xxxxxxxxxxxx> wrote:
>>>
>> @@ -196,7 +197,12 @@ static u32 seccomp_run_filters(struct seccomp_data *sd)
>> * value always takes priority (ignoring the DATA).
>> */
>> for (; f; f = f->prev) {
>> - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
>> + u32 cur_ret;
>> + if (unlikely(f->deferred)) {
>> + --f->deferred;
>> + continue;
>> + }
>> + cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
>
> I do like the idea of a deferred filters, but I wouldn't want them
> checked this way (it adds a fixed cost to all filters). I would rather
> that a separate list of filters exist, waiting for something like
> "exec" to trigger them getting added to the actual filter list. In
> fact, probably you'd want something like "prepare" and "cancel" too,
> then you could "prepare", fork, exec. Then the child would get the
> prepared filters added, and the parent could call "cancel" to drop the
> prepared filters from itself.
>
> Andy, Will, do you see issues with a class of "deferred" filters that
> would be attached after exec?
>


Thanks for the feedback.
Like Andy said, it's definitely not beautiful, but I decided anyway
to spend some time writing a more legitimate version. It seems that unless
we can make a new temporary mode for detecting execve, the check
on SECCOMP_RET_ALLOW return is unavoidable. The comments seem to forbid mode
changes, so I did not test if that approach would be more optimized. This new
version only checks once unlike the previous that checked for each filter.

Again, sorry about the formatting, I can re-send this the proper way if
interested, or let me know of any other ideas for solving this problem.



Subject: [PATCH] seccomp: Add deferred filter flag

---
include/asm-generic/seccomp.h | 2 +
include/linux/seccomp.h | 4 +-
include/uapi/linux/seccomp.h | 2 +-
kernel/seccomp.c | 52 +++++++++-
tools/testing/selftests/seccomp/Makefile | 1 +
tools/testing/selftests/seccomp/seccomp_bpf.c | 105 +++++++++++++++++++++
.../testing/selftests/seccomp/seccomp_defer_test.c | 15 +++
7 files changed, 176 insertions(+), 5 deletions(-)
create mode 100644 tools/testing/selftests/seccomp/seccomp_defer_test.c

diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
index c9ccafa..5dd7e7d 100644
--- a/include/asm-generic/seccomp.h
+++ b/include/asm-generic/seccomp.h
@@ -29,4 +29,6 @@
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
#endif

+#define __NR_seccomp_execve __NR_execve
+
#endif /* _ASM_GENERIC_SECCOMP_H */
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index a19ddac..8ab7a68 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,7 +3,8 @@

#include <uapi/linux/seccomp.h>

-#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC)
+#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
+ SECCOMP_FILTER_FLAG_DEFER)

#ifdef CONFIG_SECCOMP

@@ -25,6 +26,7 @@ struct seccomp_filter;
struct seccomp {
int mode;
struct seccomp_filter *filter;
+ struct seccomp_filter *deferred;
};

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0f238a4..01fab07 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,7 +16,7 @@

/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC 1
-
+#define SECCOMP_FILTER_FLAG_DEFER 2
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b..6d54066 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -176,11 +176,12 @@ static int seccomp_check_filter(struct
sock_filter *filter, unsigned int flen)
static u32 seccomp_run_filters(struct seccomp_data *sd)
{
struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
+ struct seccomp_filter *d = ACCESS_ONCE(current->seccomp.deferred);
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;

/* Ensure unexpected behavior doesn't result in failing open. */
- if (unlikely(WARN_ON(f == NULL)))
+ if (unlikely(WARN_ON(f == NULL && d == NULL)))
return SECCOMP_RET_KILL;

/* Make sure cross-thread synced filter points somewhere sane. */
@@ -447,8 +448,14 @@ static long seccomp_attach_filter(unsigned int flags,
* If there is an existing filter, make it the prev and don't drop its
* task reference.
*/
- filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
+ if (flags & SECCOMP_FILTER_FLAG_DEFER) {
+ filter->prev = current->seccomp.deferred;
+ current->seccomp.deferred = filter;
+ }
+ else {
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ }

/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -487,6 +494,35 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}

+/* attach any filters waiting in deferred list */
+static long seccomp_attach_deferred(void)
+{
+ struct seccomp_filter *tmp;
+ int ret;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ while(current->seccomp.deferred) {
+ tmp = current->seccomp.deferred->prev;
+ ret = seccomp_attach_filter(0, current->seccomp.deferred);
+ if (ret)
+ goto out_free;
+ current->seccomp.deferred = tmp;
+ }
+
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+
+out_free:
+ while(current->seccomp.deferred) {
+ tmp = current->seccomp.deferred->prev;
+ seccomp_filter_free(current->seccomp.deferred);
+ current->seccomp.deferred = tmp;
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -605,6 +641,12 @@ static u32 __seccomp_phase1_filter(int
this_syscall, struct seccomp_data *sd)
return filter_ret; /* Save the rest for phase 2. */

case SECCOMP_RET_ALLOW:
+ if (unlikely(current->seccomp.deferred) &&
+ this_syscall == __NR_seccomp_execve) {
+ if (seccomp_attach_deferred()) {
+ do_exit(SIGKILL);
+ }
+ }
return SECCOMP_PHASE1_OK;

case SECCOMP_RET_KILL:
@@ -815,6 +857,10 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return -EINVAL;
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER:
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+ flags & SECCOMP_FILTER_FLAG_DEFER) {
+ return -EINVAL;
+ }
return seccomp_set_mode_filter(flags, uargs);
default:
return -EINVAL;
diff --git a/tools/testing/selftests/seccomp/Makefile
b/tools/testing/selftests/seccomp/Makefile
index 8401e87..69bf673 100644
--- a/tools/testing/selftests/seccomp/Makefile
+++ b/tools/testing/selftests/seccomp/Makefile
@@ -1,4 +1,5 @@
TEST_PROGS := seccomp_bpf
+TEST_PROGS += seccomp_defer_test
CFLAGS += -Wl,-no-as-needed -Wall
LDFLAGS += -lpthread

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index c5abe7f..946060a 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -2095,6 +2095,111 @@ TEST(syscall_restart)
_metadata->passed = 0;
}

+TEST(seccomp_deferred)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mmap2, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mprotect, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_brk, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_open, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_close, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_munmap, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigaction, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigprocmask, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_fstat64, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_set_thread_area, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ENOSYS),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ uid_t uid;
+ int status;
+ pid_t pid;
+ int i;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* fail if tsync is set */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_DEFER,
+ &prog);
+ EXPECT_NE(0, ret) {
+ TH_LOG("tsync + defer not implemented");
+ }
+
+ pid = fork();
+ if (pid == 0) { /* exec thread */
+ char *env[] = {NULL};
+ char *arg[] = {NULL};
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_DEFER,
+ &prog);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("seccomp syscall failed %s", strerror(errno));
+ }
+
+ /* should allow any system call, try one! */
+ EXPECT_NE(-1, (uid = getuid())) {
+ TH_LOG("getuid error: %s\n", strerror(errno));
+ }
+
+ /*exec'd program should return 57 when it fails write syscall*/
+ if (execve("seccomp_defer_test", arg, env)) {
+ ASSERT_EQ(0, 1) {
+ printf("exec failed: %s\n", strerror(errno));
+ }
+ }
+ abort();
+ }
+ EXPECT_NE(-1, pid) {
+ printf("fork error: %s\n", strerror(errno));
+ }
+
+ for (i = 0; i < 20; ++i) {
+ ret = waitpid(pid, &status, WNOHANG);
+ EXPECT_NE(-1, ret) {
+ TH_LOG("waitpid error: %s\n", strerror(errno));
+ }
+ if (ret == pid) {
+ EXPECT_NE(0, WIFEXITED(status)) {
+ printf("process did not exit normally\n");
+ }
+ EXPECT_EQ(57, WEXITSTATUS(status)) {
+ printf("unexpected exit status: %d\n",
+ WEXITSTATUS(status));
+ }
+ break;
+ }
+ usleep(25000);
+ }
+ EXPECT_NE(i, 20) {
+ printf("process timed out\n");
+ }
+}
+
/*
* TODO:
* - add microbenchmarks
diff --git a/tools/testing/selftests/seccomp/seccomp_defer_test.c
b/tools/testing/selftests/seccomp/seccomp_defer_test.c
new file mode 100644
index 0000000..1c62581
--- /dev/null
+++ b/tools/testing/selftests/seccomp/seccomp_defer_test.c
@@ -0,0 +1,15 @@
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+int main(int argc, char *argv[])
+{
+ char str[] = "this should fail\n";
+ if (write(STDOUT_FILENO, str, sizeof(str)) == -1) {
+ if (errno == ENOSYS) {
+ return 57; /* expected behavior */
+ }
+ }
+ return -1;
+}
--
1.8.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/