[PATCH v3 seccomp 2/5] seccomp/cache: Add "emulator" to check if filter is constant allow

From: YiFei Zhu
Date: Wed Sep 30 2020 - 11:22:47 EST


From: YiFei Zhu <yifeifz2@xxxxxxxxxxxx>

SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not
access any syscall arguments or instruction pointer. To facilitate
this we need a static analyser to know whether a filter will
return allow regardless of syscall arguments for a given
architecture number / syscall number pair. This is implemented
here with a pseudo-emulator, and stored in a per-filter bitmap.

Each common BPF instruction are emulated. Any weirdness or loading
from a syscall argument will cause the emulator to bail.

The emulation is also halted if it reaches a return. In that case,
if it returns an SECCOMP_RET_ALLOW, the syscall is marked as good.

Emulator structure and comments are from Kees [1] and Jann [2].

Emulation is done at attach time. If a filter depends on more
filters, and if the dependee does not guarantee to allow the
syscall, then we skip the emulation of this syscall.

[1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@xxxxxxxxxxxx/
[2] https://lore.kernel.org/lkml/CAG48ez1p=dR_2ikKq=xVxkoGg0fYpTBpkhJSv1w-6BG=76PAvw@xxxxxxxxxxxxxx/

Signed-off-by: YiFei Zhu <yifeifz2@xxxxxxxxxxxx>
---
arch/Kconfig | 34 ++++++++++
arch/x86/Kconfig | 1 +
kernel/seccomp.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 21a3675a7a3a..ca867b2a5d71 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -471,6 +471,14 @@ config HAVE_ARCH_SECCOMP_FILTER
results in the system call being skipped immediately.
- seccomp syscall wired up

+config HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
+ bool
+ help
+ An arch should select this symbol if it provides all of these things:
+ - all the requirements for HAVE_ARCH_SECCOMP_FILTER
+ - SECCOMP_ARCH_DEFAULT
+ - SECCOMP_ARCH_DEFAULT_NR
+
config SECCOMP
prompt "Enable seccomp to safely execute untrusted bytecode"
def_bool y
@@ -498,6 +506,32 @@ config SECCOMP_FILTER

See Documentation/userspace-api/seccomp_filter.rst for details.

+choice
+ prompt "Seccomp filter cache"
+ default SECCOMP_CACHE_NONE
+ depends on SECCOMP_FILTER
+ depends on HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
+ help
+ Seccomp filters can potentially incur large overhead for each
+ system call. This can alleviate some of the overhead.
+
+ If in doubt, select 'syscall numbers only'.
+
+config SECCOMP_CACHE_NONE
+ bool "None"
+ help
+ No caching is done. Seccomp filters will be called each time
+ a system call occurs in a seccomp-guarded task.
+
+config SECCOMP_CACHE_NR_ONLY
+ bool "Syscall number only"
+ depends on HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
+ help
+ For each syscall number, if the seccomp filter has a fixed
+ result, store that result in a bitmap to speed up system calls.
+
+endchoice
+
config HAVE_ARCH_STACKLEAK
bool
help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1ab22869a765..ff5289228ea5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,6 +150,7 @@ config X86
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ae6b40cc39f4..f09c9e74ae05 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,37 @@ struct notification {
struct list_head notifications;
};

+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_cache_filter_data - container for cache's per-filter data
+ *
+ * Tis struct is ordered to minimize padding holes.
+ *
+ * @syscall_allow_default: A bitmap where each bit represents whether the
+ * filter willalways allow the syscall, for the
+ * default architecture.
+ * @syscall_allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct seccomp_cache_filter_data {
+#ifdef SECCOMP_ARCH_DEFAULT
+ DECLARE_BITMAP(syscall_allow_default, SECCOMP_ARCH_DEFAULT_NR);
+#endif
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(syscall_allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+
+#define SECCOMP_EMU_MAX_PENDING_STATES 64
+#else
+struct seccomp_cache_filter_data { };
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -159,6 +190,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
+ * @cache: container for cache-related data.
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@@ -180,6 +212,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ struct seccomp_cache_filter_data cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@@ -544,7 +577,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY);

if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -610,6 +644,136 @@ seccomp_prepare_user_filter(const char __user *user_filter)
return filter;
}

+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * seccomp_emu_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF programs
+ * @sd: The seccomp data to check against, only syscall number are arch
+ * number are considered constant.
+ */
+static bool seccomp_emu_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int insns;
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ insns = bpf_classic_proglen(fprog);
+ for (pc = 0; pc < insns; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ if (bitmap_prev && !test_bit(nr, bitmap_prev))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ if (seccomp_emu_is_const_allow(fprog, &sd))
+ set_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct seccomp_cache_filter_data *cache = &sfilter->cache;
+ const struct seccomp_cache_filter_data *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+#ifdef SECCOMP_ARCH_DEFAULT
+ seccomp_cache_prepare_bitmap(sfilter, cache->syscall_allow_default,
+ cache_prev ? cache_prev->syscall_allow_default : NULL,
+ SECCOMP_ARCH_DEFAULT_NR,
+ SECCOMP_ARCH_DEFAULT);
+#endif /* SECCOMP_ARCH_DEFAULT */
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->syscall_allow_compat,
+ cache_prev ? cache_prev->syscall_allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -659,6 +823,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);

--
2.28.0