Re: [PATCH 3/6] seccomp: Implement constant action bitmaps

From: Jann Horn
Date: Wed Sep 23 2020 - 20:26:00 EST


On Thu, Sep 24, 2020 at 1:29 AM Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> One of the most common pain points with seccomp filters has been dealing
> with the overhead of processing the filters, especially for "always allow"
> or "always reject" cases.

The "always reject" cases don't need to be fast, in particular not the
kill_thread/kill_process ones. Nobody's going to have "process kills
itself by executing a forbidden syscall" on a critical hot codepath.

> While BPF is extremely fast[1], it will always
> have overhead associated with it. Additionally, due to seccomp's design,
> filters are layered, which means processing time goes up as the number
> of filters attached goes up.
[...]
> In order to build this mapping at filter attach time, each filter is
> executed for every syscall (under each possible architecture), and
> checked for any accesses of struct seccomp_data that are not the "arch"
> nor "nr" (syscall) members. If only "arch" and "nr" are examined, then
> there is a constant mapping for that syscall, and bitmaps can be updated
> accordingly. If any accesses happen outside of those struct members,
> seccomp must not bypass filter execution for that syscall, since program
> state will be used to determine filter action result. (This logic comes
> in the next patch.)
>
> [1] https://lore.kernel.org/bpf/20200531171915.wsxvdjeetmhpsdv2@xxxxxxxxxxxxxxxxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/bpf/20200601101137.GA121847@gardel-login/
> [3] https://lore.kernel.org/bpf/717a06e7f35740ccb4c70470ec70fb2f@xxxxxxxxxx/
>
> Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
> ---
> include/linux/seccomp.h | 18 ++++
> kernel/seccomp.c | 207 +++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 221 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 0be20bc81ea9..96df2f899e3d 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -25,6 +25,17 @@
> #define SECCOMP_ARCH_IS_MULTIPLEX 3
> #define SECCOMP_ARCH_IS_UNKNOWN 0xff
>
> +/* When no bits are set for a syscall, filters are run. */
> +struct seccomp_bitmaps {
> +#ifdef SECCOMP_ARCH
> + /* "allow" are initialized to set and only ever get cleared. */
> + DECLARE_BITMAP(allow, NR_syscalls);

This bitmap makes sense.

The "NR_syscalls" part assumes that the compat syscall tables will not
be bigger than the native syscall table, right? I guess that's usually
mostly true nowadays, thanks to the syscall table unification...
(might be worth a comment though)
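
If it's supposed to be a hard assumption, it could even be enforced at
build time. Rough, untested sketch (the name of the compat syscall count
macro is arch-specific; __NR_compat_syscalls is the arm64 spelling):

#if defined(CONFIG_COMPAT) && defined(__NR_compat_syscalls)
/* The bitmaps are sized by the native table, so compat has to fit in it. */
static_assert(__NR_compat_syscalls <= NR_syscalls);
#endif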

> + /* These are initialized to clear and only ever get set. */
> + DECLARE_BITMAP(kill_thread, NR_syscalls);
> + DECLARE_BITMAP(kill_process, NR_syscalls);

I don't think these bitmaps make sense; they're not part of any fastpath.

(However, a "which syscalls have a fixed result" bitmap might make
sense if we want to export the list of permitted syscalls as a text
file in procfs, as I mentioned over at
<https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@xxxxxxxxxxxxxx/>.)

> +#endif
> +};
> +
> struct seccomp_filter;
> /**
> * struct seccomp - the state of a seccomp'ed process
> @@ -45,6 +56,13 @@ struct seccomp {
> #endif
> atomic_t filter_count;
> struct seccomp_filter *filter;
> + struct seccomp_bitmaps native;
> +#ifdef CONFIG_COMPAT
> + struct seccomp_bitmaps compat;
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> + struct seccomp_bitmaps multiplex;
> +#endif

Why do we have one bitmap per thread (in struct seccomp) instead of
putting the bitmap for a given filter and all its ancestors into the
seccomp_filter?
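
I.e., roughly something like this (just a sketch, reusing the member
names from this patch):

struct seccomp_filter {
        /* ... existing members (refs, prev, prog, ...) ... */
        struct seccomp_bitmaps native;
#ifdef CONFIG_COMPAT
        struct seccomp_bitmaps compat;
#endif
#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
        struct seccomp_bitmaps multiplex;
#endif
};

Then the fast path would look at current->seccomp.filter->native (and so
on), the bitmaps could be computed once when the filter is prepared, and
threads that share a filter would automatically share the bitmaps, so
seccomp_sync_threads() wouldn't have to copy anything.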

> };
>
> #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 0a3ff8eb8aea..111a238bc532 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -318,7 +318,7 @@ static inline u8 seccomp_get_arch(u32 syscall_arch, u32 syscall_nr)
>
> #ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> if (syscall_arch == SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH) {
> - seccomp_arch |= (sd->nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >>
> + seccomp_arch |= (syscall_nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >>
> SECCOMP_MULTIPLEXED_SYSCALL_TABLE_SHIFT;

This fix belongs in patch 1.

> }
> #endif
> @@ -559,6 +559,21 @@ static inline void seccomp_sync_threads(unsigned long flags)
> atomic_set(&thread->seccomp.filter_count,
> atomic_read(&thread->seccomp.filter_count));
>
> +                /* Copy syscall filter bitmaps. */
> +                memcpy(&thread->seccomp.native,
> +                       &caller->seccomp.native,
> +                       sizeof(caller->seccomp.native));
> +#ifdef CONFIG_COMPAT
> +                memcpy(&thread->seccomp.compat,
> +                       &caller->seccomp.compat,
> +                       sizeof(caller->seccomp.compat));
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> +                memcpy(&thread->seccomp.multiplex,
> +                       &caller->seccomp.multiplex,
> +                       sizeof(caller->seccomp.multiplex));
> +#endif

This part wouldn't be necessary if the bitmasks were part of the
seccomp_filter...

> /*
> * Don't let an unprivileged task work around
> * the no_new_privs restriction by creating
> @@ -661,6 +676,114 @@ seccomp_prepare_user_filter(const char __user *user_filter)
> return filter;
> }
>
> +static inline bool sd_touched(pte_t *ptep)
> +{
> +        return !!pte_young(*(READ_ONCE(ptep)));
> +}

I think this is left over from the previous version and should've been removed?

[...]
> +/*
> + * Walk everyone syscall combination for this arch/mask combo and update

nit: "Walk every possible", or something like that

> + * the bitmaps with any results.
> + */
> +static void seccomp_update_bitmap(struct seccomp_filter *filter,
> + void *pagepair, u32 arch, u32 mask,
> + struct seccomp_bitmaps *bitmaps)
[...]
> @@ -970,6 +1097,65 @@ static int seccomp_do_user_notification(int this_syscall,
> return -1;
> }
>
> +#ifdef SECCOMP_ARCH
> +static inline bool __bypass_filter(struct seccomp_bitmaps *bitmaps,
> +                                   u32 nr, u32 *filter_ret)
> +{
> +        if (nr < NR_syscalls) {
> +                if (test_bit(nr, bitmaps->allow)) {
> +                        *filter_ret = SECCOMP_RET_ALLOW;
> +                        return true;
> +                }
> +                if (test_bit(nr, bitmaps->kill_process)) {
> +                        *filter_ret = SECCOMP_RET_KILL_PROCESS;
> +                        return true;
> +                }
> +                if (test_bit(nr, bitmaps->kill_thread)) {
> +                        *filter_ret = SECCOMP_RET_KILL_THREAD;
> +                        return true;
> +                }

The checks against ->kill_process and ->kill_thread won't make
anything faster, but since they will run in the fastpath, they'll
probably actually contribute to making things *slower*.

> +        }
> +        return false;
> +}
> +
> +static inline u32 check_syscall(const struct seccomp_data *sd,
> +                                struct seccomp_filter **match)
> +{
> +        u32 filter_ret = SECCOMP_RET_KILL_PROCESS;
> +        u8 arch = seccomp_get_arch(sd->arch, sd->nr);
> +
> +        switch (arch) {
> +        case SECCOMP_ARCH_IS_NATIVE:
> +                if (__bypass_filter(&current->seccomp.native, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#ifdef CONFIG_COMPAT
> +        case SECCOMP_ARCH_IS_COMPAT:
> +                if (__bypass_filter(&current->seccomp.compat, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> +        case SECCOMP_ARCH_IS_MULTIPLEX:
> +                if (__bypass_filter(&current->seccomp.multiplex, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#endif
> +        default:
> +                WARN_ON_ONCE(1);
> +                return filter_ret;
> +        };
> +
> +        return seccomp_run_filters(sd, match);
> +}

You could write this in a less repetitive way, and (especially if we
get rid of the kill_* masks) more compactly:

static inline u32 check_syscall(const struct seccomp_data *sd,
                                struct seccomp_filter **match)
{
        struct seccomp_bitmaps *bitmaps;
        u8 arch = seccomp_get_arch(sd->arch, sd->nr);

        switch (arch) {
        case SECCOMP_ARCH_IS_NATIVE:
                bitmaps = &current->seccomp.native;
                break;
#ifdef CONFIG_COMPAT
        case SECCOMP_ARCH_IS_COMPAT:
                bitmaps = &current->seccomp.compat;
                break;
#endif
#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
        case SECCOMP_ARCH_IS_MULTIPLEX:
                bitmaps = &current->seccomp.multiplex;
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                return SECCOMP_RET_KILL_PROCESS;
        }

        /* Any syscall marked "allow" skips filter execution entirely. */
        if ((unsigned int)sd->nr < NR_syscalls && test_bit(sd->nr, bitmaps->allow))
                return SECCOMP_RET_ALLOW;

        return seccomp_run_filters(sd, match);
}

[...]
> @@ -1625,12 +1812,24 @@ static long seccomp_set_mode_filter(unsigned int flags,
> mutex_lock_killable(&current->signal->cred_guard_mutex))
> goto out_put_fd;
>
> + /*
> + * This memory will be needed for bitmap testing, but we'll
> + * be holding a spinlock at that point. Do the allocation
> + * (and free) outside of the lock.
> + *
> + * Alternative: we could do the bitmap update before attach
> + * to avoid spending too much time under lock.
> + */
> + pagepair = vzalloc(PAGE_SIZE * 2);
> + if (!pagepair)
> + goto out_put_fd;
> +
[...]
> - ret = seccomp_attach_filter(flags, prepared);
> + ret = seccomp_attach_filter(flags, prepared, pagepair);

You probably intended to rip this stuff back out? AFAIU the vzalloc()
stuff is a remnant from the old version that relied on MMU trickery.