Re: [PATCH 1/2] riscv: add support for SECCOMP incl. filters

From: Kees Cook
Date: Thu Dec 06 2018 - 11:48:11 EST


On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
<david.abdurachmanov@xxxxxxxxx> wrote:
>
> The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@xxxxxxxxx>
> ---
> arch/riscv/Kconfig | 14 ++++++++++++++
> arch/riscv/include/asm/thread_info.h | 5 ++++-
> arch/riscv/kernel/entry.S | 27 +++++++++++++++++++++++++--
> arch/riscv/kernel/ptrace.c | 8 ++++++++
> 4 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a4f48f757204..49cd8e251547 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -29,6 +29,7 @@ config RISCV
> select GENERIC_SMP_IDLE_THREAD
> select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
> select HAVE_ARCH_AUDITSYSCALL
> + select HAVE_ARCH_SECCOMP_FILTER
> select HAVE_MEMBLOCK_NODE_MAP
> select HAVE_DMA_CONTIGUOUS
> select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -228,6 +229,19 @@ menu "Kernel features"
>
> source "kernel/Kconfig.hz"
>
> +config SECCOMP
> + bool "Enable seccomp to safely compute untrusted bytecode"
> + help
> + This kernel feature is useful for number crunching applications
> + that may need to compute untrusted bytecode during their
> + execution. By using pipes or other transports made available to
> + the process as file descriptors supporting the read/write
> + syscalls, it's possible to isolate those applications in
> + their own address space using seccomp. Once seccomp is
> + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> + and the task is only allowed to execute a few safe syscalls
> + defined by each seccomp mode.
> +
> endmenu
>
> menu "Boot options"
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 1c9cc8389928..1fd6e4130cab 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -81,6 +81,7 @@ struct thread_info {
> #define TIF_MEMDIE 5 /* is terminating due to OOM killer */
> #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
> #define TIF_SYSCALL_AUDIT 7 /* syscall auditing */
> +#define TIF_SECCOMP 8 /* syscall secure computing */
>
> #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
> #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
> @@ -88,11 +89,13 @@ struct thread_info {
> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
> #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
> #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP (1 << TIF_SECCOMP)
>
> #define _TIF_WORK_MASK \
> (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
> #define _TIF_SYSCALL_WORK \
> - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT \
> + _TIF_SECCOMP )
>
> #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 355166f57205..e88ccbfa61ee 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -207,8 +207,25 @@ check_syscall_nr:
> /* Check to make sure we don't jump to a bogus syscall number. */
> li t0, __NR_syscalls
> la s0, sys_ni_syscall
> - /* Syscall number held in a7 */
> - bgeu a7, t0, 1f
> + /*
> + * The tracer can change syscall number to valid/invalid value.
> + * We use syscall_set_nr helper in syscall_trace_enter thus we
> + * cannot trust the current value in a7 and have to reload from
> + * the current task pt_regs.
> + */
> + REG_L a7, PT_A7(sp)
> + /*
> + * Syscall number held in a7.
> + * If syscall number is above allowed value, redirect to ni_syscall.
> + */
> + bge a7, t0, 1f
> + /*
> + * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> + * If yes, we pretend it was executed.
> + */
> + li t1, -1
> + beq a7, t1, ret_from_syscall_rejected
> + /* Call syscall */
> la s0, sys_call_table
> slli t0, a7, RISCV_LGPTR
> add s0, s0, t0
> @@ -219,6 +236,12 @@ check_syscall_nr:
> ret_from_syscall:
> /* Set user a0 to kernel a0 */
> REG_S a0, PT_A0(sp)
> + /*
> + * We didn't execute the actual syscall.
> + * Seccomp already set return value for the current task pt_regs.
> + * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> + */
> +ret_from_syscall_rejected:
> /* Trace syscalls, but only if requested by the user. */
> REG_L t0, TASK_TI_FLAGS(tp)
> andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index c1b51539c3e2..598e48b8ca2b 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -160,6 +160,14 @@ void do_syscall_trace_enter(struct pt_regs *regs)
> if (tracehook_report_syscall_entry(regs))
> syscall_set_nr(current, regs, -1);
>
> + /*
> + * Do the secure computing after ptrace; failures should be fast.
> + * If this fails we might have return value in a0 from seccomp
> + * (via SECCOMP_RET_ERRNO/TRACE).
> + */
> + if (secure_computing(NULL) == -1)
> + syscall_set_nr(current, regs, -1);

If secure_computing() returns -1, do_syscall_trace_enter() should return
immediately -- it should not go on to call trace_sys_enter(), etc.

-Kees

> +
> #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> trace_sys_enter(regs, syscall_get_nr(current, regs));
> --
> 2.19.2
>


--
Kees Cook