Re: [PATCH v5 net-next 2/3] [RFC] seccomp: convert seccomp to use extended BPF

From: Kees Cook
Date: Wed Mar 05 2014 - 16:42:54 EST


On Tue, Mar 4, 2014 at 7:11 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote:
> On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote:
>> use sk_convert_filter() to convert seccomp BPF into extended BPF
>>
>> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
>> seccomp_rule_add_exact(ctx,...
>> seccomp_rule_add_exact(ctx,...
>> rc = seccomp_load(ctx);
>> for (i = 0; i < 10000000; i++)
>> syscall(199, 100);
>>
>> --x86_64--
>> old BPF: 8.6 seconds
>> 73.38% bench [kernel.kallsyms] [k] sk_run_filter
>> 10.70% bench libc-2.15.so [.] syscall
>> 5.09% bench [kernel.kallsyms] [k] seccomp_bpf_load
>> 1.97% bench [kernel.kallsyms] [k] system_call
>> ext BPF: 5.7 seconds
>> 66.20% bench [kernel.kallsyms] [k] sk_run_filter_ext
>> 16.75% bench libc-2.15.so [.] syscall
>> 3.31% bench [kernel.kallsyms] [k] system_call
>> 2.88% bench [kernel.kallsyms] [k] __secure_computing
>>
>> --i386---
>> old BPF: 5.4 sec
>> ext BPF: 3.8 sec
>>
>> BPF filters generated by seccomp are very branchy, so ext BPF
>> performance is better than old BPF.
>>
>> Performance gains will be even higher when extended BPF JIT
>> is committed.

Very cool! Have you had a chance to compare on ARM too?

>> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx>
>>
>> ---
>> This patch is an RFC to use extended BPF in seccomp.
>> Change it to do it conditionally with a bpf_ext_enable knob?
>
> Kees, Will,
>
> I've played with libseccomp to test this patch and just found
> your other testsuite for bpf+seccomp:
> https://github.com/redpig/seccomp
> It passes as well on x86_64 and i386.

Great! If it's passing those tests, then things should be in good shape.

> Not sure how real all the 'seccomp' and 'libseccomp' bpf filters are.
> Some of them are very short, some very long.
> If average number of filtered syscalls is > 10, then upcoming
> 'ebpf table' will give another performance boost, since single table
> lookup will be faster than long sequence of 'if' conditions.

To take advantage of that, will seccomp need a new (prctl) interface
to pass in a seccomp ebpf?

> Please review.
>
> Thanks
> Alexei
>
>> ---
>> include/linux/seccomp.h | 1 -
>> kernel/seccomp.c | 112 +++++++++++++++++++++--------------------------
>> net/core/filter.c | 5 ---
>> 3 files changed, 51 insertions(+), 67 deletions(-)
>>
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index 6f19cfd1840e..4054b0994071 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>> #ifdef CONFIG_SECCOMP_FILTER
>> extern void put_seccomp_filter(struct task_struct *tsk);
>> extern void get_seccomp_filter(struct task_struct *tsk);
>> -extern u32 seccomp_bpf_load(int off);
>> #else /* CONFIG_SECCOMP_FILTER */
>> static inline void put_seccomp_filter(struct task_struct *tsk)
>> {
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index b7a10048a32c..346c597b44cc 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -55,60 +55,27 @@ struct seccomp_filter {
>> atomic_t usage;
>> struct seccomp_filter *prev;
>> unsigned short len; /* Instruction count */
>> - struct sock_filter insns[];
>> + struct sock_filter_ext insns[];
>> };
>>
>> /* Limit any path through the tree to 256KB worth of instructions. */
>> #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>>
>> -/**
>> - * get_u32 - returns a u32 offset into data
>> - * @data: a unsigned 64 bit value
>> - * @index: 0 or 1 to return the first or second 32-bits
>> - *
>> - * This inline exists to hide the length of unsigned long. If a 32-bit
>> - * unsigned long is passed in, it will be extended and the top 32-bits will be
>> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
>> - * properly returned.
>> - *
>> +/*
>> * Endianness is explicitly ignored and left for BPF program authors to manage
>> * as per the specific architecture.
>> */
>> -static inline u32 get_u32(u64 data, int index)
>> -{
>> - return ((u32 *)&data)[index];
>> -}
>> -
>> -/* Helper for bpf_load below. */
>> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
>> -/**
>> - * bpf_load: checks and returns a pointer to the requested offset
>> - * @off: offset into struct seccomp_data to load from
>> - *
>> - * Returns the requested 32-bits of data.
>> - * seccomp_check_filter() should assure that @off is 32-bit aligned
>> - * and not out of bounds. Failure to do so is a BUG.
>> - */
>> -u32 seccomp_bpf_load(int off)
>> +static void populate_seccomp_data(struct seccomp_data *sd)
>> {
>> struct pt_regs *regs = task_pt_regs(current);
>> - if (off == BPF_DATA(nr))
>> - return syscall_get_nr(current, regs);
>> - if (off == BPF_DATA(arch))
>> - return syscall_get_arch(current, regs);
>> - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
>> - unsigned long value;
>> - int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
>> - int index = !!(off % sizeof(u64));
>> - syscall_get_arguments(current, regs, arg, 1, &value);
>> - return get_u32(value, index);
>> - }
>> - if (off == BPF_DATA(instruction_pointer))
>> - return get_u32(KSTK_EIP(current), 0);
>> - if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
>> - return get_u32(KSTK_EIP(current), 1);
>> - /* seccomp_check_filter should make this impossible. */
>> - BUG();
>> + int i;
>> +
>> + sd->nr = syscall_get_nr(current, regs);
>> + sd->arch = syscall_get_arch(current, regs);
>> + for (i = 0; i < 6; i++)
>> + syscall_get_arguments(current, regs, i, 1,
>> + (unsigned long *)&sd->args[i]);
>> + sd->instruction_pointer = KSTK_EIP(current);
>> }
>>
>> /**
>> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>>
>> switch (code) {
>> case BPF_S_LD_W_ABS:
>> - ftest->code = BPF_S_ANC_SECCOMP_LD_W;
>> + ftest->code = BPF_LDX | BPF_W | BPF_ABS;

So, with this change, the removal of BPF_S_ANC_SECCOMP_LD_W, and the
unconditional use of populate_seccomp_data(), I'm surprised there
isn't a larger hit on small filters. Currently, seccomp only loads
what it needs into the filter (via BPF_S_ANC_SECCOMP_LD_W + offset via
seccomp_bpf_load). Your benchmarks seem to show that this isn't a
problem, though. Is the ebpf gain just that much larger than the
additional unconditional loads happening in populate_seccomp_data()?

>> /* 32-bit aligned and not out of bounds. */
>> if (k >= sizeof(struct seccomp_data) || k & 3)
>> return -EINVAL;
>> continue;
>> case BPF_S_LD_W_LEN:
>> - ftest->code = BPF_S_LD_IMM;
>> + ftest->code = BPF_LD | BPF_IMM;
>> ftest->k = sizeof(struct seccomp_data);
>> continue;
>> case BPF_S_LDX_W_LEN:
>> - ftest->code = BPF_S_LDX_IMM;
>> + ftest->code = BPF_LDX | BPF_IMM;
>> ftest->k = sizeof(struct seccomp_data);
>> continue;
>> /* Explicitly include allowed calls. */
>> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>> case BPF_S_JMP_JGT_X:
>> case BPF_S_JMP_JSET_K:
>> case BPF_S_JMP_JSET_X:
>> + sk_decode_filter(ftest, ftest);
>> continue;
>> default:
>> return -EINVAL;
>> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>> static u32 seccomp_run_filters(int syscall)
>> {
>> struct seccomp_filter *f;
>> + struct seccomp_data sd;
>> u32 ret = SECCOMP_RET_ALLOW;
>>
>> /* Ensure unexpected behavior doesn't result in failing open. */
>> if (WARN_ON(current->seccomp.filter == NULL))
>> return SECCOMP_RET_KILL;
>>
>> + populate_seccomp_data(&sd);
>> +
>> /*
>> * All filters in the list are evaluated and the lowest BPF return
>> * value always takes priority (ignoring the DATA).
>> */
>> for (f = current->seccomp.filter; f; f = f->prev) {
>> - u32 cur_ret = sk_run_filter(NULL, f->insns);
>> + u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>> if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>> ret = cur_ret;
>> }
>> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>> struct seccomp_filter *filter;
>> unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>> unsigned long total_insns = fprog->len;
>> + struct sock_filter *fp;
>> + int new_len;
>> long ret;
>>
>> if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
>> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>> CAP_SYS_ADMIN) != 0)
>> return -EACCES;
>>
>> - /* Allocate a new seccomp_filter */
>> - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
>> - GFP_KERNEL|__GFP_NOWARN);
>> - if (!filter)
>> + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
>> + if (!fp)
>> return -ENOMEM;
>> - atomic_set(&filter->usage, 1);
>> - filter->len = fprog->len;
>>
>> /* Copy the instructions from fprog. */
>> ret = -EFAULT;
>> - if (copy_from_user(filter->insns, fprog->filter, fp_size))
>> - goto fail;
>> + if (copy_from_user(fp, fprog->filter, fp_size))
>> + goto free_prog;
>>
>> /* Check and rewrite the fprog via the skb checker */
>> - ret = sk_chk_filter(filter->insns, filter->len);
>> + ret = sk_chk_filter(fp, fprog->len);
>> if (ret)
>> - goto fail;
>> + goto free_prog;
>>
>> /* Check and rewrite the fprog for seccomp use */
>> - ret = seccomp_check_filter(filter->insns, filter->len);
>> + ret = seccomp_check_filter(fp, fprog->len);
>> if (ret)
>> - goto fail;
>> + goto free_prog;
>> +
>> + /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
>> + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
>> + if (ret)
>> + goto free_prog;
>> +
>> + /* Allocate a new seccomp_filter */
>> + filter = kzalloc(sizeof(struct seccomp_filter) +
>> + sizeof(struct sock_filter_ext) * new_len,
>> + GFP_KERNEL|__GFP_NOWARN);
>> + if (!filter)
>> + goto free_prog;
>> +
>> + ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);

Seems like it'd be more readable to set filter->len = new_len first
and use &filter->len as the last argument here. I would find that more
readable, but if nothing else needs changing in the series, this is
fine to leave as-is.

>> + if (ret)
>> + goto free_filter;
>> + atomic_set(&filter->usage, 1);
>> + filter->len = new_len;
>>
>> /*
>> * If there is an existing filter, make it the prev and don't drop its
>> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>> filter->prev = current->seccomp.filter;
>> current->seccomp.filter = filter;
>> return 0;
>> -fail:
>> +
>> +free_filter:
>> kfree(filter);
>> +free_prog:
>> + kfree(fp);
>> return ret;
>> }
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 1494421486b7..f144a6a7d939 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -388,11 +388,6 @@ load_b:
>> A = 0;
>> continue;
>> }
>> -#ifdef CONFIG_SECCOMP_FILTER
>> - case BPF_S_ANC_SECCOMP_LD_W:
>> - A = seccomp_bpf_load(fentry->k);
>> - continue;
>> -#endif
>> default:
>> WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>> fentry->code, fentry->jt,
>> --
>> 1.7.9.5
>>

Cool!

Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx>

-Kees

--
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/