[RFC PATCH tip 2/5] Extended BPF JIT for x86-64

From: Alexei Starovoitov
Date: Mon Dec 02 2013 - 23:30:09 EST


Just-In-Time compiler that maps 64-bit BPF instructions to x86-64 instructions.

Most BPF instructions have one to one mapping.

Every BPF register maps to one x86-64 register:
R0 -> rax
R1 -> rdi
R2 -> rsi
R3 -> rdx
R4 -> rcx
R5 -> r8
R6 -> rbx
R7 -> r13
R8 -> r14
R9 -> r15
FP -> rbp

BPF calling convention is defined as:
R0 - return value from in-kernel function
R1-R5 - arguments from BPF program to in-kernel function
R6-R9 - callee saved registers that in-kernel function will preserve
R10 - read-only frame pointer to access stack
so BPF calling convention maps directly to x86-64 calling convention.

Allowing zero-overhead calls between BPF filter and safe kernel functions

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx>
---
arch/x86/Kconfig | 1 +
arch/x86/net/Makefile | 1 +
arch/x86/net/bpf64_jit_comp.c | 625 +++++++++++++++++++++++++++++++++++++++++
arch/x86/net/bpf_jit_comp.c | 23 +-
arch/x86/net/bpf_jit_comp.h | 35 +++
5 files changed, 665 insertions(+), 20 deletions(-)
create mode 100644 arch/x86/net/bpf64_jit_comp.c
create mode 100644 arch/x86/net/bpf_jit_comp.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c84cf90..44b0b11 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -92,6 +92,7 @@ config X86
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
+ select HAVE_BPF64_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c3..c3bb7d5 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -2,3 +2,4 @@
# Arch-specific network modules
#
obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+obj-$(CONFIG_BPF64_JIT) += bpf64_jit_comp.o
diff --git a/arch/x86/net/bpf64_jit_comp.c b/arch/x86/net/bpf64_jit_comp.c
new file mode 100644
index 0000000..5f7c331
--- /dev/null
+++ b/arch/x86/net/bpf64_jit_comp.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2011-2013 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf_jit.h>
+#include <linux/moduleloader.h>
+#include "bpf_jit_comp.h"
+
+static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else
+ *(u32 *)ptr = bytes;
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) (prog = emit_code(prog, (bytes), (len)))
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + \
+ ((b4) << 24), 4)
+/* imm32 is sign extended by cpu */
+#define EMIT1_off32(b1, off) \
+ do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+/* mov A, X */
+#define EMIT_mov(A, X) \
+ EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X))
+
+#define X86_JAE 0x73
+#define X86_JE 0x74
+#define X86_JNE 0x75
+#define X86_JA 0x77
+#define X86_JGE 0x7D
+#define X86_JG 0x7F
+
+static inline bool is_imm8(__s32 value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static inline bool is_simm32(__s64 value)
+{
+ return value == (__s64)(__s32)value;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
+
+#define AUX_REG 32
+
+/* avoid x86-64 R12 which if used as base address in memory access
+ * always needs an extra byte for index */
+static const int reg2hex[] = {
+ [R0] = 0, /* rax */
+ [R1] = 7, /* rdi */
+ [R2] = 6, /* rsi */
+ [R3] = 2, /* rdx */
+ [R4] = 1, /* rcx */
+ [R5] = 0, /* r8 */
+ [R6] = 3, /* rbx callee saved */
+ [R7] = 5, /* r13 callee saved */
+ [R8] = 6, /* r14 callee saved */
+ [R9] = 7, /* r15 callee saved */
+ [__fp__] = 5, /* rbp readonly */
+ [AUX_REG] = 1, /* r9 temp register */
+};
+
+/* is_ereg() == true if r8 <= reg <= r15,
+ * rax,rcx,...,rbp don't need extra byte of encoding */
+static inline bool is_ereg(u32 reg)
+{
+ if (reg == R5 || (reg >= R7 && reg <= R9) || reg == AUX_REG)
+ return true;
+ else
+ return false;
+}
+
+static inline u8 add_1mod(u8 byte, u32 reg)
+{
+ if (is_ereg(reg))
+ byte |= 1;
+ return byte;
+}
+static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+ if (is_ereg(r1))
+ byte |= 1;
+ if (is_ereg(r2))
+ byte |= 4;
+ return byte;
+}
+
+static inline u8 add_1reg(u8 byte, u32 a_reg)
+{
+ return byte + reg2hex[a_reg];
+}
+static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg)
+{
+ return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3);
+}
+
+static u8 *select_bpf_func(struct bpf_program *prog, int id)
+{
+ if (id <= 0 || id >= prog->strtab_size)
+ return NULL;
+ return prog->cb->jit_select_func(prog->strtab, id);
+}
+
+static int do_jit(struct bpf_program *bpf_prog, int *addrs, u8 *image,
+ int oldproglen)
+{
+ struct bpf_insn *insn = bpf_prog->insns;
+ int insn_cnt = bpf_prog->insn_cnt;
+ u8 temp[64];
+ int i;
+ int proglen = 0;
+ u8 *prog = temp;
+ int stacksize = 512;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+
+ /* sub rsp, stacksize */
+ EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+ /* mov qword ptr [rbp-X],rbx */
+ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+ /* mov qword ptr [rbp-X],r13 */
+ EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+ /* mov qword ptr [rbp-X],r14 */
+ EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+ /* mov qword ptr [rbp-X],r15 */
+ EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const __s32 K = insn->imm;
+ __u32 a_reg = insn->a_reg;
+ __u32 x_reg = insn->x_reg;
+ u8 b1 = 0, b2 = 0, b3 = 0;
+ u8 jmp_cond;
+ __s64 jmp_offset;
+ int ilen;
+ u8 *func;
+
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ b1 = 0x48;
+ b3 = 0xC0;
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b2 = 0x01; break;
+ case BPF_SUB: b2 = 0x29; break;
+ case BPF_AND: b2 = 0x21; break;
+ case BPF_OR: b2 = 0x09; break;
+ case BPF_XOR: b2 = 0x31; break;
+ }
+ EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+ add_2reg(b3, a_reg, x_reg));
+ break;
+
+ /* mov A, X */
+ case BPF_ALU | BPF_MOV | BPF_X:
+ EMIT_mov(a_reg, x_reg);
+ break;
+
+ /* neg A */
+ case BPF_ALU | BPF_NEG | BPF_X:
+ EMIT3(add_1mod(0x48, a_reg), 0xF7,
+ add_1reg(0xD8, a_reg));
+ break;
+
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ b1 = add_1mod(0x48, a_reg);
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b3 = 0xC0; break;
+ case BPF_SUB: b3 = 0xE8; break;
+ case BPF_AND: b3 = 0xE0; break;
+ case BPF_OR: b3 = 0xC8; break;
+ }
+
+ if (is_imm8(K))
+ EMIT4(b1, 0x83, add_1reg(b3, a_reg), K);
+ else
+ EMIT3_off32(b1, 0x81, add_1reg(b3, a_reg), K);
+ break;
+
+ case BPF_ALU | BPF_MOV | BPF_K:
+ /* 'mov rax, imm32' sign extends imm32.
+ * possible optimization: if imm32 is positive,
+ * use 'mov eax, imm32' (which zero-extends imm32)
+ * to save 2 bytes */
+ b1 = add_1mod(0x48, a_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K);
+ break;
+
+ /* A %= X
+ * A /= X */
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ /* mov r9, X */
+ EMIT_mov(AUX_REG, x_reg);
+
+ /* mov rax, A */
+ EMIT_mov(R0, a_reg);
+
+ /* xor rdx, rdx */
+ EMIT3(0x48, 0x31, 0xd2);
+
+ /* if X==0, skip divide, make A=0 */
+
+ /* cmp r9, 0 */
+ EMIT4(0x49, 0x83, 0xF9, 0x00);
+
+ /* je .+3 */
+ EMIT2(X86_JE, 3);
+
+ /* div r9 */
+ EMIT3(0x49, 0xF7, 0xF1);
+
+ if (BPF_OP(insn->code) == BPF_MOD) {
+ /* mov r9, rdx */
+ EMIT3(0x49, 0x89, 0xD1);
+ } else {
+ /* mov r9, rax */
+ EMIT3(0x49, 0x89, 0xC1);
+ }
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov A, r9 */
+ EMIT_mov(a_reg, AUX_REG);
+ break;
+
+ /* shifts */
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ b1 = add_1mod(0x48, a_reg);
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT4(b1, 0xC1, add_1reg(b3, a_reg), K);
+ break;
+
+ case BPF_ALU | BPF_BSWAP32 | BPF_X:
+ /* emit 'bswap eax' to swap lower 4-bytes */
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0x0F);
+ else
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, a_reg));
+ break;
+
+ case BPF_ALU | BPF_BSWAP64 | BPF_X:
+ /* emit 'bswap rax' to swap 8-bytes */
+ EMIT3(add_1mod(0x48, a_reg), 0x0F,
+ add_1reg(0xC8, a_reg));
+ break;
+
+ /* ST: *(u8*)(a_reg + off) = imm */
+ case BPF_ST | BPF_REL | BPF_B:
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0xC6);
+ else
+ EMIT1(0xC6);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_H:
+ if (is_ereg(a_reg))
+ EMIT3(0x66, 0x41, 0xC7);
+ else
+ EMIT2(0x66, 0xC7);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_W:
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0xC7);
+ else
+ EMIT1(0xC7);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_DW:
+ EMIT2(add_1mod(0x48, a_reg), 0xC7);
+
+st: if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, a_reg), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, a_reg), insn->off);
+
+ EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+ break;
+
+ /* STX: *(u8*)(a_reg + off) = x_reg */
+ case BPF_STX | BPF_REL | BPF_B:
+ /* emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(a_reg) || is_ereg(x_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ x_reg == R1 || x_reg == R2)
+ EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88);
+ else
+ EMIT1(0x88);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_H:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_W:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89);
+ else
+ EMIT1(0x89);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_DW:
+ EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89);
+stx: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, a_reg, x_reg),
+ insn->off);
+ break;
+
+ /* LDX: a_reg = *(u8*)(x_reg + off) */
+ case BPF_LDX | BPF_REL | BPF_B:
+ /* emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_H:
+ /* emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_W:
+ /* emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_DW:
+ /* emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B);
+ldx: /* if insn->off == 0 we can save one extra byte, but
+ * special case of x86 R13 which always needs an offset
+ * is not worth the pain */
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, x_reg, a_reg),
+ insn->off);
+ break;
+
+ /* STX XADD: lock *(u8*)(a_reg + off) += x_reg */
+ case BPF_STX | BPF_XADD | BPF_B:
+ /* emit 'lock add byte ptr [rax + off], al' */
+ if (is_ereg(a_reg) || is_ereg(x_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ x_reg == R1 || x_reg == R2)
+ EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x00);
+ else
+ EMIT2(0xF0, 0x00);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_H:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT4(0x66, 0xF0, add_2mod(0x40, a_reg, x_reg),
+ 0x01);
+ else
+ EMIT3(0x66, 0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_W:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01);
+ else
+ EMIT2(0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01);
+xadd: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, a_reg, x_reg),
+ insn->off);
+ break;
+
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ func = select_bpf_func(bpf_prog, K);
+ jmp_offset = func - (image + addrs[i]);
+ if (!func || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ K, func, image);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, jmp_offset);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ /* emit 'cmp a_reg, x_reg' insn */
+ b1 = 0x48;
+ b2 = 0x39;
+ b3 = 0xC0;
+ EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+ add_2reg(b3, a_reg, x_reg));
+ goto emit_jump;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ /* emit 'cmp a_reg, imm8/32' */
+ EMIT1(add_1mod(0x48, a_reg));
+
+ if (is_imm8(K))
+ EMIT3(0x83, add_1reg(0xF8, a_reg), K);
+ else
+ EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K);
+
+emit_jump: /* convert BPF opcode to x86 */
+ switch (BPF_OP(insn->code)) {
+ case BPF_JEQ:
+ jmp_cond = X86_JE;
+ break;
+ case BPF_JNE:
+ jmp_cond = X86_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = X86_JA;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = X86_JAE;
+ break;
+ case BPF_JSGT:
+ /* signed '>', GT in x86 */
+ jmp_cond = X86_JG;
+ break;
+ case BPF_JSGE:
+ /* signed '>=', GE in x86 */
+ jmp_cond = X86_JGE;
+ break;
+ default: /* to silence gcc warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+
+ case BPF_JMP | BPF_JA | BPF_X:
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+
+ case BPF_RET | BPF_K:
+ /* mov rbx, qword ptr [rbp-X] */
+ EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+ /* mov r13, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+ /* mov r14, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+ /* mov r15, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ break;
+
+ default:
+ /*pr_debug_bpf_insn(insn, NULL);*/
+ pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (image) {
+ if (proglen + ilen > oldproglen)
+ return -2;
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+void bpf_compile(struct bpf_program *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ int proglen, oldproglen = 0;
+ int *addrs;
+ u8 *image = NULL;
+ int pass;
+ int i;
+
+ if (!prog || !prog->cb || !prog->cb->jit_select_func)
+ return;
+
+ addrs = kmalloc(prog->insn_cnt * sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return;
+
+ for (proglen = 0, i = 0; i < prog->insn_cnt; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ for (pass = 0; pass < 10; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen);
+ if (proglen <= 0) {
+ image = NULL;
+ goto out;
+ }
+ if (image) {
+ if (proglen != oldproglen)
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
+ break;
+ }
+ if (proglen == oldproglen) {
+ header = bpf_alloc_binary(proglen, &image);
+ if (!header)
+ goto out;
+ }
+ oldproglen = proglen;
+ }
+
+ if (image) {
+ bpf_flush_icache(header, image + proglen);
+ set_memory_ro((unsigned long)header, header->pages);
+ }
+out:
+ kfree(addrs);
+ prog->jit_image = (void (*)(struct bpf_context *ctx))image;
+ return;
+}
+
+static void bpf_jit_free_deferred(struct work_struct *work)
+{
+ struct bpf_program *prog = container_of(work, struct bpf_program, work);
+ unsigned long addr = (unsigned long)prog->jit_image & PAGE_MASK;
+ struct bpf_binary_header *header = (void *)addr;
+
+ set_memory_rw(addr, header->pages);
+ module_free(NULL, header);
+ free_bpf_program(prog);
+}
+
+void __bpf_free(struct bpf_program *prog)
+{
+ if (prog->jit_image) {
+ INIT_WORK(&prog->work, bpf_jit_free_deferred);
+ schedule_work(&prog->work);
+ } else {
+ free_bpf_program(prog);
+ }
+}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26328e8..3c35f8d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -13,6 +13,7 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
+#include "bpf_jit_comp.h"

/*
* Conventions :
@@ -112,16 +113,6 @@ do { \
#define SEEN_XREG 2 /* ebx is used */
#define SEEN_MEM 4 /* use mem[] for temporary storage */

-static inline void bpf_flush_icache(void *start, void *end)
-{
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- smp_wmb();
- flush_icache_range((unsigned long)start, (unsigned long)end);
- set_fs(old_fs);
-}
-
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)

@@ -145,16 +136,8 @@ static int pkt_type_offset(void)
return -1;
}

-struct bpf_binary_header {
- unsigned int pages;
- /* Note : for security reasons, bpf code will follow a randomly
- * sized amount of int3 instructions
- */
- u8 image[];
-};
-
-static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
- u8 **image_ptr)
+struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+ u8 **image_ptr)
{
unsigned int sz, hole;
struct bpf_binary_header *header;
diff --git a/arch/x86/net/bpf_jit_comp.h b/arch/x86/net/bpf_jit_comp.h
new file mode 100644
index 0000000..74ff45d
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.h
@@ -0,0 +1,35 @@
+/* bpf_jit_comp.h : BPF filter alloc/free routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef __BPF_JIT_COMP_H
+#define __BPF_JIT_COMP_H
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+struct bpf_binary_header {
+ unsigned int pages;
+ /* Note : for security reasons, bpf code will follow a randomly
+ * sized amount of int3 instructions
+ */
+ u8 image[];
+};
+
+static inline void bpf_flush_icache(void *start, void *end)
+{
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ smp_wmb();
+ flush_icache_range((unsigned long)start, (unsigned long)end);
+ set_fs(old_fs);
+}
+
+struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+ u8 **image_ptr);
+
+#endif
--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/