Re: [PATCH v2 net-next 2/4] net: add skeleton of bpfilter kernel module

From: Edward Cree
Date: Thu May 03 2018 - 10:24:15 EST


On 03/05/18 05:36, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
>
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
> is converted into bpfilter_umh.o object file
> with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
> Example:
> $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
> 0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
> 0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
> 0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
>
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
>
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipe provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.
>
> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
>
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
> include/linux/bpfilter.h | 15 +++++++
> include/uapi/linux/bpfilter.h | 21 ++++++++++
> net/Kconfig | 2 +
> net/Makefile | 1 +
> net/bpfilter/Kconfig | 17 ++++++++
> net/bpfilter/Makefile | 24 +++++++++++
> net/bpfilter/bpfilter_kern.c | 93 +++++++++++++++++++++++++++++++++++++++++++
> net/bpfilter/main.c | 63 +++++++++++++++++++++++++++++
> net/bpfilter/msgfmt.h | 17 ++++++++
> net/ipv4/Makefile | 2 +
> net/ipv4/bpfilter/Makefile | 2 +
> net/ipv4/bpfilter/sockopt.c | 42 +++++++++++++++++++
> net/ipv4/ip_sockglue.c | 17 ++++++++
> 13 files changed, 316 insertions(+)
> create mode 100644 include/linux/bpfilter.h
> create mode 100644 include/uapi/linux/bpfilter.h
> create mode 100644 net/bpfilter/Kconfig
> create mode 100644 net/bpfilter/Makefile
> create mode 100644 net/bpfilter/bpfilter_kern.c
> create mode 100644 net/bpfilter/main.c
> create mode 100644 net/bpfilter/msgfmt.h
> create mode 100644 net/ipv4/bpfilter/Makefile
> create mode 100644 net/ipv4/bpfilter/sockopt.c
>
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> + unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> + int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> + BPFILTER_IPT_SO_SET_REPLACE = 64,
> + BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> + BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> + BPFILTER_IPT_SO_GET_INFO = 64,
> + BPFILTER_IPT_SO_GET_ENTRIES = 65,
> + BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> + BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> + BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>
> endif
>
> +source "net/bpfilter/Kconfig"
> +
> source "net/dccp/Kconfig"
> source "net/sctp/Kconfig"
> source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
> obj-$(CONFIG_XFRM) += xfrm/
> obj-$(CONFIG_UNIX) += unix/
> obj-$(CONFIG_NET) += ipv6/
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> obj-$(CONFIG_PACKET) += packet/
> obj-$(CONFIG_NET_KEY) += key/
> obj-$(CONFIG_BRIDGE) += bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> + bool "BPF based packet filtering framework (BPFILTER)"
> + default n
> + depends on NET && BPF
> + help
> + This builds experimental bpfilter framework that is aiming to
> + provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> + tristate "bpftiler kernel module with user mode helper"
Typo: "bpftiler" should be "bpfilter".
> + default m
> + depends on m
> + help
> + This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> + cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> + $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> + -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> + --rename-section .data=.init.rodata $< $@
> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> + $(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> + struct task_struct *tsk;
> +
> + tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> + if (tsk)
> + force_sig(SIGKILL, tsk);
> + fput(info->pipe_to_umh);
> + fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> + if (bpfilter_process_sockopt) {
I worry about locking here. Is it possible for two calls to
 bpfilter_process_sockopt() to run in parallel, both fail, and thus both
 call stop_umh()? And if both end up calling shutdown_umh(), we double
 fput().
> + bpfilter_process_sockopt = NULL;
> + shutdown_umh(&info);
> + }
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set)
> +{
> + struct mbox_request req;
> + struct mbox_reply reply;
> + loff_t pos;
> + ssize_t n;
> +
> + req.is_set = is_set;
> + req.pid = current->pid;
> + req.cmd = optname;
> + req.addr = (long)optval;
> + req.len = optlen;
> + n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> + if (n != sizeof(req)) {
> + pr_err("write fail %zd\n", n);
> + stop_umh();
> + return -EFAULT;
> + }
> + pos = 0;
> + n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> + if (n != sizeof(reply)) {
> + pr_err("read fail %zd\n", n);
> + stop_umh();
> + return -EFAULT;
> + }
> + return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> + int err;
> +
> + err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> + if (err)
> + return err;
> + pr_info("Loaded umh pid %d\n", info.pid);
> + bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> + if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
> + stop_umh();
> + return -EFAULT;
> + }
> + return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> + stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> + switch (cmd->cmd) {
> + case 0:
> + return 0;
> + default:
> + break;
> + }
> + return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> + return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> + while (1) {
> + struct mbox_request req;
> + struct mbox_reply reply;
> + int n;
> +
> + n = read(0, &req, sizeof(req));
> + if (n != sizeof(req)) {
> + dprintf(debug_fd, "invalid request %d\n", n);
> + return;
> + }
> +
> + reply.status = req.is_set ?
> + handle_set_cmd(&req) :
> + handle_get_cmd(&req);
> +
> + n = write(1, &reply, sizeof(reply));
> + if (n != sizeof(reply)) {
> + dprintf(debug_fd, "reply failed %d\n", n);
> + return;
> + }
> + }
> +}
> +
> +int main(void)
> +{
> + debug_fd = open("/dev/console", 00000002 | 00000100);
Should probably handle failure of this open() call.
> + dprintf(debug_fd, "Started bpfilter\n");
> + loop();
> + close(debug_fd);
> + return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
Another "bpftiler" typo here; this should be:
+#define _NET_BPFILTER_MSGFMT_H

-Ed
> +
> +struct mbox_request {
> + __u64 addr;
> + __u32 len;
> + __u32 is_set;
> + __u32 cmd;
> + __u32 pid;
> +};
> +
> +struct mbox_reply {
> + __u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y := route.o inetpeer.o protocol.o \
> inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
> metrics.o
>
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
> obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
> obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
> obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> + unsigned int optlen, bool is_set)
> +{
> + if (!bpfilter_process_sockopt) {
> + int err = request_module("bpfilter");
> +
> + if (err)
> + return err;
> + if (!bpfilter_process_sockopt)
> + return -ECHILD;
> + }
> + return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> + unsigned int optlen)
> +{
> + return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> + int __user *optlen)
> +{
> + int len;
> +
> + if (get_user(len, optlen))
> + return -EFAULT;
> +
> + return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
> #include <linux/errqueue.h>
> #include <linux/uaccess.h>
>
> +#include <linux/bpfilter.h>
> +
> /*
> * SOL_IP control messages.
> */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
> return -ENOPROTOOPT;
>
> err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> + optname < BPFILTER_IPT_SET_MAX)
> + err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
> int err;
>
> err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> + optname < BPFILTER_IPT_GET_MAX)
> + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
> err = do_ip_getsockopt(sk, level, optname, optval, optlen,
> MSG_CMSG_COMPAT);
>
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> + optname < BPFILTER_IPT_GET_MAX)
> + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&