[PATCH net-next 3/3] tap: XDP support

From: Jason Wang
Date: Thu Jul 27 2017 - 05:26:05 EST


This patch tries to implement XDP for tun. The implementation was
split into two parts:

- fast path: small and no gso packet. We try to do XDP at page level
before build_skb(). For XDP_TX, since creating/destroying queues
were completely under control of userspace, it was implemented
through generic XDP helper after skb has been built. This could be
optimized in the future.
- slow path: big or gso packet. We try to do it after skb was created
through generic XDP helpers.

XDP_REDIRECT was not implemented, it could be done on top.

xdp1 test shows 47.6% improvement:

Before: ~2.1Mpps
After: ~3.1Mpps

Suggested-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 142 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index de1a2ef..13abe23 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,8 @@
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>

#include <linux/uaccess.h>

@@ -105,7 +107,8 @@ do { \
} while (0)
#endif

-#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)

/* TUN device flags */

@@ -224,6 +227,7 @@ struct tun_struct {
u32 flow_count;
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
+ struct bpf_prog __rcu *xdp_prog;
};

#ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
static void tun_detach_all(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
struct tun_file *tfile, *tmp;
int i, n = tun->numqueues;

@@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
}
BUG_ON(tun->numdisabled != 0);

+ if (xdp_prog)
+ bpf_prog_put(xdp_prog);
+
if (tun->flags & IFF_PERSIST)
module_put(THIS_MODULE);
}
@@ -1008,6 +1016,56 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_dropped = tx_dropped;
}

+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *old_prog;
+
+ /* We will shift the packet that can't be handled to generic
+ * XDP layer.
+ */
+
+ old_prog = rtnl_dereference(tun->xdp_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ rcu_assign_pointer(tun->xdp_prog, prog);
+
+ if (prog) {
+ prog = bpf_prog_add(prog, 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+
+ return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ const struct bpf_prog *xdp_prog;
+
+ xdp_prog = rtnl_dereference(tun->xdp_prog);
+ if (xdp_prog)
+ return xdp_prog->aux->id;
+
+ return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return tun_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_QUERY_PROG:
+ xdp->prog_id = tun_xdp_query(dev);
+ xdp->prog_attached = !!xdp->prog_id;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -1038,6 +1096,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_xdp = tun_xdp,
};

static void tun_flow_init(struct tun_struct *tun)
@@ -1217,16 +1276,21 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}

-static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+ struct tun_file *tfile,
struct iov_iter *from,
- int len)
+ struct virtio_net_hdr *hdr,
+ int len, int *generic_xdp)
{
struct page_frag *alloc_frag = &tfile->alloc_frag;
struct sk_buff *skb;
+ struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ unsigned int delta = 0;
char *buf;
size_t copied;
+ bool xdp_xmit = false;

if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
return ERR_PTR(-ENOMEM);
@@ -1238,16 +1302,68 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile,
if (copied != len)
return ERR_PTR(-EFAULT);

+ if (hdr->gso_type)
+ *generic_xdp = 1;
+ else
+ *generic_xdp = 0;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog && !*generic_xdp) {
+ struct xdp_buff xdp;
+ void *orig_data;
+ u32 act;
+
+ xdp.data_hard_start = buf;
+ xdp.data = buf + TUN_RX_PAD;
+ xdp.data_end = xdp.data + len;
+ orig_data = xdp.data;
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ switch (act) {
+ case XDP_TX:
+ xdp_xmit = true;
+ /* fall through */
+ case XDP_PASS:
+ delta = orig_data - xdp.data;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ goto err_xdp;
+ }
+ }
+
skb = build_skb(buf, buflen);
- if (!skb)
+ if (!skb) {
+ rcu_read_unlock();
return ERR_PTR(-ENOMEM);
+ }

- skb_reserve(skb, TUN_RX_PAD);
- skb_put(skb, len);
+ skb_reserve(skb, TUN_RX_PAD - delta);
+ skb_put(skb, len + delta);
get_page(alloc_frag->page);
alloc_frag->offset += buflen;

+ if (xdp_xmit) {
+ skb->dev = tun->dev;
+ generic_xdp_tx(skb, xdp_prog);
+ rcu_read_lock();
+ return NULL;
+ }
+
+ rcu_read_unlock();
+
return skb;
+
+err_xdp:
+ rcu_read_unlock();
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return NULL;
}

/* Get packet from user space buffer */
@@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
bool zerocopy = false;
int err;
u32 rxhash;
+ int generic_xdp = 1;

if (!(tun->dev->flags & IFF_UP))
return -EIO;
@@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}

if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
- skb = tun_build_skb(tfile, from, len);
+ skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
+ if (!skb)
+ return total_len;
} else {
if (!zerocopy) {
copylen = len;
@@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);

+ if (generic_xdp) {
+ struct bpf_prog *xdp_prog;
+ int ret;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ ret = do_xdp_generic(xdp_prog, skb);
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return total_len;
+ }
+ }
+ rcu_read_unlock();
+ }
+
rxhash = __skb_get_hash_symmetric(skb);
#ifndef CONFIG_4KSTACKS
tun_rx_batched(tun, tfile, skb, more);
--
2.7.4