[PATCH net-next V2] tun: introduce tx skb ring

From: Jason Wang
Date: Wed Jun 15 2016 - 04:38:30 EST


We used to queue tx packets in sk_receive_queue. This is less
efficient since it requires spinlocks to synchronize between producer
and consumer.

This patch tries to address this by:

- introduce a new mode, enabled only when IFF_TX_ARRAY is set, which
switches from sk_receive_queue to a fixed-size skb array with 256
entries.
- introduce a new proto_ops method, peek_len, which is used for peeking
the skb length.
- implement a tun version of peek_len for vhost_net to use and convert
vhost_net to use peek_len if possible.

Pktgen test shows about 18% improvement on guest receiving pps for small
buffers:

Before: ~1220000pps
After : ~1440000pps

The reasons I stick to a new mode are:

- though resize is supported by skb array, in multiqueue mode, it's
not easy to recover from a partial success of queue resizing.
- tx_queue_len is a user visible feature.

Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
- The patch is based on [PATCH v8 0/5] skb_array: array based FIFO for skbs

Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since tests show only a very minor improvement
---
drivers/net/tun.c | 138 ++++++++++++++++++++++++++++++++++++++++----
drivers/vhost/net.c | 16 ++++-
include/linux/net.h | 1 +
include/uapi/linux/if_tun.h | 1 +
4 files changed, 143 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e16487c..b22e475 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
+#include <linux/skb_array.h>

#include <asm/uaccess.h>

@@ -130,6 +131,7 @@ struct tap_filter {
#define MAX_TAP_FLOWS 4096

#define TUN_FLOW_EXPIRE (3 * HZ)
+#define TUN_RING_SIZE 256

struct tun_pcpu_stats {
u64 rx_packets;
@@ -167,6 +169,7 @@ struct tun_file {
};
struct list_head next;
struct tun_struct *detached;
+ struct skb_array tx_array;
};

struct tun_flow_entry {
@@ -513,8 +516,15 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
return tun;
}

-static void tun_queue_purge(struct tun_file *tfile)
+static void tun_queue_purge(struct tun_struct *tun, struct tun_file *tfile)
{
+ struct sk_buff *skb;
+
+ if (tun->flags & IFF_TX_ARRAY) {
+ while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+ kfree_skb(skb);
+ }
+
skb_queue_purge(&tfile->sk.sk_receive_queue);
skb_queue_purge(&tfile->sk.sk_error_queue);
}
@@ -545,7 +555,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
synchronize_net();
tun_flow_delete_by_queue(tun, tun->numqueues + 1);
/* Drop read queue */
- tun_queue_purge(tfile);
+ tun_queue_purge(tun, tfile);
tun_set_real_num_queues(tun);
} else if (tfile->detached && clean) {
tun = tun_enable_queue(tfile);
@@ -560,6 +570,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
+ if (tun && tun->flags & IFF_TX_ARRAY)
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
@@ -596,12 +608,12 @@ static void tun_detach_all(struct net_device *dev)
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
/* Drop read queue */
- tun_queue_purge(tfile);
+ tun_queue_purge(tun, tfile);
sock_put(&tfile->sk);
}
list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
tun_enable_queue(tfile);
- tun_queue_purge(tfile);
+ tun_queue_purge(tun, tfile);
sock_put(&tfile->sk);
}
BUG_ON(tun->numdisabled != 0);
@@ -642,6 +654,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err)
goto out;
}
+
+ if (!tfile->detached && tun->flags & IFF_TX_ARRAY &&
+ skb_array_init(&tfile->tx_array, TUN_RING_SIZE, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +910,13 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)

nf_reset(skb);

- /* Enqueue packet */
- skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ if (tun->flags & IFF_TX_ARRAY) {
+ if (skb_array_produce(&tfile->tx_array, skb))
+ goto drop;
+ } else {
+ /* Enqueue packet */
+ skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ }

/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
@@ -1088,6 +1112,17 @@ static void tun_net_init(struct net_device *dev)
}
}

+static int tun_queue_not_empty(struct tun_struct *tun,
+ struct tun_file *tfile)
+{
+ struct sock *sk = tfile->socket.sk;
+
+ if (tun->flags & IFF_TX_ARRAY)
+ return !skb_array_empty(&tfile->tx_array);
+ else
+ return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
/* Character device part */

/* Poll */
@@ -1107,7 +1142,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)

poll_wait(file, sk_sleep(sk), wait);

- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (tun_queue_not_empty(tun, tfile))
mask |= POLLIN | POLLRDNORM;

if (sock_writeable(sk) ||
@@ -1481,6 +1516,46 @@ done:
return total;
}

+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+ int *err)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb = NULL;
+
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ goto out;
+ if (noblock) {
+ *err = -EAGAIN;
+ goto out;
+ }
+
+ add_wait_queue(&tfile->wq.wait, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ while (1) {
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ break;
+ if (signal_pending(current)) {
+ *err = -ERESTARTSYS;
+ break;
+ }
+ if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+ *err = -EFAULT;
+ break;
+ }
+
+ schedule();
+ };
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+ return skb;
+}
+
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
@@ -1494,9 +1569,13 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
if (!iov_iter_count(to))
return 0;

- /* Read frames from queue */
- skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
- &peeked, &off, &err);
+ if (tun->flags & IFF_TX_ARRAY)
+ skb = tun_ring_recv(tfile, noblock, &err);
+ else
+ /* Read frames from queue */
+ skb = __skb_recv_datagram(tfile->socket.sk,
+ noblock ? MSG_DONTWAIT : 0,
+ &peeked, &off, &err);
if (!skb)
return err;

@@ -1629,8 +1708,39 @@ out:
return ret;
}

+static int tun_peek_len(struct socket *sock)
+{
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct sock *sk = sock->sk;
+ struct tun_struct *tun;
+ int ret = 0;
+
+ tun = __tun_get(tfile);
+ if (!tun)
+ return 0;
+
+ if (tun->flags & IFF_TX_ARRAY) {
+ ret = skb_array_peek_len(&tfile->tx_array);
+ } else {
+ struct sk_buff *head;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ head = skb_peek(&sk->sk_receive_queue);
+ if (likely(head)) {
+ ret = head->len;
+ if (skb_vlan_tag_present(head))
+ ret += VLAN_HLEN;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ tun_put(tun);
+ return ret;
+}
+
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
+ .peek_len = tun_peek_len,
.sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg,
};
@@ -1643,7 +1753,8 @@ static struct proto tun_proto = {

static int tun_flags(struct tun_struct *tun)
{
- return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
+ return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN |
+ IFF_TAP | IFF_TX_ARRAY);
}

static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
@@ -1755,6 +1866,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
} else
return -EINVAL;

+ if (ifr->ifr_flags & IFF_TX_ARRAY)
+ flags |= IFF_TX_ARRAY;
+
if (*ifr->ifr_name)
name = ifr->ifr_name;

@@ -1995,7 +2109,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
* This is needed because we never checked for invalid flags on
* TUNSETIFF.
*/
- return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
+ return put_user(IFF_TUN | IFF_TAP | IFF_TX_ARRAY | TUN_FEATURES,
(unsigned int __user*)argp);
} else if (cmd == TUNSETQUEUE)
return tun_set_queue(file, &ifr);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f744eeb..236ba52 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -455,10 +455,14 @@ out:

static int peek_head_len(struct sock *sk)
{
+ struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;

+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) {
@@ -471,6 +475,16 @@ static int peek_head_len(struct sock *sk)
return len;
}

+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len) /* non-zero length means data is pending */
+ return sock->ops->peek_len(sock);
+
+ return !skb_queue_empty(&sk->sk_receive_queue); /* true if queued */
+}
+
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -487,7 +501,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout;

while (vhost_can_busy_poll(&net->dev, endtime) &&
- skb_queue_empty(&sk->sk_receive_queue) &&
+ !sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency();

diff --git a/include/linux/net.h b/include/linux/net.h
index 9aa49a0..b6b3843 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -185,6 +185,7 @@ struct proto_ops {
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val);
+ int (*peek_len)(struct socket *sock);
};

#define DECLARE_SOCKADDR(type, dst, src) \
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d..080003c 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@
#define IFF_TUN 0x0001
#define IFF_TAP 0x0002
#define IFF_NO_PI 0x1000
+#define IFF_TX_ARRAY 0x0010
/* This flag has no real effect */
#define IFF_ONE_QUEUE 0x2000
#define IFF_VNET_HDR 0x4000
--
2.7.4