Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

From: Michael S. Tsirkin
Date: Thu Apr 01 2010 - 07:12:49 EST


On Thu, Apr 01, 2010 at 05:27:18PM +0800, Xin Xiaohui wrote:
> Add a device to utilize the vhost-net backend driver for
> copy-less data transfer between guest FE and host NIC.
> It pins the guest user space to the host memory and
> provides proto_ops as sendmsg/recvmsg to vhost-net.
>
> Signed-off-by: Xin Xiaohui <xiaohui.xin@xxxxxxxxx>
> Signed-off-by: Zhao Yu <yzhao81@xxxxxxxxx>
> Signed-off-by: Jeff Dike <jdike@xxxxxxxxxxxxxxxxxxxxxx>
> ---
>
> Michael,
> Sorry, I did not resolve all your comments this time.
> I did not move the device out of the vhost directory because I
> have not implemented real asynchronous read/write operations
> for the mp device yet. We hope to do this after the network
> code is checked in.

Well, placement of code is not such a major issue.
It's just that code under drivers/net gets more and better
review than drivers/vhost. I'll try to get Dave's opinion.

>
> For the DoS issue, I'm not sure what a reasonable limit is on how much
> get_user_pages() can pin; should we compute it from the bandwidth?

There's a ulimit for locked memory. Can we use this, decreasing
the value for rlimit array? We can do this when backend is
enabled and re-increment when backend is disabled.

> We use get_user_pages_fast() and use set_page_dirty_lock().
> Remove rcu_read_lock()/rcu_read_unlock(), since the ctor pointer is
> only changed by the BIND/UNBIND ioctls, and during that time
> the NIC is always stopped and all outstanding requests are done,
> so the ctor pointer cannot be raced into a wrong condition.
>
> Qemu needs a userspace write, is that a synchronous one or
> asynchronous one?

It's a synchronous non-blocking write.

> Thanks
> Xiaohui
>
> drivers/vhost/Kconfig | 5 +
> drivers/vhost/Makefile | 2 +
> drivers/vhost/mpassthru.c | 1162 +++++++++++++++++++++++++++++++++++++++++++++
> include/linux/mpassthru.h | 29 ++
> 4 files changed, 1198 insertions(+), 0 deletions(-)
> create mode 100644 drivers/vhost/mpassthru.c
> create mode 100644 include/linux/mpassthru.h
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index 9f409f4..ee32a3b 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -9,3 +9,8 @@ config VHOST_NET
> To compile this driver as a module, choose M here: the module will
> be called vhost_net.
>
> +config VHOST_PASSTHRU
> + tristate "Zerocopy network driver (EXPERIMENTAL)"
> + depends on VHOST_NET
> + ---help---
> + zerocopy network I/O support
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 72dd020..3f79c79 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -1,2 +1,4 @@
> obj-$(CONFIG_VHOST_NET) += vhost_net.o
> vhost_net-y := vhost.o net.o
> +
> +obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> new file mode 100644
> index 0000000..6e8fc4d
> --- /dev/null
> +++ b/drivers/vhost/mpassthru.c
> @@ -0,0 +1,1162 @@
> +/*
> + * MPASSTHRU - Mediate passthrough device.
> + * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + */
> +
> +#define DRV_NAME "mpassthru"
> +#define DRV_DESCRIPTION "Mediate passthru device driver"
> +#define DRV_COPYRIGHT "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
> +
> +#include <linux/module.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/major.h>
> +#include <linux/slab.h>
> +#include <linux/smp_lock.h>
> +#include <linux/poll.h>
> +#include <linux/fcntl.h>
> +#include <linux/init.h>
> +#include <linux/aio.h>
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/etherdevice.h>
> +#include <linux/miscdevice.h>
> +#include <linux/ethtool.h>
> +#include <linux/rtnetlink.h>
> +#include <linux/if.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/crc32.h>
> +#include <linux/nsproxy.h>
> +#include <linux/uaccess.h>
> +#include <linux/virtio_net.h>
> +#include <linux/mpassthru.h>
> +#include <net/net_namespace.h>
> +#include <net/netns/generic.h>
> +#include <net/rtnetlink.h>
> +#include <net/sock.h>
> +
> +#include <asm/system.h>
> +
> +#include "vhost.h"
> +
> +/* Uncomment to enable debugging */
> +/* #define MPASSTHRU_DEBUG 1 */
> +
> +#ifdef MPASSTHRU_DEBUG
> +static int debug;
> +
> +#define DBG if (mp->debug) printk
> +#define DBG1 if (debug == 2) printk
> +#else
> +#define DBG(a...)
> +#define DBG1(a...)
> +#endif
> +
> +#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
> +#define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
> +
> +/* Position of one piece of a user buffer inside a pinned page; filled in
> + * by alloc_page_info(), one entry per pinned page.
> + */
> +struct frag {
> + u16 offset; /* byte offset of the data within the page */
> + u16 size; /* number of data bytes in this page */
> +};
> +
> +/* Per-binding state connecting an mp device to a net_device; allocated by
> + * page_ctor_attach() and released by page_ctor_detach().
> + */
> +struct page_ctor {
> + struct list_head readq; /* page_info buffers posted by mp_recvmsg() */
> + int w_len; /* zeroed at attach; not otherwise used in this file */
> + int r_len; /* zeroed at attach; not otherwise used in this file */
> + spinlock_t read_lock; /* taken around every readq access */
> + struct kmem_cache *cache; /* slab cache for struct page_info */
> + struct net_device *dev; /* bound device, held with dev_hold() */
> + struct mpassthru_port port; /* handed to netdev_mp_port_attach() */
> +};
> +
> +/* Book-keeping for one send or receive request: the pinned user pages plus
> + * what is needed to report completion to the backend.  Allocated from
> + * page_ctor.cache.
> + */
> +struct page_info {
> + void *ctrl; /* the struct vhost_virtqueue the request came from */
> + struct list_head list; /* link on page_ctor.readq */
> + int header; /* not referenced in this file */
> + /* indicate the actual length of bytes
> + * send/recv in the user space buffers
> + */
> + int total;
> + int offset; /* offset of the data in the first page (frags[0].offset) */
> + struct page *pages[MAX_SKB_FRAGS+1]; /* pages pinned by get_user_pages_fast() */
> + struct skb_frag_struct frag[MAX_SKB_FRAGS+1]; /* filled by page_ctor() */
> + struct sk_buff *skb; /* skb currently associated with this request */
> + struct page_ctor *ctor; /* owning page_ctor (and its slab cache) */
> +
> + /* The pointer relayed to skb, to indicate
> + * it's a user space allocated skb or kernel
> + */
> + struct skb_user_page user;
> + struct skb_shared_info ushinfo;
> +
> +#define INFO_READ 0
> +#define INFO_WRITE 1
> + unsigned flags; /* INFO_READ (receive) or INFO_WRITE (transmit) */
> + unsigned pnum; /* number of valid entries in pages[] */
> +
> + /* It's meaningful for receive, means
> + * the max length allowed
> + */
> + size_t len;
> +
> + /* The fields after this point are for the backend
> + * driver, currently vhost-net.
> + */
> +
> + struct kiocb *iocb; /* caller's iocb, re-initialised by create_iocb() */
> + unsigned int desc_pos; /* copied from iocb->ki_pos */
> + unsigned int log; /* copied from iocb->ki_user_data */
> + struct iovec hdr[VHOST_NET_MAX_SG]; /* where the virtio_net_hdr is written */
> + struct iovec iov[VHOST_NET_MAX_SG]; /* copy of the virtqueue's iovec */
> +};
> +
> +/* One bound mp device: created by the MPASSTHRU_BINDDEV ioctl and freed
> + * from mp_sock_destruct().
> + */
> +struct mp_struct {
> + struct mp_file *mfile; /* attached character-device file, or NULL */
> + struct net_device *dev; /* the net_device named in the bind request */
> + struct page_ctor *ctor; /* set by page_ctor_attach(), NULL when detached */
> + struct socket socket; /* socket object returned by mp_get_socket() */
> +
> +#ifdef MPASSTHRU_DEBUG
> + int debug; /* enables DBG() output for this device */
> +#endif
> +};
> +
> +/* Per-open state of the mp character device (file->private_data). */
> +struct mp_file {
> + atomic_t count; /* 0 while unbound; raised by mp_attach()/mp_get() */
> + struct mp_struct *mp; /* bound device, NULL until mp_attach() */
> + struct net *net; /* network namespace of the opener, held with get_net() */
> +};
> +
> +/* The sock allocated for an mp device; "sk" must stay the first member use
> + * of container_of() relies on, and "mp" points back to the owning device.
> + */
> +struct mp_sock {
> + struct sock sk;
> + struct mp_struct *mp;
> +};
> +
> +/*
> + * Change the interface flags of @dev under the RTNL lock.
> + *
> + * Returns the result of dev_change_flags(); a negative value is also
> + * reported with a KERN_ERR message naming the device.
> + */
> +static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
> +{
> +	int ret;
> +
> +	rtnl_lock();
> +	ret = dev_change_flags(dev, flags);
> +	rtnl_unlock();
> +
> +	/* Terminate the message with a newline so it is not merged with
> +	 * whatever is logged next.
> +	 */
> +	if (ret < 0)
> +		printk(KERN_ERR "failed to change dev state of %s\n",
> +		       dev->name);
> +
> +	return ret;
> +}
> +
> +/*
> + * Hand out one posted user-space receive buffer.
> + *
> + * Takes the oldest page_info off the port's read queue, takes an extra
> + * reference on each of its pages, fills in its fragment table and returns
> + * the embedded skb_user_page.  Returns NULL when no buffer is queued.
> + */
> +static struct skb_user_page *page_ctor(struct mpassthru_port *port,
> +					struct sk_buff *skb, int npages)
> +{
> +	struct page_ctor *ctor = container_of(port, struct page_ctor, port);
> +	struct page_info *info;
> +	unsigned long irqflags;
> +	int idx;
> +
> +	spin_lock_irqsave(&ctor->read_lock, irqflags);
> +	if (list_empty(&ctor->readq)) {
> +		spin_unlock_irqrestore(&ctor->read_lock, irqflags);
> +		return NULL;
> +	}
> +	info = list_first_entry(&ctor->readq, struct page_info, list);
> +	list_del(&info->list);
> +	spin_unlock_irqrestore(&ctor->read_lock, irqflags);
> +
> +	for (idx = 0; idx < info->pnum; idx++) {
> +		struct skb_frag_struct *f = &info->frag[idx];
> +
> +		get_page(info->pages[idx]);
> +		f->page = info->pages[idx];
> +		/* only the first page starts at a non-zero offset */
> +		f->page_offset = idx == 0 ? info->offset : 0;
> +		if (port->npages > 1)
> +			f->size = PAGE_SIZE;
> +		else
> +			f->size = port->data_len;
> +	}
> +
> +	info->skb = skb;
> +	info->user.ushinfo = &info->ushinfo;
> +	info->user.frags = info->frag;
> +	return &info->user;
> +}
> +
> +/*
> + * kiocb destructor installed by create_iocb().
> + *
> + * Drops the references on the pages still recorded in the page_info, frees
> + * the skb of a receive request, and returns the page_info to its cache.
> + */
> +static void mp_ki_dtor(struct kiocb *iocb)
> +{
> +	struct page_info *info = (struct page_info *)(iocb->private);
> +	int n;
> +
> +	for (n = 0; n < info->pnum; n++) {
> +		struct page *pg = info->pages[n];
> +
> +		if (pg)
> +			put_page(pg);
> +	}
> +
> +	if (info->flags == INFO_READ) {
> +		struct sk_buff *skb = info->skb;
> +
> +		skb_shinfo(skb)->destructor_arg = &info->user;
> +		skb->destructor = NULL;
> +		kfree_skb(skb);
> +	}
> +
> +	kmem_cache_free(info->ctor->cache, info);
> +}
> +
> +/*
> + * Re-initialise the kiocb stored in @info so it can be queued on a
> + * virtqueue notifier list as a completion record of @size bytes.
> + *
> + * Returns the kiocb, or NULL when @info carries none.
> + */
> +static struct kiocb *create_iocb(struct page_info *info, int size)
> +{
> +	struct kiocb *kio = info->iocb;
> +
> +	if (kio == NULL)
> +		return NULL;
> +
> +	/* reset the generic aio state */
> +	kio->ki_flags = 0;
> +	kio->ki_users = 1;
> +	kio->ki_key = 0;
> +	kio->ki_ctx = NULL;
> +	kio->ki_cancel = NULL;
> +	kio->ki_retry = NULL;
> +	kio->ki_iovec = NULL;
> +	kio->ki_eventfd = NULL;
> +
> +	/* describe this completion */
> +	kio->private = (void *)info;
> +	kio->ki_pos = info->desc_pos;
> +	kio->ki_nbytes = size;
> +	kio->ki_user_data = info->log;
> +	kio->ki_dtor = mp_ki_dtor;
> +
> +	return kio;
> +}
> +
> +/*
> + * Helper run before the skb is released: for a receive request that has an
> + * skb attached, clear skb->head.
> + */
> +static void page_dtor_prepare(struct page_info *info)
> +{
> +	if (info->flags != INFO_READ)
> +		return;
> +	if (!info->skb)
> +		return;
> +
> +	info->skb->head = NULL;
> +}
> +
> +/*
> + * Destructor callback for user-space buffers (skb_user_page.dtor).
> + *
> + * - A buffer whose "total" is still 0 was never filled and is put back on
> + *   the read queue for reuse.
> + * - A filled receive buffer needs nothing more here.
> + * - For a transmit buffer, a completion kiocb is queued on the virtqueue
> + *   notifier list and the socket's write_space callback is invoked to wake
> + *   the backend.
> + *
> + * @user may be NULL, in which case nothing is done.
> + */
> +static void page_dtor(struct skb_user_page *user)
> +{
> +	struct page_info *info;
> +	struct page_ctor *ctor;
> +	struct sock *sk;
> +	struct kiocb *iocb;
> +	struct vhost_virtqueue *vq;
> +	unsigned long flags;
> +
> +	if (!user)
> +		return;
> +	info = container_of(user, struct page_info, user);
> +	ctor = info->ctor;
> +
> +	page_dtor_prepare(info);
> +
> +	/* If the info->total is 0, make it to be reused */
> +	if (!info->total) {
> +		spin_lock_irqsave(&ctor->read_lock, flags);
> +		list_add(&info->list, &ctor->readq);
> +		spin_unlock_irqrestore(&ctor->read_lock, flags);
> +		return;
> +	}
> +
> +	if (info->flags == INFO_READ)
> +		return;
> +
> +	/* For transmit, we should wait for the DMA finish by hardware.
> +	 * Queue the notifier to wake up the backend driver
> +	 */
> +	vq = (struct vhost_virtqueue *)info->ctrl;
> +	iocb = create_iocb(info, info->total);
> +	/* create_iocb() returns NULL when the request carries no kiocb;
> +	 * there is then nothing to queue and it must not be dereferenced.
> +	 */
> +	if (!iocb)
> +		return;
> +
> +	spin_lock_irqsave(&vq->notify_lock, flags);
> +	list_add_tail(&iocb->ki_list, &vq->notifier);
> +	spin_unlock_irqrestore(&vq->notify_lock, flags);
> +
> +	sk = ctor->port.sock->sk;
> +	sk->sk_write_space(sk);
> +}
> +
> +/*
> + * Create the page_ctor for @mp and attach its port to mp->dev.
> + *
> + * Returns 0 on success, -EBUSY if a ctor is already attached, -ENOMEM on
> + * allocation failure, or the error returned by netdev_mp_port_prep() /
> + * netdev_mp_port_attach().  On failure everything acquired here is released
> + * again and mp->ctor is left untouched.
> + */
> +static int page_ctor_attach(struct mp_struct *mp)
> +{
> +	int rc;
> +	struct page_ctor *ctor;
> +	struct net_device *dev = mp->dev;
> +
> +	/* locked by mp_mutex */
> +	if (rcu_dereference(mp->ctor))
> +		return -EBUSY;
> +
> +	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
> +	if (!ctor)
> +		return -ENOMEM;
> +
> +	rc = netdev_mp_port_prep(dev, &ctor->port);
> +	if (rc)
> +		goto err_free_ctor;
> +
> +	ctor->cache = kmem_cache_create("skb_page_info",
> +					sizeof(struct page_info), 0,
> +					SLAB_HWCACHE_ALIGN, NULL);
> +	if (!ctor->cache) {
> +		/* rc is still 0 from the successful prep call */
> +		rc = -ENOMEM;
> +		goto err_free_ctor;
> +	}
> +
> +	INIT_LIST_HEAD(&ctor->readq);
> +	spin_lock_init(&ctor->read_lock);
> +
> +	ctor->w_len = 0;
> +	ctor->r_len = 0;
> +
> +	dev_hold(dev);
> +	ctor->dev = dev;
> +	ctor->port.ctor = page_ctor;
> +	ctor->port.sock = &mp->socket;
> +
> +	rc = netdev_mp_port_attach(dev, &ctor->port);
> +	if (rc)
> +		goto err_put_dev;
> +
> +	/* locked by mp_mutex */
> +	rcu_assign_pointer(mp->ctor, ctor);
> +
> +	/* XXX:Need we do set_offload here ? */
> +
> +	return 0;
> +
> +err_put_dev:
> +	/* only reached after dev_hold() and a successful cache creation */
> +	dev_put(dev);
> +	kmem_cache_destroy(ctor->cache);
> +err_free_ctor:
> +	kfree(ctor);
> +
> +	return rc;
> +}
> +
> +/*
> + * Remove and return the first page_info on the ctor's read queue, or NULL
> + * when the queue is empty.  The queue is accessed under ctor->read_lock.
> + */
> +struct page_info *info_dequeue(struct page_ctor *ctor)
> +{
> +	struct page_info *first;
> +	unsigned long irqflags;
> +
> +	spin_lock_irqsave(&ctor->read_lock, irqflags);
> +	if (list_empty(&ctor->readq)) {
> +		first = NULL;
> +	} else {
> +		first = list_first_entry(&ctor->readq,
> +					 struct page_info, list);
> +		list_del(&first->list);
> +	}
> +	spin_unlock_irqrestore(&ctor->read_lock, irqflags);
> +
> +	return first;
> +}
> +
> +/* Tear down the page_ctor of @mp: drain the read queue, destroy the
> + * page_info cache, detach the port from the device and free the ctor.
> + * Returns 0, or -ENODEV when no ctor is attached.
> + */
> +static int page_ctor_detach(struct mp_struct *mp)
> +{
> + struct page_ctor *ctor;
> + struct page_info *info;
> + struct vhost_virtqueue *vq = NULL;
> + struct kiocb *iocb = NULL;
> + int i;
> + unsigned long flags;
> +
> + /* locked by mp_mutex */
> + ctor = rcu_dereference(mp->ctor);
> + if (!ctor)
> + return -ENODEV;
> +
> + /* Release every receive buffer that is still queued: unpin its pages
> + * and queue a zero-length completion for the backend.
> + */
> + while ((info = info_dequeue(ctor))) {
> + for (i = 0; i < info->pnum; i++)
> + if (info->pages[i])
> + put_page(info->pages[i]);
> + vq = (struct vhost_virtqueue *)(info->ctrl);
> + /* NOTE(review): create_iocb() can return NULL and the result is
> + * dereferenced below without a check.
> + */
> + iocb = create_iocb(info, 0);
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + list_add_tail(&iocb->ki_list, &vq->notifier);
> + spin_unlock_irqrestore(&vq->notify_lock, flags);
> +
> + /* NOTE(review): the iocb queued above still has ->private pointing at
> + * info and ->ki_dtor set to mp_ki_dtor(), which puts the pages and
> + * frees info again -- looks like a use-after-free/double free if the
> + * destructor ever runs; confirm against the notifier consumer.
> + */
> + kmem_cache_free(ctor->cache, info);
> + }
> + kmem_cache_destroy(ctor->cache);
> + netdev_mp_port_detach(ctor->dev);
> + dev_put(ctor->dev);
> +
> + /* locked by mp_mutex */
> + rcu_assign_pointer(mp->ctor, NULL);
> + /* wait for readers of mp->ctor before freeing it */
> + synchronize_rcu();
> +
> + kfree(ctor);
> + return 0;
> +}
> +
> +/*
> + * Allocate the page_info for a small transmit whose data was copied in
> + * full, so no user pages are pinned with get_user_pages().
> + *
> + * Returns the new INFO_WRITE page_info, or NULL if the allocation failed.
> + */
> +static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
> +					struct kiocb *iocb, int total)
> +{
> +	struct page_info *info;
> +
> +	info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
> +	if (info == NULL)
> +		return NULL;
> +
> +	info->ctor = ctor;
> +	info->iocb = iocb;
> +	info->flags = INFO_WRITE;
> +	info->total = total;
> +	info->user.dtor = page_dtor;
> +
> +	return info;
> +}
> +
> +/*
> + * Pin the user pages described by @iov with get_user_pages_fast() and
> + * record them in a new page_info, so the hardware can DMA directly to or
> + * from the guest's buffers.
> + *
> + * @ctor:   owning page_ctor (provides the slab cache and the device)
> + * @iocb:   kiocb to complete when the request is done
> + * @iov:    user buffers, @count entries; zero-length entries are skipped
> + * @frags:  output array, one offset/size pair per pinned page; the callers
> + *          in this file pass arrays of MAX_SKB_FRAGS entries
> + * @npages: non-zero for a receive request (pages are pinned writable),
> + *          zero for transmit
> + * @total:  value stored in info->total
> + *
> + * Returns the page_info, or NULL on allocation failure, if a page could
> + * not be pinned, if more than MAX_SKB_FRAGS pages would be needed, or
> + * (CONFIG_HIGHMEM) if a receive page is in high memory and the device
> + * lacks NETIF_F_HIGHDMA.  On failure every pinned page is released.
> + */
> +static struct page_info *alloc_page_info(struct page_ctor *ctor,
> +			struct kiocb *iocb, struct iovec *iov,
> +			int count, struct frag *frags,
> +			int npages, int total)
> +{
> +	int rc;
> +	int i, j, k, n = 0;
> +	int len;
> +	unsigned long base;
> +	struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
> +
> +	if (!info)
> +		return NULL;
> +
> +	for (i = j = 0; i < count; i++) {
> +		base = (unsigned long)iov[i].iov_base;
> +		len = iov[i].iov_len;
> +
> +		if (!len)
> +			continue;
> +		/* number of pages spanned by [base, base + len) */
> +		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> +
> +		/* Never write past frags[] or info->pages[] */
> +		if (j + n > MAX_SKB_FRAGS)
> +			goto failed;
> +
> +		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
> +					 &info->pages[j]);
> +		if (rc != n) {
> +			/* A short count means rc pages were pinned; they
> +			 * are beyond index j, so the cleanup below would
> +			 * miss them.  rc may also be a negative errno.
> +			 */
> +			for (k = 0; k < rc; k++)
> +				put_page(info->pages[j + k]);
> +			goto failed;
> +		}
> +
> +		while (n--) {
> +			frags[j].offset = base & ~PAGE_MASK;
> +			frags[j].size = min_t(int, len,
> +					PAGE_SIZE - frags[j].offset);
> +			len -= frags[j].size;
> +			base += frags[j].size;
> +			j++;
> +		}
> +	}
> +
> +#ifdef CONFIG_HIGHMEM
> +	if (npages && !(ctor->dev->features & NETIF_F_HIGHDMA)) {
> +		for (i = 0; i < j; i++) {
> +			if (PageHighMem(info->pages[i]))
> +				goto failed;
> +		}
> +	}
> +#endif
> +
> +	info->total = total;
> +	info->user.dtor = page_dtor;
> +	info->ctor = ctor;
> +	info->pnum = j;
> +	info->iocb = iocb;
> +	if (!npages)
> +		info->flags = INFO_WRITE;
> +	if (info->flags == INFO_READ) {
> +		/* The buffer start is moved back by the skb headroom
> +		 * (NET_IP_ALIGN + NET_SKB_PAD) in front of the data.
> +		 */
> +		info->user.start = (u8 *)(((unsigned long)
> +				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
> +				frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD);
> +		info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD;
> +		for (i = 0; i < j; i++)
> +			set_page_dirty_lock(info->pages[i]);
> +	}
> +	return info;
> +
> +failed:
> +	/* pages [0, j) belong to the iovec entries fully processed so far */
> +	for (i = 0; i < j; i++)
> +		put_page(info->pages[i]);
> +
> +	kmem_cache_free(ctor->cache, info);
> +
> +	return NULL;
> +}
> +
> +/*
> + * proto_ops sendmsg: transmit one packet described by the msghdr iovec on
> + * the bound device.
> + *
> + * Packets up to COPY_THRESHOLD bytes are copied entirely into the skb.  For
> + * larger packets only the first COPY_HDR_LEN bytes are copied and the rest
> + * is attached as page fragments pinned from user space.  iocb->private is
> + * taken to be the originating vhost_virtqueue.
> + *
> + * Returns 0 once the skb has been handed to dev_queue_xmit(), -ENODEV if no
> + * ctor is attached, -EINVAL for a packet shorter than ETH_HLEN or spanning
> + * more than MAX_SKB_FRAGS pages, -EFAULT if the header cannot be copied
> + * from the iovec, and -ENOMEM when an allocation or page pinning fails.
> + */
> +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
> +			struct msghdr *m, size_t total_len)
> +{
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct page_ctor *ctor;
> +	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
> +	struct iovec *iov = m->msg_iov;
> +	struct page_info *info = NULL;
> +	struct frag frags[MAX_SKB_FRAGS];
> +	struct sk_buff *skb;
> +	int count = m->msg_iovlen;
> +	int total = 0, header, n, i, len;
> +	int err = -ENOMEM;
> +	unsigned long base;
> +
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return -ENODEV;
> +
> +	total = iov_length(iov, count);
> +
> +	if (total < ETH_HLEN)
> +		return -EINVAL;
> +
> +	if (total > COPY_THRESHOLD) {
> +		/* zero-copy path: make sure the pages fit in one skb */
> +		n = 0;
> +		for (i = 0; i < count; i++) {
> +			base = (unsigned long)iov[i].iov_base;
> +			len = iov[i].iov_len;
> +			if (!len)
> +				continue;
> +			n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK)
> +				>> PAGE_SHIFT;
> +			if (n > MAX_SKB_FRAGS)
> +				return -EINVAL;
> +		}
> +	}
> +
> +	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
> +
> +	skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
> +	if (!skb)
> +		goto drop;
> +
> +	skb_reserve(skb, NET_IP_ALIGN);
> +
> +	skb_set_network_header(skb, ETH_HLEN);
> +
> +	/* A failed copy from the iovec must not be sent as a packet */
> +	if (memcpy_fromiovec(skb->data, iov, header)) {
> +		err = -EFAULT;
> +		goto drop;
> +	}
> +	skb_put(skb, header);
> +	/* index ETH_ALEN of a __be16 array is byte offset 2 * ETH_ALEN,
> +	 * i.e. the EtherType field after the two MAC addresses
> +	 */
> +	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
> +
> +	if (header == total) {
> +		info = alloc_small_page_info(ctor, iocb, total);
> +	} else {
> +		info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
> +		if (info)
> +			for (i = 0; i < info->pnum; i++) {
> +				skb_add_rx_frag(skb, i, info->pages[i],
> +						frags[i].offset, frags[i].size);
> +				/* the page reference now belongs to the skb */
> +				info->pages[i] = NULL;
> +			}
> +	}
> +	if (!info)
> +		goto drop;
> +
> +	info->desc_pos = iocb->ki_pos;
> +	info->ctrl = vq;
> +	info->total = total;
> +	info->skb = skb;
> +	skb_shinfo(skb)->destructor_arg = &info->user;
> +	skb->dev = mp->dev;
> +	dev_queue_xmit(skb);
> +	mp->dev->stats.tx_packets++;
> +	mp->dev->stats.tx_bytes += total;
> +	return 0;
> +
> +drop:
> +	/* info is always NULL here; kfree_skb() accepts a NULL skb */
> +	kfree_skb(skb);
> +	mp->dev->stats.tx_dropped++;
> +	return err;
> +}
> +
> +
> +/* Receive-side notifier installed as vq->receiver by mp_recvmsg().
> + *
> + * Drains the socket receive queue.  For each skb it finds (or dequeues) the
> + * page_info describing the guest buffer, writes a virtio_net_hdr to
> + * info->hdr and queues a completion kiocb on vq->notifier.  An skb that
> + * cannot be delivered is freed together with its page_info at "clean".
> + */
> +static void mp_recvmsg_notify(struct vhost_virtqueue *vq)
> +{
> + struct socket *sock = vq->private_data;
> + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> + struct page_ctor *ctor = NULL;
> + struct sk_buff *skb = NULL;
> + struct page_info *info = NULL;
> + struct ethhdr *eth;
> + struct kiocb *iocb = NULL;
> + int len, i;
> + unsigned long flags;
> +
> + struct virtio_net_hdr hdr = {
> + .flags = 0,
> + .gso_type = VIRTIO_NET_HDR_GSO_NONE
> + };
> +
> + ctor = rcu_dereference(mp->ctor);
> + if (!ctor)
> + return;
> +
> + while ((skb = skb_dequeue(&sock->sk->sk_receive_queue)) != NULL) {
> + /* destructor_arg set: the skb data already lives in a user buffer */
> + if (skb_shinfo(skb)->destructor_arg) {
> + info = container_of(skb_shinfo(skb)->destructor_arg,
> + struct page_info, user);
> + info->skb = skb;
> + if (skb->len > info->len) {
> + mp->dev->stats.rx_dropped++;
> + DBG(KERN_INFO "Discarded truncated rx packet: "
> + " len %d > %zd\n", skb->len, info->len);
> + info->total = skb->len;
> + goto clean;
> + } else {
> + int i;
> + struct skb_shared_info *gshinfo =
> + (struct skb_shared_info *)(&info->ushinfo);
> + struct skb_shared_info *hshinfo =
> + skb_shinfo(skb);
> +
> + /* NOTE(review): info->total is still 0 on this path, so the
> + * page_dtor() callback (if it runs from kfree_skb() at
> + * "clean") would re-queue info before it is freed there --
> + * confirm.
> + */
> + if (gshinfo->nr_frags < hshinfo->nr_frags)
> + goto clean;
> + eth = eth_hdr(skb);
> + skb_push(skb, ETH_HLEN);
> +
> + hdr.hdr_len = skb_headlen(skb);
> + info->total = skb->len;
> +
> + /* copy the fragment sizes of the host skb into the guest's
> + * shared info, zeroing the unused entries
> + */
> + for (i = 0; i < gshinfo->nr_frags; i++)
> + gshinfo->frags[i].size = 0;
> + for (i = 0; i < hshinfo->nr_frags; i++)
> + gshinfo->frags[i].size =
> + hshinfo->frags[i].size;
> + memcpy(skb_shinfo(skb), &info->ushinfo,
> + sizeof(struct skb_shared_info));
> + }
> + } else {
> + /* The skb composed with kernel buffers
> + * in case user space buffers are not sufficient.
> + * The case should be rare.
> + */
> + unsigned long flags;
> + int i;
> + struct skb_shared_info *gshinfo = NULL;
> +
> + info = NULL;
> +
> + spin_lock_irqsave(&ctor->read_lock, flags);
> + if (!list_empty(&ctor->readq)) {
> + info = list_first_entry(&ctor->readq,
> + struct page_info, list);
> + list_del(&info->list);
> + }
> + spin_unlock_irqrestore(&ctor->read_lock, flags);
> + if (!info) {
> + DBG(KERN_INFO "No user buffer avaliable %p\n",
> + skb);
> + /* put the skb back and retry on the next notification */
> + skb_queue_head(&sock->sk->sk_receive_queue,
> + skb);
> + break;
> + }
> + info->skb = skb;
> + /* compute the guest skb frags info */
> + gshinfo = (struct skb_shared_info *)(info->user.start +
> + SKB_DATA_ALIGN(info->user.size));
> +
> + if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
> + goto clean;
> +
> + eth = eth_hdr(skb);
> + skb_push(skb, ETH_HLEN);
> + info->total = skb->len;
> +
> + for (i = 0; i < gshinfo->nr_frags; i++)
> + gshinfo->frags[i].size = 0;
> + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> + gshinfo->frags[i].size =
> + skb_shinfo(skb)->frags[i].size;
> + hdr.hdr_len = min_t(int, skb->len,
> + info->iov[1].iov_len);
> + /* NOTE(review): the return value of the copy is ignored */
> + skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
> + }
> +
> + len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
> + sizeof hdr);
> + if (len) {
> + DBG(KERN_INFO
> + "Unable to write vnet_hdr at addr %p: %d\n",
> + info->hdr->iov_base, len);
> + goto clean;
> + }
> + /* NOTE(review): create_iocb() can return NULL; not checked here */
> + iocb = create_iocb(info, skb->len + sizeof(hdr));
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + list_add_tail(&iocb->ki_list, &vq->notifier);
> + spin_unlock_irqrestore(&vq->notify_lock, flags);
> + continue;
> +
> +clean:
> + kfree_skb(skb);
> + /* NOTE(review): this loop stops at the first NULL entry rather than
> + * at info->pnum; it relies on pages[] having a NULL terminator.
> + */
> + for (i = 0; info->pages[i]; i++)
> + put_page(info->pages[i]);
> + kmem_cache_free(ctor->cache, info);
> + }
> + return;
> +}
> +
> +/*
> + * proto_ops recvmsg: post one guest receive buffer.
> + *
> + * Nothing is received here.  The buffers described by the msghdr iovec
> + * (iov[0] is the virtio-net header, iov[1..] the data) are pinned and
> + * queued on the ctor's read queue; mp_recvmsg_notify() is installed as the
> + * virtqueue's receiver to complete them later.  iocb->private is taken to
> + * be the originating vhost_virtqueue.
> + *
> + * Returns 0 when the buffer was queued, -EINVAL for a blocking call, a
> + * missing ctor or an unusable buffer layout, and -ENOMEM if the pages
> + * could not be pinned.
> + */
> +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
> +			struct msghdr *m, size_t total_len,
> +			int flags)
> +{
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct page_ctor *ctor;
> +	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
> +	struct iovec *iov = m->msg_iov;
> +	int count = m->msg_iovlen;
> +	int npages, payload;
> +	struct page_info *info;
> +	struct frag frags[MAX_SKB_FRAGS];
> +	unsigned long base;
> +	int i, len;
> +	unsigned long flag;
> +
> +	if (!(flags & MSG_DONTWAIT))
> +		return -EINVAL;
> +
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return -EINVAL;
> +
> +	/* iov[0] and iov[1] are both read below, and "count" entries are
> +	 * copied into info->iov[VHOST_NET_MAX_SG] further down.
> +	 */
> +	if (count < 2 || count > VHOST_NET_MAX_SG)
> +		return -EINVAL;
> +
> +	/* Error detections in case invalid user space buffer */
> +	if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
> +	    mp->dev->features & NETIF_F_SG) {
> +		return -EINVAL;
> +	}
> +
> +	npages = ctor->port.npages;
> +	payload = ctor->port.data_len;
> +
> +	/* If KVM guest virtio-net FE driver use SG feature */
> +	if (count > 2) {
> +		for (i = 2; i < count; i++) {
> +			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
> +			len = iov[i].iov_len;
> +			if (npages == 1)
> +				len = min_t(int, len, PAGE_SIZE - base);
> +			else if (base)
> +				break;
> +			payload -= len;
> +			if (payload <= 0)
> +				goto proceed;
> +			if (npages == 1 || (len & ~PAGE_MASK))
> +				break;
> +		}
> +	}
> +
> +	/* The first data buffer needs NET_SKB_PAD + NET_IP_ALIGN bytes of
> +	 * headroom in front of it inside its page.  Compare instead of
> +	 * subtracting: the offset is unsigned, so "offset - pad >= 0" is
> +	 * always true.
> +	 */
> +	base = (unsigned long)iov[1].iov_base & ~PAGE_MASK;
> +	if (base >= NET_SKB_PAD + NET_IP_ALIGN)
> +		goto proceed;
> +
> +	return -EINVAL;
> +
> +proceed:
> +	/* skip the virtnet head */
> +	iov++;
> +	count--;
> +
> +	/* Translate address to kernel */
> +	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
> +	if (!info)
> +		return -ENOMEM;
> +	info->len = total_len;
> +	info->hdr[0].iov_base = vq->hdr[0].iov_base;
> +	info->hdr[0].iov_len = vq->hdr[0].iov_len;
> +	info->offset = frags[0].offset;
> +	info->desc_pos = iocb->ki_pos;
> +	info->log = iocb->ki_user_data;
> +	info->ctrl = vq;
> +
> +	/* back to the full iovec, header included */
> +	iov--;
> +	count++;
> +
> +	memcpy(info->iov, vq->iov, sizeof(struct iovec) * count);
> +
> +	spin_lock_irqsave(&ctor->read_lock, flag);
> +	list_add_tail(&info->list, &ctor->readq);
> +	spin_unlock_irqrestore(&ctor->read_lock, flag);
> +
> +	if (!vq->receiver)
> +		vq->receiver = mp_recvmsg_notify;
> +
> +	return 0;
> +}
> +
> +/*
> + * Unbind @mp from its file: clear the back pointer, take the interface
> + * down while the page_ctor is detached, bring it up again, and drop one
> + * reference on the net device.
> + */
> +static void __mp_detach(struct mp_struct *mp)
> +{
> +	unsigned int up;
> +
> +	mp->mfile = NULL;
> +
> +	up = mp->dev->flags & ~IFF_UP;
> +	mp_dev_change_flags(mp->dev, up);
> +
> +	page_ctor_detach(mp);
> +
> +	up = mp->dev->flags | IFF_UP;
> +	mp_dev_change_flags(mp->dev, up);
> +
> +	/* Drop the extra count on the net device */
> +	dev_put(mp->dev);
> +}
> +
> +static DEFINE_MUTEX(mp_mutex);
> +
> +/* Locked wrapper: run __mp_detach() on @mp while holding mp_mutex. */
> +static void mp_detach(struct mp_struct *mp)
> +{
> + mutex_lock(&mp_mutex);
> + __mp_detach(mp);
> + mutex_unlock(&mp_mutex);
> +}
> +
> +/*
> + * Drop one reference on @mfile; when the count reaches zero the bound
> + * device is detached.
> + */
> +static void mp_put(struct mp_file *mfile)
> +{
> +	if (!atomic_dec_and_test(&mfile->count))
> +		return;
> +
> +	mp_detach(mfile->mp);
> +}
> +
> +/*
> + * proto_ops release: drop the file reference, the sock reference and the
> + * network-namespace reference associated with the socket.  Always
> + * returns 0.
> + */
> +static int mp_release(struct socket *sock)
> +{
> +	struct mp_sock *msk = container_of(sock->sk, struct mp_sock, sk);
> +	struct mp_struct *mp = msk->mp;
> +	struct mp_file *mfile = mp->mfile;
> +
> +	mp_put(mfile);
> +	sock_put(mp->socket.sk);
> +	put_net(mfile->net);
> +
> +	return 0;
> +}
> +
> +/* Ops structure to mimic raw sockets with mp device: only sendmsg,
> + * recvmsg and release are provided.
> + */
> +static const struct proto_ops mp_socket_ops = {
> + .sendmsg = mp_sendmsg,
> + .recvmsg = mp_recvmsg,
> + .release = mp_release,
> +};
> +
> +/* Protocol descriptor passed to sk_alloc(); obj_size makes every sock a
> + * struct mp_sock.
> + */
> +static struct proto mp_proto = {
> + .name = "mp",
> + .owner = THIS_MODULE,
> + .obj_size = sizeof(struct mp_sock),
> +};
> +
> +/*
> + * open() handler of the mp character device: allocate the per-open
> + * mp_file, unbound and with a zero reference count, and remember the
> + * opener's network namespace.  Returns 0 or -ENOMEM.
> + */
> +static int mp_chr_open(struct inode *inode, struct file * file)
> +{
> +	struct mp_file *mfile;
> +
> +	cycle_kernel_lock();
> +	DBG1(KERN_INFO "mp: mp_chr_open\n");
> +
> +	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
> +	if (mfile == NULL)
> +		return -ENOMEM;
> +
> +	mfile->mp = NULL;
> +	atomic_set(&mfile->count, 0);
> +	mfile->net = get_net(current->nsproxy->net_ns);
> +
> +	file->private_data = mfile;
> +	return 0;
> +}
> +
> +
> +/*
> + * Take a reference on @mfile and return its bound device, or NULL when the
> + * reference count is zero (nothing bound).
> + */
> +static struct mp_struct *mp_get(struct mp_file *mfile)
> +{
> +	if (!atomic_inc_not_zero(&mfile->count))
> +		return NULL;
> +
> +	return mfile->mp;
> +}
> +
> +
> +/*
> + * Link @mp and the mp_file behind @file to each other under the device's
> + * tx lock, taking a reference on the device, the sock and the file count.
> + *
> + * Returns 0 on success, -EINVAL if the file is already bound, or -EBUSY if
> + * the device already has a file.
> + */
> +static int mp_attach(struct mp_struct *mp, struct file *file)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	int err;
> +
> +	netif_tx_lock_bh(mp->dev);
> +
> +	if (mfile->mp) {
> +		err = -EINVAL;
> +	} else if (mp->mfile) {
> +		err = -EBUSY;
> +	} else {
> +		err = 0;
> +		mfile->mp = mp;
> +		mp->mfile = mfile;
> +		mp->socket.file = file;
> +		dev_hold(mp->dev);
> +		sock_hold(mp->socket.sk);
> +		atomic_inc(&mfile->count);
> +	}
> +
> +	netif_tx_unlock_bh(mp->dev);
> +	return err;
> +}
> +
> +/* sk_destruct callback: free the mp_struct that owns this sock. */
> +static void mp_sock_destruct(struct sock *sk)
> +{
> +	struct mp_sock *msk = container_of(sk, struct mp_sock, sk);
> +
> +	kfree(msk->mp);
> +}
> +
> +/* Unbind the device attached to @mfile.  Returns 0, or -EINVAL when
> + * nothing is bound (mp_get() fails).
> + */
> +static int do_unbind(struct mp_file *mfile)
> +{
> + /* takes a temporary reference, dropped by mp_put() below */
> + struct mp_struct *mp = mp_get(mfile);
> +
> + if (!mp)
> + return -EINVAL;
> +
> + mp_detach(mp);
> + /* NOTE(review): this sock_put() may drop the last sock reference, in
> + * which case mp_sock_destruct() frees mp -- while mfile->mp still
> + * points at it and mp_put() below only undoes the mp_get() above, so
> + * the count does not return to 0.  A second do_unbind() on the same
> + * mfile looks unsafe; confirm.
> + */
> + sock_put(mp->socket.sk);
> + mp_put(mfile);
> + return 0;
> +}
> +
> +/* sk_data_ready callback: wake POLLIN waiters on the sock, if any. */
> +static void mp_sock_data_ready(struct sock *sk, int len)
> +{
> +	if (!sk_has_sleeper(sk))
> +		return;
> +
> +	wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
> +}
> +
> +/* sk_write_space callback: wake POLLOUT waiters on the sock, if any. */
> +static void mp_sock_write_space(struct sock *sk)
> +{
> +	if (!sk_has_sleeper(sk))
> +		return;
> +
> +	wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
> +}
> +
> +/*
> + * ioctl handler of the mp character device.
> + *
> + * MPASSTHRU_BINDDEV:   @arg points to a struct ifreq naming a net device.
> + *                      Allocates the mp_struct and its sock, attaches them
> + *                      to this file, attaches the page_ctor to the device
> + *                      and brings the interface up.
> + * MPASSTHRU_UNBINDDEV: undoes the binding via do_unbind().
> + *
> + * Returns 0 on success or a negative errno: -EFAULT if the ifreq cannot be
> + * read, -EBUSY if IFF_MPASSTHRU_EXCL is set in the request or the file is
> + * already bound, -ENODEV if the device does not exist, -ENOMEM on
> + * allocation failure, the error of mp_attach()/page_ctor_attach(), and
> + * -EINVAL for an unknown command.
> + */
> +static long mp_chr_ioctl(struct file *file, unsigned int cmd,
> +			 unsigned long arg)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	struct mp_struct *mp;
> +	struct net_device *dev;
> +	void __user* argp = (void __user *)arg;
> +	struct ifreq ifr;
> +	struct sock *sk;
> +	int ret;
> +
> +	ret = -EINVAL;
> +
> +	switch (cmd) {
> +	case MPASSTHRU_BINDDEV:
> +		ret = -EFAULT;
> +		if (copy_from_user(&ifr, argp, sizeof ifr))
> +			break;
> +
> +		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> +
> +		ret = -EBUSY;
> +
> +		if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
> +			break;
> +
> +		ret = -ENODEV;
> +		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
> +		if (!dev)
> +			break;
> +
> +		mutex_lock(&mp_mutex);
> +
> +		ret = -EBUSY;
> +		mp = mfile->mp;
> +		if (mp)
> +			goto err_dev_put;
> +
> +		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
> +		if (!mp) {
> +			ret = -ENOMEM;
> +			goto err_dev_put;
> +		}
> +		mp->dev = dev;
> +		ret = -ENOMEM;
> +
> +		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
> +		if (!sk)
> +			goto err_free_mp;
> +
> +		init_waitqueue_head(&mp->socket.wait);
> +		mp->socket.ops = &mp_socket_ops;
> +		sock_init_data(&mp->socket, sk);
> +		sk->sk_sndbuf = INT_MAX;
> +		container_of(sk, struct mp_sock, sk)->mp = mp;
> +
> +		sk->sk_destruct = mp_sock_destruct;
> +		sk->sk_data_ready = mp_sock_data_ready;
> +		sk->sk_write_space = mp_sock_write_space;
> +
> +		ret = mp_attach(mp, file);
> +		if (ret < 0)
> +			goto err_free_sk;
> +
> +		ret = page_ctor_attach(mp);
> +		if (ret < 0)
> +			goto err_detach;
> +
> +		ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
> +		mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
> +out:
> +		mutex_unlock(&mp_mutex);
> +		break;
> +err_detach:
> +		/* Undo mp_attach(): the file must not keep pointing at an
> +		 * mp that is about to be freed, and the device and sock
> +		 * references it took must be dropped.
> +		 */
> +		mfile->mp = NULL;
> +		mp->mfile = NULL;
> +		mp->socket.file = NULL;
> +		atomic_dec(&mfile->count);
> +		sock_put(sk);
> +		dev_put(dev);
> +err_free_sk:
> +		/* sk_destruct is mp_sock_destruct(), which kfree()s mp, so
> +		 * mp must not be freed a second time here.
> +		 */
> +		sk_free(sk);
> +		goto err_dev_put;
> +err_free_mp:
> +		kfree(mp);
> +err_dev_put:
> +		/* drops the reference taken by dev_get_by_name() */
> +		dev_put(dev);
> +		goto out;
> +
> +	case MPASSTHRU_UNBINDDEV:
> +		ret = do_unbind(mfile);
> +		break;
> +
> +	default:
> +		break;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * poll() handler of the mp character device.
> + *
> + * Reports POLLIN when the receive queue is non-empty and POLLOUT when the
> + * sock is writeable.  Returns POLLERR alone when nothing is bound or the
> + * bound net device is no longer registered.
> + */
> +static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	struct mp_struct *mp;
> +	struct sock *sk;
> +	unsigned int events = 0;
> +
> +	mp = mp_get(mfile);
> +	if (mp == NULL)
> +		return POLLERR;
> +
> +	sk = mp->socket.sk;
> +
> +	poll_wait(file, &mp->socket.wait, wait);
> +
> +	if (!skb_queue_empty(&sk->sk_receive_queue))
> +		events |= POLLIN | POLLRDNORM;
> +
> +	/* If not writeable now, set SOCK_ASYNC_NOSPACE and, when the bit
> +	 * was previously clear, check writeability once more.
> +	 */
> +	if (sock_writeable(sk))
> +		events |= POLLOUT | POLLWRNORM;
> +	else if (!test_and_set_bit(SOCK_ASYNC_NOSPACE,
> +				   &sk->sk_socket->flags) &&
> +		 sock_writeable(sk))
> +		events |= POLLOUT | POLLWRNORM;
> +
> +	if (mp->dev->reg_state != NETREG_REGISTERED)
> +		events = POLLERR;
> +
> +	mp_put(mfile);
> +	return events;
> +}
> +
> +/*
> + * release() handler of the mp character device: unbind whatever is bound,
> + * drop the namespace reference and free the per-open state.  Always
> + * returns 0.
> + */
> +static int mp_chr_close(struct inode *inode, struct file *file)
> +{
> +	struct mp_file *mfile = file->private_data;
> +
> +	/* A failure here just means no device was bound; nothing to do. */
> +	do_unbind(mfile);
> +
> +	put_net(mfile->net);
> +	kfree(mfile);
> +
> +	return 0;
> +}
> +
> +/* File operations of the mp character device; mp_get_socket() also uses
> + * the address of this table to recognise mp files.
> + */
> +static const struct file_operations mp_fops = {
> + .owner = THIS_MODULE,
> + .llseek = no_llseek,
> + .poll = mp_chr_poll,
> + .unlocked_ioctl = mp_chr_ioctl,
> + .open = mp_chr_open,
> + .release = mp_chr_close,
> +};
> +
> +/* Misc device registered in mp_init(), with a dynamically assigned minor
> + * number and node name "net/mp".
> + */
> +static struct miscdevice mp_miscdev = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = "mp",
> + .nodename = "net/mp",
> + .fops = &mp_fops,
> +};
> +
> +/*
> + * Netdevice notifier: when a device that has an mp port attached is being
> + * unregistered, unbind the mp device from it.  Always returns NOTIFY_DONE.
> + */
> +static int mp_device_event(struct notifier_block *unused,
> +			   unsigned long event, void *ptr)
> +{
> +	struct net_device *dev = ptr;
> +	struct mpassthru_port *port = dev->mp_port;
> +
> +	/* not one of ours */
> +	if (port == NULL)
> +		return NOTIFY_DONE;
> +
> +	if (event == NETDEV_UNREGISTER) {
> +		struct socket *sock = dev->mp_port->sock;
> +		struct mp_struct *mp;
> +
> +		mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +		do_unbind(mp->mfile);
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +/* Notifier block registered with the netdevice notifier chain in
> + * mp_init() and removed in mp_cleanup().
> + */
> +static struct notifier_block mp_notifier_block __read_mostly = {
> + .notifier_call = mp_device_event,
> +};
> +
> +/*
> + * Module init: register the misc device and the netdevice notifier.
> + *
> + * Returns 0 on success or the error of the registration that failed.  If
> + * the notifier cannot be registered the misc device is removed again, so
> + * a failed load leaves nothing behind.
> + */
> +static int mp_init(void)
> +{
> +	int ret;
> +
> +	ret = misc_register(&mp_miscdev);
> +	if (ret) {
> +		printk(KERN_ERR "mp: Can't register misc device\n");
> +		return ret;
> +	}
> +	printk(KERN_INFO "Registering mp misc device - minor = %d\n",
> +	       mp_miscdev.minor);
> +
> +	ret = register_netdevice_notifier(&mp_notifier_block);
> +	if (ret) {
> +		printk(KERN_ERR "mp: Can't register netdevice notifier\n");
> +		misc_deregister(&mp_miscdev);
> +	}
> +	return ret;
> +}
> +
> +/* Module exit: undo mp_init() in reverse order -- remove the netdevice
> + * notifier first, then the misc device.
> + */
> +void mp_cleanup(void)
> +{
> + unregister_netdevice_notifier(&mp_notifier_block);
> + misc_deregister(&mp_miscdev);
> +}
> +
> +/*
> + * Look up the socket object behind an mp file.
> + *
> + * Returns ERR_PTR(-EINVAL) if @file is not an mp character device and
> + * ERR_PTR(-EBADFD) if it is not attached to a device.  The socket that is
> + * returned behaves like a packet socket and can be passed to
> + * sock_sendmsg()/sock_recvmsg().  No reference is kept on behalf of the
> + * caller, who must hold a reference to the file for as long as the socket
> + * is in use.
> + */
> +struct socket *mp_get_socket(struct file *file)
> +{
> +	struct mp_file *mfile;
> +	struct mp_struct *mp;
> +
> +	if (file->f_op != &mp_fops)
> +		return ERR_PTR(-EINVAL);
> +
> +	mfile = file->private_data;
> +	mp = mp_get(mfile);
> +	if (mp == NULL)
> +		return ERR_PTR(-EBADFD);
> +
> +	mp_put(mfile);
> +	return &mp->socket;
> +}
> +EXPORT_SYMBOL_GPL(mp_get_socket);
> +
> +module_init(mp_init);
> +module_exit(mp_cleanup);
> +MODULE_AUTHOR(DRV_COPYRIGHT);
> +MODULE_DESCRIPTION(DRV_DESCRIPTION);
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
> new file mode 100644
> index 0000000..2be21c5
> --- /dev/null
> +++ b/include/linux/mpassthru.h
> @@ -0,0 +1,29 @@
> +#ifndef __MPASSTHRU_H
> +#define __MPASSTHRU_H
> +
> +#include <linux/types.h>
> +#include <linux/if_ether.h>
> +
> +/* ioctl defines */
> +#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
> +#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
> +
> +/* MPASSTHRU ifc flags */
> +#define IFF_MPASSTHRU 0x0001
> +#define IFF_MPASSTHRU_EXCL 0x0002
> +
> +#ifdef __KERNEL__
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +/* Provided by the mpassthru driver when it is built in or as a module. */
> +struct socket *mp_get_socket(struct file *);
> +#else
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +struct file;
> +struct socket;
> +/* Stub used when the driver is not configured: always fails with
> + * ERR_PTR(-EINVAL) so that callers build without the driver.
> + */
> +static inline struct socket *mp_get_socket(struct file *f)
> +{
> + return ERR_PTR(-EINVAL);
> +}
> +#endif /* CONFIG_VHOST_PASSTHRU */
> +#endif /* __KERNEL__ */
> +#endif /* __MPASSTHRU_H */
> --
> 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/