Re: [PATCH v1 3/3] Let host NIC driver to DMA to guest user space.

From: Michael S. Tsirkin
Date: Mon Mar 08 2010 - 06:22:15 EST


On Sat, Mar 06, 2010 at 05:38:38PM +0800, xiaohui.xin@xxxxxxxxx wrote:
> From: Xin Xiaohui <xiaohui.xin@xxxxxxxxx>
>
> The patch let host NIC driver to receive user space skb,
> then the driver has chance to directly DMA to guest user
> space buffers thru single ethX interface.
>
> Signed-off-by: Xin Xiaohui <xiaohui.xin@xxxxxxxxx>
> Signed-off-by: Zhao Yu <yzhao81@xxxxxxxxx>
> Sigend-off-by: Jeff Dike <jdike@xxxxxxxxxxxxxxxxxxxxxx>


I have a feeling I commented on some of the below issues already.
Do you plan to send a version with comments addressed?

> ---
> include/linux/netdevice.h | 76 ++++++++++++++++++++++++++++++++++++++++++-
> include/linux/skbuff.h | 30 +++++++++++++++--
> net/core/dev.c | 32 ++++++++++++++++++
> net/core/skbuff.c | 79 +++++++++++++++++++++++++++++++++++++++++----
> 4 files changed, 205 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 94958c1..97bf12c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -485,6 +485,17 @@ struct netdev_queue {
> unsigned long tx_dropped;
> } ____cacheline_aligned_in_smp;
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct mpassthru_port {
> + int hdr_len;
> + int data_len;
> + int npages;
> + unsigned flags;
> + struct socket *sock;
> + struct skb_user_page *(*ctor)(struct mpassthru_port *,
> + struct sk_buff *, int);
> +};
> +#endif
>
> /*
> * This structure defines the management hooks for network devices.
> @@ -636,6 +647,10 @@ struct net_device_ops {
> int (*ndo_fcoe_ddp_done)(struct net_device *dev,
> u16 xid);
> #endif
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + int (*ndo_mp_port_prep)(struct net_device *dev,
> + struct mpassthru_port *port);
> +#endif
> };
>
> /*
> @@ -891,7 +906,8 @@ struct net_device
> struct macvlan_port *macvlan_port;
> /* GARP */
> struct garp_port *garp_port;
> -
> + /* mpassthru */
> + struct mpassthru_port *mp_port;
> /* class/net/name entry */
> struct device dev;
> /* space for optional statistics and wireless sysfs groups */
> @@ -2013,6 +2029,62 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
> return 0;
> return dev->ethtool_ops->get_flags(dev);
> }
> -#endif /* __KERNEL__ */
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline int netdev_mp_port_prep(struct net_device *dev,
> + struct mpassthru_port *port)
> +{

This function lacks documentation.

> + int rc;
> + int npages, data_len;
> + const struct net_device_ops *ops = dev->netdev_ops;
> +
> + /* needed by packet split */
> + if (ops->ndo_mp_port_prep) {
> + rc = ops->ndo_mp_port_prep(dev, port);
> + if (rc)
> + return rc;
> + } else { /* should be temp */
> + port->hdr_len = 128;
> + port->data_len = 2048;
> + port->npages = 1;

where do the numbers come from?

> + }
> +
> + if (port->hdr_len <= 0)
> + goto err;
> +
> + npages = port->npages;
> + data_len = port->data_len;
> + if (npages <= 0 || npages > MAX_SKB_FRAGS ||
> + (data_len < PAGE_SIZE * (npages - 1) ||
> + data_len > PAGE_SIZE * npages))
> + goto err;
> +
> + return 0;
> +err:
> + dev_warn(&dev->dev, "invalid page constructor parameters\n");
> +
> + return -EINVAL;
> +}
> +
> +static inline int netdev_mp_port_attach(struct net_device *dev,
> + struct mpassthru_port *port)
> +{
> + if (rcu_dereference(dev->mp_port))
> + return -EBUSY;
> +
> + rcu_assign_pointer(dev->mp_port, port);
> +
> + return 0;
> +}
> +
> +static inline void netdev_mp_port_detach(struct net_device *dev)
> +{
> + if (!rcu_dereference(dev->mp_port))
> + return;
> +
> + rcu_assign_pointer(dev->mp_port, NULL);
> + synchronize_rcu();
> +}

The above looks wrong: rcu_dereference should be called
under the RCU read side; rcu_assign_pointer usually should not
be, and synchronize_rcu definitely should not be.

As I suggested already, these functions are better open-coded;
RCU is tricky enough as it is without hiding it in inline helpers.

> +#endif /* CONFIG_VHOST_PASSTHRU */
> +#endif /* __KERNEL__ */
> #endif /* _LINUX_NETDEVICE_H */
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index df7b23a..e59fa57 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -209,6 +209,13 @@ struct skb_shared_info {
> void * destructor_arg;
> };
>
> +struct skb_user_page {
> + u8 *start;
> + int size;
> + struct skb_frag_struct *frags;
> + struct skb_shared_info *ushinfo;
> + void (*dtor)(struct skb_user_page *);
> +};
> /* We divide dataref into two halves. The higher 16 bits hold references
> * to the payload part of skb->data. The lower 16 bits hold references to
> * the entire skb->data. A clone of a headerless skb holds the length of
> @@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
> extern void consume_skb(struct sk_buff *skb);
> extern void __kfree_skb(struct sk_buff *skb);
> extern struct sk_buff *__alloc_skb(unsigned int size,
> - gfp_t priority, int fclone, int node);
> + gfp_t priority, int fclone,
> + int node, struct net_device *dev);
> static inline struct sk_buff *alloc_skb(unsigned int size,
> gfp_t priority)
> {
> - return __alloc_skb(size, priority, 0, -1);
> + return __alloc_skb(size, priority, 0, -1, NULL);
> }
>
> static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
> gfp_t priority)
> {
> - return __alloc_skb(size, priority, 1, -1);
> + return __alloc_skb(size, priority, 1, -1, NULL);
> }
>
> extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
> @@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
> __free_page(page);
> }
>
> +extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> + struct sk_buff *skb, int npages);
> +
> +static inline struct skb_user_page *netdev_alloc_user_page(
> + struct net_device *dev,
> + struct sk_buff *skb, unsigned int size)
> +{
> + struct skb_user_page *user;
> + int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);

Should round up to full pages?

> +
> + user = netdev_alloc_user_pages(dev, skb, npages);
> + if (likely(user))
> + return user;
> + return NULL;
> +}
> +
> /**
> * skb_clone_writable - is the header of a clone writable
> * @skb: buffer to check
> diff --git a/net/core/dev.c b/net/core/dev.c
> index b8f74cf..ab8b082 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2265,6 +2265,30 @@ void netif_nit_deliver(struct sk_buff *skb)
> rcu_read_unlock();
> }
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
> + struct packet_type **pt_prev,
> + int *ret, struct net_device *orig_dev)

please document

pt_prev, orig_dev and ret seem unused?

> +{
> + struct mpassthru_port *ctor = NULL;

Why do you call the port "ctor"?

> + struct sock *sk = NULL;
> +
> + if (skb->dev)
> + ctor = skb->dev->mp_port;
> + if (!ctor)
> + return skb;
> +
> + sk = ctor->sock->sk;
> +
> + skb_queue_tail(&sk->sk_receive_queue, skb);
> +
> + sk->sk_data_ready(sk, skb->len);
> + return NULL;
> +}
> +#else
> +#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
> +#endif
> +
> /**
> * netif_receive_skb - process receive buffer from network
> * @skb: buffer to process
> @@ -2342,6 +2366,9 @@ int netif_receive_skb(struct sk_buff *skb)
> goto out;
> ncls:
> #endif
> + skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
> + if (!skb)
> + goto out;
>
> skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
> if (!skb)
> @@ -2455,6 +2482,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> if (skb_is_gso(skb) || skb_has_frags(skb))
> goto normal;
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + if (skb->dev && skb->dev->mp_port)
> + goto normal;
> +#endif
> +
> rcu_read_lock();
> list_for_each_entry_rcu(ptype, head, list) {
> if (ptype->type != type || ptype->dev || !ptype->gro_receive)
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 80a9616..6510e5b 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
> * %GFP_ATOMIC.
> */
> struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> - int fclone, int node)
> + int fclone, int node, struct net_device *dev)
> {
> struct kmem_cache *cache;
> struct skb_shared_info *shinfo;
> struct sk_buff *skb;
> u8 *data;
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + struct skb_user_page *user = NULL;
> +#endif
> cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>
> /* Get the HEAD */
> @@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> goto out;
>
> size = SKB_DATA_ALIGN(size);
> - data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
> - gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + if (!dev || !dev->mp_port) { /* Legacy alloc func */
> +#endif
> + data = kmalloc_node_track_caller(
> + size + sizeof(struct skb_shared_info),
> + gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + } else { /* Allocation may from page constructor of device */

what does the comment mean?

> + user = netdev_alloc_user_page(dev, skb, size);
> + if (!user) {
> + data = kmalloc_node_track_caller(
> + size + sizeof(struct skb_shared_info),
> + gfp_mask, node);
> + printk(KERN_INFO "can't alloc user buffer.\n");
> + } else {
> + data = user->start;
> + size = SKB_DATA_ALIGN(user->size);
> + }
> + }
> +#endif
> if (!data)
> goto nodata;
>
> @@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> skb->mac_header = ~0U;
> #endif
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + if (user)
> + memcpy(user->ushinfo, skb_shinfo(skb),
> + sizeof(struct skb_shared_info));
> +#endif
> /* make sure we initialize shinfo sequentially */
> shinfo = skb_shinfo(skb);
> atomic_set(&shinfo->dataref, 1);
> @@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>
> child->fclone = SKB_FCLONE_UNAVAILABLE;
> }
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + shinfo->destructor_arg = user;
> +#endif
> +
> out:
> return skb;
> nodata:
> @@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
> int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
> struct sk_buff *skb;
>
> - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
> + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
> if (likely(skb)) {
> skb_reserve(skb, NET_SKB_PAD);
> skb->dev = dev;
> @@ -278,6 +307,29 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
> }
> EXPORT_SYMBOL(__netdev_alloc_page);
>
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> + struct sk_buff *skb, int npages)
> +{
> + struct mpassthru_port *ctor;
> + struct skb_user_page *user = NULL;
> +
> + rcu_read_lock();
> + ctor = rcu_dereference(dev->mp_port);
> + if (!ctor)
> + goto out;
> +
> + BUG_ON(npages > ctor->npages);
> +
> + user = ctor->ctor(ctor, skb, npages);

With the assumption that "ctor" pins userspace pages,
can't it sleep? If so, you can't call it under an RCU read-side
critical section.

> +out:
> + rcu_read_unlock();
> +
> + return user;
> +}
> +EXPORT_SYMBOL(netdev_alloc_user_pages);
> +#endif
> +
> void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
> int size)
> {
> @@ -338,6 +390,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
>
> static void skb_release_data(struct sk_buff *skb)
> {
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
> +#endif
> +
> if (!skb->cloned ||
> !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
> &skb_shinfo(skb)->dataref)) {
> @@ -349,7 +405,10 @@ static void skb_release_data(struct sk_buff *skb)
>
> if (skb_has_frags(skb))
> skb_drop_fraglist(skb);
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + if (skb->dev && skb->dev->mp_port && user && user->dtor)
> + user->dtor(user);
> +#endif
> kfree(skb->head);
> }
> }
> @@ -503,8 +562,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
> if (skb_shared(skb) || skb_cloned(skb))
> return 0;
>
> - skb_release_head_state(skb);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> + if (skb->dev && skb->dev->mp_port)
> + return 0;
> +#endif
> +
> shinfo = skb_shinfo(skb);
> +
> + skb_release_head_state(skb);
> atomic_set(&shinfo->dataref, 1);
> shinfo->nr_frags = 0;
> shinfo->gso_size = 0;
> --
> 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/