RE: [PATCH] VFIO/VMBUS: Add VFIO VMBUS driver support

From: Michael Kelley
Date: Tue Nov 19 2019 - 18:37:50 EST


From: lantianyu1986@xxxxxxxxx <lantianyu1986@xxxxxxxxx> Sent: Monday, November 11, 2019 12:45 AM
>
> This patch is to add VFIO VMBUS driver support in order to expose
> VMBUS devices to user space drivers(Reference Hyper-V UIO driver).
> DPDK now has netvsc PMD driver support and it may get VMBUS resources
> via VFIO interface with new driver support.
>
> So far, Hyper-V doesn't provide virtual IOMMU support and so this
> driver needs to be used with VFIO noiommu mode.
>
> Current netvsc PMD driver in DPDK doesn't have IRQ mode support.
> This driver version skips IRQ control part and adds later when
> there is a real request.

Let me suggest some cleaned up wording for the commit message:

Add a VFIO VMBus driver to expose VMBus devices to user-space
drivers in a manner similar to the Hyper-V UIO driver. For example,
DPDK has a netvsc Poll-Mode Driver (PMD) and it can get VMBus
resources via the VFIO interface with this new driver.

Hyper-V doesn't provide a virtual IOMMU in guest VMs, so this
driver must be used in VFIO noiommu mode.

The current netvsc PMD driver in DPDK doesn't use IRQ mode so this
driver does not implement IRQ control. IRQ control can be added
later when there is a PMD driver that needs it.

>
> Signed-off-by: Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>
> ---
> MAINTAINERS | 1 +
> drivers/vfio/Kconfig | 1 +
> drivers/vfio/Makefile | 1 +
> drivers/vfio/vmbus/Kconfig | 9 +
> drivers/vfio/vmbus/vfio_vmbus.c | 407
> ++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 12 ++
> 6 files changed, 431 insertions(+)
> create mode 100644 drivers/vfio/vmbus/Kconfig
> create mode 100644 drivers/vfio/vmbus/vfio_vmbus.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 55199ef7fa74..6f61fb351a5d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7574,6 +7574,7 @@ F: drivers/scsi/storvsc_drv.c
> F: drivers/uio/uio_hv_generic.c
> F: drivers/video/fbdev/hyperv_fb.c
> F: drivers/iommu/hyperv-iommu.c
> +F: drivers/vfio/vmbus/
> F: net/vmw_vsock/hyperv_transport.c
> F: include/clocksource/hyperv_timer.h
> F: include/linux/hyperv.h
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index fd17db9b432f..f4e075fcedbe 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -47,4 +47,5 @@ menuconfig VFIO_NOIOMMU
> source "drivers/vfio/pci/Kconfig"
> source "drivers/vfio/platform/Kconfig"
> source "drivers/vfio/mdev/Kconfig"
> +source "drivers/vfio/vmbus/Kconfig"
> source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index de67c4725cce..acef916cba7f 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
> obj-$(CONFIG_VFIO_PCI) += pci/
> obj-$(CONFIG_VFIO_PLATFORM) += platform/
> obj-$(CONFIG_VFIO_MDEV) += mdev/
> +obj-$(CONFIG_VFIO_VMBUS) += vmbus/
> diff --git a/drivers/vfio/vmbus/Kconfig b/drivers/vfio/vmbus/Kconfig
> new file mode 100644
> index 000000000000..27a85daae55a
> --- /dev/null
> +++ b/drivers/vfio/vmbus/Kconfig
> @@ -0,0 +1,9 @@
> +config VFIO_VMBUS
> + tristate "VFIO support for vmbus devices"
> + depends on VFIO && HYPERV
> + help
> + Support for the VMBUS VFIO bus driver. Expose VMBUS
> + resources to user space drivers(e.g, DPDK and SPDK).
> +
> + If you don't know what to do here, say N.
> +

Let's use consistent capitalization of "VMBus". So:

config VFIO_VMBUS
tristate "VFIO support for Hyper-V VMBus devices"
depends on VFIO && HYPERV
help
Support for the VMBus VFIO driver, which exposes VMBus
resources to user space drivers (e.g., DPDK and SPDK).

If you don't know what to do here, say N.


> diff --git a/drivers/vfio/vmbus/vfio_vmbus.c b/drivers/vfio/vmbus/vfio_vmbus.c
> new file mode 100644
> index 000000000000..25d9cafa2c0a
> --- /dev/null
> +++ b/drivers/vfio/vmbus/vfio_vmbus.c
> @@ -0,0 +1,407 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * VFIO VMBUS driver.
> + *
> + * Copyright (C) 2019, Microsoft, Inc.
> + *
> + * Author : Lan Tianyu <Tianyu.Lan@xxxxxxxxxxxxx>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/vfio.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/hyperv.h>
> +
> +#include "../../hv/hyperv_vmbus.h"
> +
> +
> +#define DRIVER_VERSION "0.0.1"
> +#define DRIVER_AUTHOR "Tianyu Lan <Tianyu.Lan@xxxxxxxxxxxxx>"
> +#define DRIVER_DESC "VFIO driver for VMBus devices"
> +
> +#define HV_RING_SIZE (512 * HV_HYP_PAGE_SIZE) /* 2M Ring size */
> +#define SEND_BUFFER_SIZE (16 * 1024 * 1024)
> +#define RECV_BUFFER_SIZE (31 * 1024 * 1024)
> +
> +struct vmbus_mem {
> + phys_addr_t addr;
> + resource_size_t size;
> +};
> +
> +struct vfio_vmbus_device {
> + struct hv_device *hdev;
> + atomic_t refcnt;
> + struct vmbus_mem mem[VFIO_VMBUS_MAX_MAP_NUM];
> +
> + void *recv_buf;
> + void *send_buf;
> +};
> +
> +static void vfio_vmbus_channel_cb(void *context)
> +{
> + struct vmbus_channel *chan = context;
> +
> + /* Disable interrupts on channel */
> + chan->inbound.ring_buffer->interrupt_mask = 1;
> +
> + /* Issue a full memory barrier after masking interrupt */
> + virt_mb();
> +}
> +
> +static int vfio_vmbus_ring_mmap(struct file *filp, struct kobject *kobj,
> + struct bin_attribute *attr,
> + struct vm_area_struct *vma)
> +{
> + struct vmbus_channel *channel
> + = container_of(kobj, struct vmbus_channel, kobj);
> + void *ring_buffer = page_address(channel->ringbuffer_page);
> +
> + if (channel->state != CHANNEL_OPENED_STATE)
> + return -ENODEV;
> +
> + return vm_iomap_memory(vma, virt_to_phys(ring_buffer),
> + channel->ringbuffer_pagecount << PAGE_SHIFT);

Use HV_HYP_PAGE_SHIFT since ringbuffer_pagecount is in units of the
Hyper-V page size, which may be different from the guest PAGE_SIZE.

> +}
> +
> +static const struct bin_attribute ring_buffer_bin_attr = {
> + .attr = {
> + .name = "ring",
> + .mode = 0600,
> + },
> + .size = 2 * HV_RING_SIZE,
> + .mmap = vfio_vmbus_ring_mmap,
> +};
> +

[snip]

> +static int vfio_vmbus_probe(struct hv_device *dev,
> + const struct hv_vmbus_device_id *dev_id)
> +{
> + struct vmbus_channel *channel = dev->channel;
> + struct vfio_vmbus_device *vdev;
> + struct iommu_group *group;
> + u32 gpadl;
> + void *ring_buffer;
> + int ret;
> +
> + group = vfio_iommu_group_get(&dev->device);
> + if (!group)
> + return -EINVAL;
> +
> + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
> + if (!vdev) {
> + vfio_iommu_group_put(group, &dev->device);
> + return -ENOMEM;
> + }
> +
> + ret = vfio_add_group_dev(&dev->device, &vfio_vmbus_ops, vdev);
> + if (ret)
> + goto free_vdev;
> +
> + vdev->hdev = dev;
> + ret = vmbus_alloc_ring(channel, HV_RING_SIZE, HV_RING_SIZE);
> + if (ret)
> + goto del_group_dev;
> +
> + set_channel_read_mode(channel, HV_CALL_ISR);
> +
> + ring_buffer = page_address(channel->ringbuffer_page);
> + vdev->mem[VFIO_VMBUS_TXRX_RING_MAP].addr
> + = virt_to_phys(ring_buffer);
> + vdev->mem[VFIO_VMBUS_TXRX_RING_MAP].size
> + = channel->ringbuffer_pagecount << PAGE_SHIFT;

Use HV_HYP_PAGE_SHIFT per my earlier comment.

> +
> + vdev->mem[VFIO_VMBUS_INT_PAGE_MAP].addr
> + = (phys_addr_t)vmbus_connection.int_page;
> + vdev->mem[VFIO_VMBUS_INT_PAGE_MAP].size = PAGE_SIZE;

Use HV_HYP_PAGE_SIZE since the interrupt page is sized to the
Hyper-V page size, not the guest page size.

> +
> + vdev->mem[VFIO_VMBUS_MON_PAGE_MAP].addr
> + = (phys_addr_t)vmbus_connection.monitor_pages[1];
> + vdev->mem[VFIO_VMBUS_MON_PAGE_MAP].size = PAGE_SIZE;

Same here for the monitor page.

> +
> + vdev->recv_buf = vzalloc(RECV_BUFFER_SIZE);
> + if (vdev->recv_buf == NULL) {
> + ret = -ENOMEM;
> + goto free_ring;
> + }
> +
> + ret = vmbus_establish_gpadl(channel, vdev->recv_buf,
> + RECV_BUFFER_SIZE, &gpadl);
> + if (ret)
> + goto free_recv_buf;
> +
> + vdev->mem[VFIO_VMBUS_RECV_BUF_MAP].addr
> + = (phys_addr_t)vdev->recv_buf;
> + vdev->mem[VFIO_VMBUS_RECV_BUF_MAP].size = RECV_BUFFER_SIZE;
> +
> + /* Expose Recv GPADL via region info. */
> + vdev->mem[VFIO_VMBUS_RECV_GPADL].addr = gpadl;
> +
> + vdev->send_buf = vzalloc(SEND_BUFFER_SIZE);
> + if (vdev->send_buf == NULL) {
> + ret = -ENOMEM;
> + goto free_recv_gpadl;
> + }
> +
> + ret = vmbus_establish_gpadl(channel, vdev->send_buf,
> + SEND_BUFFER_SIZE, &gpadl);
> + if (ret)
> + goto free_send_buf;
> +
> + vdev->mem[VFIO_VMBUS_SEND_BUF_MAP].addr
> + = (phys_addr_t)vdev->send_buf;
> + vdev->mem[VFIO_VMBUS_SEND_BUF_MAP].size = SEND_BUFFER_SIZE;
> +
> + /* Expose Send GPADL via region info. */
> + vdev->mem[VFIO_VMBUS_SEND_GPADL].addr = gpadl;
> +
> + ret = sysfs_create_bin_file(&channel->kobj, &ring_buffer_bin_attr);
> + if (ret)
> + dev_notice(&dev->device,
> + "sysfs create ring bin file failed; %d\n", ret);
> +
> + return 0;
> +
> + free_send_buf:
> + vfree(vdev->send_buf);
> + free_recv_gpadl:
> + vmbus_teardown_gpadl(channel, vdev->mem[VFIO_VMBUS_RECV_GPADL].addr);
> + free_recv_buf:
> + vfree(vdev->recv_buf);
> + free_ring:
> + vmbus_free_ring(channel);
> + del_group_dev:
> + vfio_del_group_dev(&dev->device);
> + free_vdev:
> + vfio_iommu_group_put(group, &dev->device);
> + kfree(vdev);
> + return -EINVAL;
> +}
> +

[snip]

> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 9e843a147ead..611baf7a30e4 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -201,6 +201,7 @@ struct vfio_device_info {
> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
> #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
> #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
> +#define VFIO_DEVICE_FLAGS_VMBUS (1 << 6) /* vfio-vmbus device */
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> };
> @@ -564,6 +565,17 @@ enum {
> VFIO_PCI_NUM_IRQS
> };
>
> +enum {
> + VFIO_VMBUS_TXRX_RING_MAP = 0,
> + VFIO_VMBUS_INT_PAGE_MAP,
> + VFIO_VMBUS_MON_PAGE_MAP,
> + VFIO_VMBUS_RECV_BUF_MAP,
> + VFIO_VMBUS_SEND_BUF_MAP,
> + VFIO_VMBUS_RECV_GPADL,
> + VFIO_VMBUS_SEND_GPADL,
> + VFIO_VMBUS_MAX_MAP_NUM,

While the code that uses VFIO_VMBUS_MAX_MAP_NUM appears
correct, the "MAX_MAP_NUM" in the symbol name tends to
imply that this is an index value. But it's not the max index
value -- it's actually a "count" or "size". I think it would be
clearer as VFIO_VMBUS_MAP_COUNT or
VFIO_VMBUS_MAP_SIZE.

> +};
> +
> /*
> * The vfio-ccw bus driver makes use of the following fixed region and
> * IRQ index mapping. Unimplemented regions return a size of zero.
> --
> 2.14.5