RE: [PATCH v2 5/7] vfio/pci: Add sriov_configure support

From: Tian, Kevin
Date: Mon Feb 24 2020 - 22:08:10 EST


> From: Alex Williamson
> Sent: Thursday, February 20, 2020 2:54 AM
>
> With the VF Token interface we can now expect that a vfio userspace
> driver must be in collaboration with the PF driver, an unwitting
> userspace driver will not be able to get past the GET_DEVICE_FD step
> in accessing the device. We can now move on to actually allowing
> SR-IOV to be enabled by vfio-pci on the PF. Support for this is not
> enabled by default in this commit, but it does provide a module option
> for this to be enabled (enable_sriov=1). Enabling VFs is rather
> straightforward, except we don't want to risk that a VF might get
> autoprobed and bound to other drivers, so a bus notifier is used to
> "capture" VFs to vfio-pci using the driver_override support. We
> assume any later action to bind the device to other drivers is
> condoned by the system admin and allow it with a log warning.
>
> vfio-pci will disable SR-IOV on a PF before releasing the device,
> allowing a VF driver to be assured other drivers cannot take over the
> PF and that any other userspace driver must know the shared VF token.
> This support also does not provide a mechanism for the PF userspace
> driver itself to manipulate SR-IOV through the vfio API. With this
> patch SR-IOV can only be enabled via the host sysfs interface and the
> PF driver user cannot create or remove VFs.

I'm not sure how many devices can be properly configured simply
with pci_enable_sriov. It is not unusual to require PF driver prepare
something before turning PCI SR-IOV capability. If you look kernel
PF drivers, there are only two using generic pci_sriov_configure_
simple (simple wrapper like pci_enable_sriov), while most others
implementing their own callback. However vfio itself has no idea
thus I'm not sure how an user knows whether using this option can
actually meet his purpose. I may miss something here, possibly
using DPDK as an example will make it clearer.

>
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> ---
> drivers/vfio/pci/vfio_pci.c | 106 +++++++++++++++++++++++++++++++--
> --
> drivers/vfio/pci/vfio_pci_private.h | 2 +
> 2 files changed, 97 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index e4d5d26e5e71..b40ade48a844 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -54,6 +54,12 @@ module_param(disable_idle_d3, bool, S_IRUGO |
> S_IWUSR);
> MODULE_PARM_DESC(disable_idle_d3,
> "Disable using the PCI D3 low power state for idle, unused
> devices");
>
> +static bool enable_sriov;
> +#ifdef CONFIG_PCI_IOV
> +module_param(enable_sriov, bool, 0644);
> +MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV
> configuration");
> +#endif
> +
> static inline bool vfio_vga_disabled(void)
> {
> #ifdef CONFIG_VFIO_PCI_VGA
> @@ -1528,6 +1534,35 @@ static const struct vfio_device_ops vfio_pci_ops =
> {
>
> static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
> static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> +static struct pci_driver vfio_pci_driver;
> +
> +static int vfio_pci_bus_notifier(struct notifier_block *nb,
> + unsigned long action, void *data)
> +{
> + struct vfio_pci_device *vdev = container_of(nb,
> + struct vfio_pci_device, nb);
> + struct device *dev = data;
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct pci_dev *physfn = pci_physfn(pdev);
> +
> + if (action == BUS_NOTIFY_ADD_DEVICE &&
> + pdev->is_virtfn && physfn == vdev->pdev) {
> + pci_info(vdev->pdev, "Captured SR-IOV VF %s
> driver_override\n",
> + pci_name(pdev));
> + pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
> + vfio_pci_ops.name);
> + } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
> + pdev->is_virtfn && physfn == vdev->pdev) {
> + struct pci_driver *drv = pci_dev_driver(pdev);
> +
> + if (drv && drv != &vfio_pci_driver)
> + pci_warn(vdev->pdev,
> + "VF %s bound to driver %s while PF bound to
> vfio-pci\n",
> + pci_name(pdev), drv->name);
> + }
> +
> + return 0;
> +}
>
> static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> {
> @@ -1539,12 +1574,12 @@ static int vfio_pci_probe(struct pci_dev *pdev,
> const struct pci_device_id *id)
> return -EINVAL;
>
> /*
> - * Prevent binding to PFs with VFs enabled, this too easily allows
> - * userspace instance with VFs and PFs from the same device, which
> - * cannot work. Disabling SR-IOV here would initiate removing the
> - * VFs, which would unbind the driver, which is prone to blocking
> - * if that VF is also in use by vfio-pci. Just reject these PFs
> - * and let the user sort it out.
> + * Prevent binding to PFs with VFs enabled, the VFs might be in use
> + * by the host or other users. We cannot capture the VFs if they
> + * already exist, nor can we track VF users. Disabling SR-IOV here
> + * would initiate removing the VFs, which would unbind the driver,
> + * which is prone to blocking if that VF is also in use by vfio-pci.
> + * Just reject these PFs and let the user sort it out.
> */
> if (pci_num_vf(pdev)) {
> pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
> @@ -1592,6 +1627,18 @@ static int vfio_pci_probe(struct pci_dev *pdev,
> const struct pci_device_id *id)
> kfree(vdev);
> return -ENOMEM;
> }
> +
> + vdev->nb.notifier_call = vfio_pci_bus_notifier;
> + ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
> + if (ret) {
> + kfree(vdev->vf_token);
> + vfio_pci_reflck_put(vdev->reflck);
> + vfio_del_group_dev(&pdev->dev);
> + vfio_iommu_group_put(group, &pdev->dev);
> + kfree(vdev);
> + return ret;
> + }
> +
> mutex_init(&vdev->vf_token->lock);
> uuid_gen(&vdev->vf_token->uuid);
> }
> @@ -1625,6 +1672,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
> {
> struct vfio_pci_device *vdev;
>
> + pci_disable_sriov(pdev);
> +
> vdev = vfio_del_group_dev(&pdev->dev);
> if (!vdev)
> return;
> @@ -1635,6 +1684,9 @@ static void vfio_pci_remove(struct pci_dev *pdev)
> kfree(vdev->vf_token);
> }
>
> + if (vdev->nb.notifier_call)
> + bus_unregister_notifier(&pci_bus_type, &vdev->nb);
> +
> vfio_pci_reflck_put(vdev->reflck);
>
> vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
> @@ -1683,16 +1735,48 @@ static pci_ers_result_t
> vfio_pci_aer_err_detected(struct pci_dev *pdev,
> return PCI_ERS_RESULT_CAN_RECOVER;
> }
>
> +static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
> +{
> + struct vfio_pci_device *vdev;
> + struct vfio_device *device;
> + int ret = 0;
> +
> + might_sleep();
> +
> + if (!enable_sriov)
> + return -ENOENT;
> +
> + device = vfio_device_get_from_dev(&pdev->dev);
> + if (!device)
> + return -ENODEV;
> +
> + vdev = vfio_device_data(device);
> + if (!vdev) {
> + vfio_device_put(device);
> + return -ENODEV;
> + }
> +
> + if (nr_virtfn == 0)
> + pci_disable_sriov(pdev);
> + else
> + ret = pci_enable_sriov(pdev, nr_virtfn);
> +
> + vfio_device_put(device);
> +
> + return ret < 0 ? ret : nr_virtfn;
> +}
> +
> static const struct pci_error_handlers vfio_err_handlers = {
> .error_detected = vfio_pci_aer_err_detected,
> };
>
> static struct pci_driver vfio_pci_driver = {
> - .name = "vfio-pci",
> - .id_table = NULL, /* only dynamic ids */
> - .probe = vfio_pci_probe,
> - .remove = vfio_pci_remove,
> - .err_handler = &vfio_err_handlers,
> + .name = "vfio-pci",
> + .id_table = NULL, /* only dynamic ids */
> + .probe = vfio_pci_probe,
> + .remove = vfio_pci_remove,
> + .sriov_configure = vfio_pci_sriov_configure,
> + .err_handler = &vfio_err_handlers,
> };
>
> static DEFINE_MUTEX(reflck_lock);
> diff --git a/drivers/vfio/pci/vfio_pci_private.h
> b/drivers/vfio/pci/vfio_pci_private.h
> index 76c11c915949..36ec69081ecd 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -13,6 +13,7 @@
> #include <linux/irqbypass.h>
> #include <linux/types.h>
> #include <linux/uuid.h>
> +#include <linux/notifier.h>
>
> #ifndef VFIO_PCI_PRIVATE_H
> #define VFIO_PCI_PRIVATE_H
> @@ -130,6 +131,7 @@ struct vfio_pci_device {
> struct mutex ioeventfds_lock;
> struct list_head ioeventfds_list;
> struct vfio_pci_vf_token *vf_token;
> + struct notifier_block nb;
> };
>
> #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)