[PATCH 4/9] mlx5-vfio-pci: add new vfio_pci driver for mlx5 devices
From: Max Gurtovoy
Date: Mon Feb 01 2021 - 11:30:30 EST
This driver registers with both the PCI bus and the Auxiliary bus. If the
probes of both devices succeed, we'll have a vendor-specific VFIO
PCI device. mlx5_vfio_pci uses vfio_pci_core to register and create a
VFIO device, and uses the auxiliary_device to get the needed extensions
from the vendor device driver. If either probe() function fails, the
VFIO char device will not be created. For now, only register and bind
the auxiliary_device to the pci_device when the auxiliary_device id
matches the pci_device BDF. Later, vendor-specific
features such as live migration will be added and made available to
the virtualization software.
Note: Although we've created mlx5-vfio-pci.ko, binding to
vfio-pci.ko still works as before; it's fully backward compatible.
Of course, the extended vendor functionality will not be available if
the device is bound to the generic vfio_pci.ko.
Signed-off-by: Max Gurtovoy <mgurtovoy@xxxxxxxxxx>
---
drivers/vfio/pci/Kconfig | 10 ++
drivers/vfio/pci/Makefile | 3 +
drivers/vfio/pci/mlx5_vfio_pci.c | 253 +++++++++++++++++++++++++++++++
include/linux/mlx5/vfio_pci.h | 36 +++++
4 files changed, 302 insertions(+)
create mode 100644 drivers/vfio/pci/mlx5_vfio_pci.c
create mode 100644 include/linux/mlx5/vfio_pci.h
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index b958a48f63a0..dcb164d7d641 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -65,3 +65,13 @@ config VFIO_PCI_ZDEV
for zPCI devices passed through via VFIO on s390.
Say Y here.
+
+config MLX5_VFIO_PCI
+ tristate "VFIO support for MLX5 PCI devices"
+ depends on VFIO_PCI_CORE && MLX5_CORE
+ select AUXILIARY_BUS
+ help
+ This provides generic PCI support for MLX5 devices using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 3f2a27e222cd..9f67edca31c5 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
@@ -9,3 +10,5 @@ vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
vfio-pci-y := vfio_pci.o
+
+mlx5-vfio-pci-y := mlx5_vfio_pci.o
diff --git a/drivers/vfio/pci/mlx5_vfio_pci.c b/drivers/vfio/pci/mlx5_vfio_pci.c
new file mode 100644
index 000000000000..4e6b256c74bf
--- /dev/null
+++ b/drivers/vfio/pci/mlx5_vfio_pci.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * Author: Max Gurtovoy <mgurtovoy@xxxxxxxxxx>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/mlx5/vfio_pci.h>
+
+#include "vfio_pci_core.h"
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "Max Gurtovoy <mgurtovoy@xxxxxxxxxx>"
+#define DRIVER_DESC "MLX5 VFIO PCI - User Level meta-driver for NVIDIA MLX5 device family"
+
+/* Size (16 KiB) of the data[] array in struct mlx5_vfio_pci_migration_info */
+#define MLX5_MIGRATION_REGION_DATA_SIZE SZ_16K
+/*
+ * Offset of the data section from the start of the migration region: the
+ * size of the struct vfio_device_migration_info header.
+ */
+#define MLX5_MIGRATION_REGION_DATA_OFFSET (sizeof(struct vfio_device_migration_info))
+
+struct mlx5_vfio_pci_migration_info { /* buffer registered as the migration region data */
+ struct vfio_device_migration_info mig; /* VFIO migration header, first member */
+ char data[MLX5_MIGRATION_REGION_DATA_SIZE]; /* payload area following the header */
+};
+
+static LIST_HEAD(aux_devs_list); /* mlx5_vfio_pci_adev entries added by the aux probe */
+static DEFINE_MUTEX(aux_devs_lock); /* held around every walk and update of aux_devs_list */
+
+/*
+ * Look up the auxiliary device that was listed for @pdev.
+ *
+ * Walks aux_devs_list under aux_devs_lock and compares the id of each
+ * auxiliary device against pci_dev_id(@pdev).
+ *
+ * Returns the first matching entry, or NULL when none is listed. No
+ * reference is taken and the lock is released before returning, so the lock
+ * itself does not keep the returned entry on the list for the caller.
+ */
+static struct mlx5_vfio_pci_adev *mlx5_vfio_pci_find_adev(struct pci_dev *pdev)
+{
+	struct mlx5_vfio_pci_adev *cur;
+	struct mlx5_vfio_pci_adev *match = NULL;
+
+	mutex_lock(&aux_devs_lock);
+	list_for_each_entry(cur, &aux_devs_list, entry) {
+		if (cur->madev.adev.id != pci_dev_id(pdev))
+			continue;
+
+		match = cur;
+		break;
+	}
+	mutex_unlock(&aux_devs_lock);
+
+	return match;
+}
+
+/*
+ * Auxiliary bus probe: log the device and add it to aux_devs_list.
+ *
+ * The auxiliary device id is printed as a PCI bus:slot.function triple; the
+ * PCI probe later matches the same id against pci_dev_id().
+ * @id is not used. Always returns 0.
+ */
+static int mlx5_vfio_pci_aux_probe(struct auxiliary_device *adev,
+				   const struct auxiliary_device_id *id)
+{
+	struct mlx5_vfio_pci_adev *mvadev = adev_to_mvadev(adev);
+	u32 devid = adev->id & 0xffff;
+	u32 devfn = adev->id & 0xff;
+
+	pr_info("%s aux probing bdf %02x:%02x.%d mdev is %s\n",
+		adev->name, PCI_BUS_NUM(devid), PCI_SLOT(devfn),
+		PCI_FUNC(devfn), dev_name(mvadev->madev.mdev->device));
+
+	/* Make the device visible to mlx5_vfio_pci_find_adev(). */
+	mutex_lock(&aux_devs_lock);
+	list_add(&mvadev->entry, &aux_devs_list);
+	mutex_unlock(&aux_devs_lock);
+
+	return 0;
+}
+
+/*
+ * Auxiliary bus remove: take the device off aux_devs_list.
+ *
+ * If a vfio_pci_core_device is still stored as the auxiliary device's
+ * drvdata (set by the PCI probe, cleared by the PCI remove), its PCI function
+ * is reset first. The result of pci_reset_function() is not checked.
+ */
+static void mlx5_vfio_pci_aux_remove(struct auxiliary_device *adev)
+{
+	struct vfio_pci_core_device *bound = dev_get_drvdata(&adev->dev);
+	struct mlx5_vfio_pci_adev *listed = adev_to_mvadev(adev);
+
+	/* TODO: is this the right thing to do ? maybe FLR ? */
+	if (bound != NULL)
+		pci_reset_function(bound->pdev);
+
+	mutex_lock(&aux_devs_lock);
+	list_del(&listed->entry);
+	mutex_unlock(&aux_devs_lock);
+}
+
+static const struct auxiliary_device_id mlx5_vfio_pci_aux_id_table[] = { /* match table of mlx5_vfio_pci_aux_driver */
+ { .name = MLX5_ADEV_NAME ".vfio_pci", }, /* MLX5_ADEV_NAME followed by ".vfio_pci" */
+ {}, /* empty terminating entry */
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5_vfio_pci_aux_id_table);
+
+static struct auxiliary_driver mlx5_vfio_pci_aux_driver = { /* registered in mlx5_vfio_pci_init() */
+ .name = "vfio_pci_ex",
+ .probe = mlx5_vfio_pci_aux_probe, /* adds the device to aux_devs_list */
+ .remove = mlx5_vfio_pci_aux_remove, /* resets a still-bound PCI function, then unlists the device */
+ .id_table = mlx5_vfio_pci_aux_id_table,
+};
+
+/*
+ * .release callback of the migration region: free the buffer stored in
+ * @region->data (allocated by mlx5_vfio_pci_op_init()). @vpdev is not used.
+ */
+static void mlx5_vfio_pci_mig_release(struct vfio_pci_core_device *vpdev,
+				      struct vfio_pci_region *region)
+{
+	void *mig_buf = region->data;
+
+	kfree(mig_buf);
+}
+
+static size_t mlx5_vfio_pci_mig_rw(struct vfio_pci_core_device *vpdev,
+ char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+ /*
+  * TODO: add all migration logic here
+  *
+  * Placeholder .rw callback of the migration region: every access fails
+  * with -EINVAL and none of the arguments is used yet.
+  * NOTE(review): the return type is size_t, so -EINVAL is converted to a
+  * large unsigned value; presumably the caller converts it back to a
+  * signed type - confirm against the users of vfio_pci_regops.
+  */
+
+ return -EINVAL;
+}
+
+/*
+ * Callbacks of the migration region registered by mlx5_vfio_pci_op_init().
+ * The table is never written after initialization, so it is const.
+ * NOTE: the identifier is misspelled ("migraion"); it is left unchanged
+ * because mlx5_vfio_pci_op_init() refers to it by this name.
+ */
+static const struct vfio_pci_regops migraion_ops = {
+	.rw = mlx5_vfio_pci_mig_rw,
+	.release = mlx5_vfio_pci_mig_release,
+};
+
+/*
+ * .init callback of mlx5_vfio_pci_ops: allocate a zeroed
+ * struct mlx5_vfio_pci_migration_info and register it as a readable and
+ * writable VFIO migration region of @vpdev.
+ *
+ * Returns 0 on success, -ENOMEM when the allocation fails, or the error
+ * returned by vfio_pci_register_dev_region(); in the latter case the buffer
+ * is freed before returning.
+ */
+static int mlx5_vfio_pci_op_init(struct vfio_pci_core_device *vpdev)
+{
+	const u32 region_flags = VFIO_REGION_INFO_FLAG_READ |
+				 VFIO_REGION_INFO_FLAG_WRITE;
+	struct mlx5_vfio_pci_migration_info *mig_buf;
+	int err;
+
+	mig_buf = kzalloc(sizeof(*mig_buf), GFP_KERNEL);
+	if (!mig_buf)
+		return -ENOMEM;
+
+	err = vfio_pci_register_dev_region(vpdev, VFIO_REGION_TYPE_MIGRATION,
+					   VFIO_REGION_SUBTYPE_MIGRATION,
+					   &migraion_ops, sizeof(*mig_buf),
+					   region_flags, mig_buf);
+	/* Registration failed: the buffer is still ours to release. */
+	if (err)
+		kfree(mig_buf);
+
+	return err;
+}
+
+static const struct vfio_pci_device_ops mlx5_vfio_pci_ops = { /* passed to vfio_create_pci_device() by the PCI probe */
+ .name = "mlx5-vfio-pci",
+ .module = THIS_MODULE,
+ .init = mlx5_vfio_pci_op_init, /* registers the migration region */
+};
+
+/*
+ * PCI probe: pair @pdev with its auxiliary device and create the VFIO device.
+ *
+ * Fails with -ENODEV when no auxiliary device matching @pdev is on
+ * aux_devs_list, or with the error encoded in the pointer returned by
+ * vfio_create_pci_device(). On success the new vfio_pci_core_device is
+ * stored as the auxiliary device's drvdata, where the auxiliary remove
+ * callback reads it. @id is not used.
+ */
+static int mlx5_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct mlx5_vfio_pci_adev *aux = mlx5_vfio_pci_find_adev(pdev);
+	struct vfio_pci_core_device *core_dev;
+
+	if (aux == NULL) {
+		pr_err("failed to find aux device for %s\n",
+		       dev_name(&pdev->dev));
+		return -ENODEV;
+	}
+
+	core_dev = vfio_create_pci_device(pdev, &mlx5_vfio_pci_ops, aux);
+	if (IS_ERR(core_dev))
+		return PTR_ERR(core_dev);
+
+	/* Back-pointer used by mlx5_vfio_pci_aux_remove(). */
+	dev_set_drvdata(&aux->madev.adev.dev, core_dev);
+
+	return 0;
+}
+
+/*
+ * PCI remove: undo mlx5_vfio_pci_probe().
+ *
+ * Clears the drvdata back-pointer on the paired auxiliary device, if that
+ * device is still listed, and then destroys the VFIO device of @pdev.
+ */
+static void mlx5_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct mlx5_vfio_pci_adev *aux = mlx5_vfio_pci_find_adev(pdev);
+
+	/* After this the auxiliary remove callback no longer sees @pdev. */
+	if (aux != NULL)
+		dev_set_drvdata(&aux->madev.adev.dev, NULL);
+
+	vfio_destroy_pci_device(pdev);
+}
+
+#ifdef CONFIG_PCI_IOV
+/*
+ * .sriov_configure callback: currently a pass-through that hands @pdev and
+ * @nr_virtfn to vfio_pci_core_sriov_configure() and returns its result.
+ * Annotated with might_sleep() before the call.
+ */
+static int mlx5_vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+	int ret;
+
+	might_sleep();
+
+	/* DO vendor specific stuff here */
+
+	ret = vfio_pci_core_sriov_configure(pdev, nr_virtfn);
+
+	return ret;
+}
+#endif /* CONFIG_PCI_IOV */
+
+static const struct pci_error_handlers mlx5_vfio_err_handlers = { /* installed as .err_handler of mlx5_vfio_pci_driver */
+ .error_detected = vfio_pci_core_aer_err_detected, /* not defined in this file; presumably declared in vfio_pci_core.h */
+};
+
+static const struct pci_device_id mlx5_vfio_pci_table[] = { /* PCI IDs matched by mlx5_vfio_pci_driver */
+ { PCI_VDEVICE(MELLANOX, 0x6001) }, /* NVMe SNAP controllers */
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042,
+ PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers: Mellanox subsystem vendor, any subsystem device */
+ { 0, } /* zeroed terminating entry */
+};
+
+static struct pci_driver mlx5_vfio_pci_driver = { /* registered in mlx5_vfio_pci_init() */
+ .name = "mlx5-vfio-pci",
+ .id_table = mlx5_vfio_pci_table,
+ .probe = mlx5_vfio_pci_probe, /* fails with -ENODEV unless the matching aux device is already listed */
+ .remove = mlx5_vfio_pci_remove,
+#ifdef CONFIG_PCI_IOV
+ .sriov_configure = mlx5_vfio_pci_sriov_configure, /* only built when CONFIG_PCI_IOV is set */
+#endif
+ .err_handler = &mlx5_vfio_err_handlers,
+};
+
+/*
+ * Module exit: tear down in the reverse order of mlx5_vfio_pci_init().
+ *
+ * The PCI driver goes first, so mlx5_vfio_pci_remove() can still find the
+ * paired auxiliary device on aux_devs_list and clear its drvdata
+ * back-pointer before the auxiliary driver is unregistered.
+ */
+static void __exit mlx5_vfio_pci_cleanup(void)
+{
+	pci_unregister_driver(&mlx5_vfio_pci_driver);
+	auxiliary_driver_unregister(&mlx5_vfio_pci_aux_driver);
+}
+
+/*
+ * Module init: register the auxiliary driver, then the PCI driver.
+ *
+ * mlx5_vfio_pci_probe() returns -ENODEV unless the matching auxiliary device
+ * is already on aux_devs_list, and only the auxiliary probe adds entries to
+ * that list. Registering the auxiliary driver first lets auxiliary devices
+ * that already exist be listed before any PCI device is probed.
+ *
+ * Returns 0 on success, or the error of the registration that failed; a
+ * completed auxiliary registration is rolled back if the PCI one fails.
+ */
+static int __init mlx5_vfio_pci_init(void)
+{
+	int ret;
+
+	ret = auxiliary_driver_register(&mlx5_vfio_pci_aux_driver);
+	if (ret)
+		return ret;
+
+	ret = pci_register_driver(&mlx5_vfio_pci_driver);
+	if (ret)
+		goto out_unregister;
+
+	return 0;
+
+out_unregister:
+	auxiliary_driver_unregister(&mlx5_vfio_pci_aux_driver);
+	return ret;
+}
+
+module_init(mlx5_vfio_pci_init);
+module_exit(mlx5_vfio_pci_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/linux/mlx5/vfio_pci.h b/include/linux/mlx5/vfio_pci.h
new file mode 100644
index 000000000000..c1e7b4d6da30
--- /dev/null
+++ b/include/linux/mlx5/vfio_pci.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020 NVIDIA Corporation
+ */
+
+#ifndef _VFIO_PCI_H
+#define _VFIO_PCI_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/driver.h>
+
+struct mlx5_vfio_pci_adev { /* mlx5 auxiliary device wrapper used by mlx5_vfio_pci */
+ struct mlx5_adev madev; /* embedded mlx5 auxiliary device; madev_to_mvadev() maps it back to this struct */
+
+ /* These fields should not be used outside mlx5_vfio_pci.ko */
+ struct list_head entry; /* list linkage owned by mlx5_vfio_pci.ko */
+};
+
+/* Map an embedded @madev back to the mlx5_vfio_pci_adev that contains it. */
+static inline struct mlx5_vfio_pci_adev *
+madev_to_mvadev(struct mlx5_adev *madev)
+{
+	struct mlx5_vfio_pci_adev *mvadev;
+
+	mvadev = container_of(madev, struct mlx5_vfio_pci_adev, madev);
+
+	return mvadev;
+}
+
+/*
+ * Map an auxiliary device to its mlx5_vfio_pci_adev: first to the mlx5_adev
+ * that embeds @adev, then to the mlx5_vfio_pci_adev that embeds that.
+ */
+static inline struct mlx5_vfio_pci_adev *
+adev_to_mvadev(struct auxiliary_device *adev)
+{
+	return madev_to_mvadev(container_of(adev, struct mlx5_adev, adev));
+}
+
+#endif
--
2.25.4