Re: [RFC PATCH 3/5] nvme-vfio: enable the function of VFIO live migration.

From: Rao, Lei
Date: Thu Feb 09 2023 - 04:09:48 EST




On 1/19/2023 6:21 PM, Max Gurtovoy wrote:
Hi Leo,

On 06/12/2022 7:58, Lei Rao wrote:
Implement specific VFIO live migration operations for NVMe devices.

Signed-off-by: Lei Rao <lei.rao@xxxxxxxxx>
Signed-off-by: Yadong Li <yadong.li@xxxxxxxxx>
Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
Reviewed-by: Eddie Dong <eddie.dong@xxxxxxxxx>
Reviewed-by: Hang Yuan <hang.yuan@xxxxxxxxx>
---
drivers/vfio/pci/nvme/Kconfig |   5 +-
drivers/vfio/pci/nvme/nvme.c  | 543 ++++++++++++++++++++++++++++++++--
drivers/vfio/pci/nvme/nvme.h  | 111 +++++++
3 files changed, 637 insertions(+), 22 deletions(-)
create mode 100644 drivers/vfio/pci/nvme/nvme.h

diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
index c281fe154007..12e0eaba0de1 100644
--- a/drivers/vfio/pci/nvme/Kconfig
+++ b/drivers/vfio/pci/nvme/Kconfig
@@ -1,9 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
config NVME_VFIO_PCI
    tristate "VFIO support for NVMe PCI devices"
+    depends on NVME_CORE
    depends on VFIO_PCI_CORE
    help
-      This provides generic VFIO PCI support for NVMe device
-      using the VFIO framework.
+      This provides migration support for NVMe devices using the
+      VFIO framework.

      If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
index f1386d8a9287..698e470a4e53 100644
--- a/drivers/vfio/pci/nvme/nvme.c
+++ b/drivers/vfio/pci/nvme/nvme.c
@@ -13,29 +13,503 @@
#include <linux/types.h>
#include <linux/vfio.h>
#include <linux/anon_inodes.h>
-#include <linux/kernel.h>
-#include <linux/vfio_pci_core.h>
+
+#include "nvme.h"
+
+#define MAX_MIGRATION_SIZE (256 * 1024)
+
+static int nvmevf_cmd_suspend_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+    struct nvme_live_mig_command c = { };
+    int ret;
+
+    c.suspend.opcode = nvme_admin_live_mig_suspend;
+    c.suspend.vf_index = nvmevf_dev->vf_id;
+
+    ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, NULL, 0);
+    if (ret) {
+        dev_warn(&dev->dev, "Suspend virtual function failed (ret=0x%x)\n", ret);
+        return ret;
+    }
+    return 0;
+}
+
+static int nvmevf_cmd_resume_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+    struct nvme_live_mig_command c = { };
+    int ret;
+
+    c.resume.opcode = nvme_admin_live_mig_resume;
+    c.resume.vf_index = nvmevf_dev->vf_id;
+
+    ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, NULL, 0);
+    if (ret) {
+        dev_warn(&dev->dev, "Resume virtual function failed (ret=0x%x)\n", ret);
+        return ret;
+    }
+    return 0;
+}
+
+static int nvmevf_cmd_query_data_size(struct nvmevf_pci_core_device *nvmevf_dev,
+                      size_t *state_size)
+{
+    struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+    struct nvme_live_mig_command c = { };
+    size_t result;
+    int ret;
+
+    c.query.opcode = nvme_admin_live_mig_query_data_size;
+    c.query.vf_index = nvmevf_dev->vf_id;
+
+    ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, &result, NULL, 0);
+    if (ret) {
+        dev_warn(&dev->dev, "Query the states size failed (ret=0x%x)\n", ret);
+        *state_size = 0;
+        return ret;
+    }
+    *state_size = result;
+    return 0;
+}
+
+static int nvmevf_cmd_save_data(struct nvmevf_pci_core_device *nvmevf_dev,
+                    void *buffer, size_t buffer_len)
+{
+    struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+    struct nvme_live_mig_command c = { };
+    int ret;
+
+    c.save.opcode = nvme_admin_live_mig_save_data;
+    c.save.vf_index = nvmevf_dev->vf_id;
+
+    ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, buffer, buffer_len);
+    if (ret) {
+        dev_warn(&dev->dev, "Save the device states failed (ret=0x%x)\n", ret);
+        return ret;
+    }
+    return 0;
+}
+
+static int nvmevf_cmd_load_data(struct nvmevf_pci_core_device *nvmevf_dev,
+                    struct nvmevf_migration_file *migf)
+{
+    struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+    struct nvme_live_mig_command c = { };
+    int ret;
+
+    c.load.opcode = nvme_admin_live_mig_load_data;
+    c.load.vf_index = nvmevf_dev->vf_id;
+    c.load.size = migf->total_length;
+
+    ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL,
+            migf->vf_data, migf->total_length);
+    if (ret) {
+        dev_warn(&dev->dev, "Load the device states failed (ret=0x%x)\n", ret);
+        return ret;
+    }
+    return 0;
+}
+
+static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
+{
+    struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+    return container_of(core_device, struct nvmevf_pci_core_device, core_device);
+}
+
+static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
+{
+    mutex_lock(&migf->lock);
+
+    /* release the device states buffer */
+    kvfree(migf->vf_data);
+    migf->vf_data = NULL;
+    migf->disabled = true;
+    migf->total_length = 0;
+    migf->filp->f_pos = 0;
+    mutex_unlock(&migf->lock);
+}
+
+static int nvmevf_release_file(struct inode *inode, struct file *filp)
+{
+    struct nvmevf_migration_file *migf = filp->private_data;
+
+    nvmevf_disable_fd(migf);
+    mutex_destroy(&migf->lock);
+    kfree(migf);
+    return 0;
+}
+
+static ssize_t nvmevf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+    struct nvmevf_migration_file *migf = filp->private_data;
+    ssize_t done = 0;
+    int ret;
+
+    if (pos)
+        return -ESPIPE;
+    pos = &filp->f_pos;
+
+    mutex_lock(&migf->lock);
+    if (*pos > migf->total_length) {
+        done = -EINVAL;
+        goto out_unlock;
+    }
+
+    if (migf->disabled) {
+        done = -EINVAL;
+        goto out_unlock;
+    }
+
+    len = min_t(size_t, migf->total_length - *pos, len);
+    if (len) {
+        ret = copy_to_user(buf, migf->vf_data + *pos, len);
+        if (ret) {
+            done = -EFAULT;
+            goto out_unlock;
+        }
+        *pos += len;
+        done = len;
+    }
+
+out_unlock:
+    mutex_unlock(&migf->lock);
+    return done;
+}
+
+static const struct file_operations nvmevf_save_fops = {
+    .owner = THIS_MODULE,
+    .read = nvmevf_save_read,
+    .release = nvmevf_release_file,
+    .llseek = no_llseek,
+};
+
+static ssize_t nvmevf_resume_write(struct file *filp, const char __user *buf,
+                       size_t len, loff_t *pos)
+{
+    struct nvmevf_migration_file *migf = filp->private_data;
+    loff_t requested_length;
+    ssize_t done = 0;
+    int ret;
+
+    if (pos)
+        return -ESPIPE;
+    pos = &filp->f_pos;
+
+    if (*pos < 0 || check_add_overflow((loff_t)len, *pos, &requested_length))
+        return -EINVAL;
+
+    if (requested_length > MAX_MIGRATION_SIZE)
+        return -ENOMEM;
+    mutex_lock(&migf->lock);
+    if (migf->disabled) {
+        done = -ENODEV;
+        goto out_unlock;
+    }
+
+    ret = copy_from_user(migf->vf_data + *pos, buf, len);
+    if (ret) {
+        done = -EFAULT;
+        goto out_unlock;
+    }
+    *pos += len;
+    done = len;
+    migf->total_length += len;
+
+out_unlock:
+    mutex_unlock(&migf->lock);
+    return done;
+}
+
+static const struct file_operations nvmevf_resume_fops = {
+    .owner = THIS_MODULE,
+    .write = nvmevf_resume_write,
+    .release = nvmevf_release_file,
+    .llseek = no_llseek,
+};
+
+static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    if (nvmevf_dev->resuming_migf) {
+        nvmevf_disable_fd(nvmevf_dev->resuming_migf);
+        fput(nvmevf_dev->resuming_migf->filp);
+        nvmevf_dev->resuming_migf = NULL;
+    }
+
+    if (nvmevf_dev->saving_migf) {
+        nvmevf_disable_fd(nvmevf_dev->saving_migf);
+        fput(nvmevf_dev->saving_migf->filp);
+        nvmevf_dev->saving_migf = NULL;
+    }
+}
+
+static struct nvmevf_migration_file *
+nvmevf_pci_resume_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    struct nvmevf_migration_file *migf;
+    int ret;
+
+    migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+    if (!migf)
+        return ERR_PTR(-ENOMEM);
+
+    migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_resume_fops, migf,
+                    O_WRONLY);
+    if (IS_ERR(migf->filp)) {
+        int err = PTR_ERR(migf->filp);
+
+        kfree(migf);
+        return ERR_PTR(err);
+    }
+    stream_open(migf->filp->f_inode, migf->filp);
+    mutex_init(&migf->lock);
+
+    /* Allocate buffer to load the device states and the max states is 256K */
+    migf->vf_data = kvzalloc(MAX_MIGRATION_SIZE, GFP_KERNEL);
+    if (!migf->vf_data) {
+        ret = -ENOMEM;
+        goto out_free;
+    }
+
+    return migf;
+
+out_free:
+    fput(migf->filp);
+    return ERR_PTR(ret);
+}
+
+static struct nvmevf_migration_file *
+nvmevf_pci_save_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    struct nvmevf_migration_file *migf;
+    int ret;
+
+    migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+    if (!migf)
+        return ERR_PTR(-ENOMEM);
+
+    migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_save_fops, migf,
+                    O_RDONLY);
+    if (IS_ERR(migf->filp)) {
+        int err = PTR_ERR(migf->filp);
+
+        kfree(migf);
+        return ERR_PTR(err);
+    }
+
+    stream_open(migf->filp->f_inode, migf->filp);
+    mutex_init(&migf->lock);
+
+    ret = nvmevf_cmd_query_data_size(nvmevf_dev, &migf->total_length);
+    if (ret)
+        goto out_free;
+    /* Allocate buffer and save the device states*/
+    migf->vf_data = kvzalloc(migf->total_length, GFP_KERNEL);
+    if (!migf->vf_data) {
+        ret = -ENOMEM;
+        goto out_free;
+    }
+
+    ret = nvmevf_cmd_save_data(nvmevf_dev, migf->vf_data, migf->total_length);
+    if (ret)
+        goto out_free;
+
+    return migf;
+out_free:
+    fput(migf->filp);
+    return ERR_PTR(ret);
+}
+
+static struct file *
+nvmevf_pci_step_device_state_locked(struct nvmevf_pci_core_device *nvmevf_dev, u32 new)
+{
+    u32 cur = nvmevf_dev->mig_state;
+    int ret;
+
+    if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
+        ret = nvmevf_cmd_suspend_device(nvmevf_dev);
+        if (ret)
+            return ERR_PTR(ret);
+        return NULL;
+    }
+
+    if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+        struct nvmevf_migration_file *migf;
+
+        migf = nvmevf_pci_save_device_data(nvmevf_dev);
+        if (IS_ERR(migf))
+            return ERR_CAST(migf);
+        get_file(migf->filp);
+        nvmevf_dev->saving_migf = migf;
+        return migf->filp;
+    }
+
+    if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+        nvmevf_disable_fds(nvmevf_dev);
+        return NULL;
+    }
+
+    if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+        struct nvmevf_migration_file *migf;
+
+        migf = nvmevf_pci_resume_device_data(nvmevf_dev);
+        if (IS_ERR(migf))
+            return ERR_CAST(migf);
+        get_file(migf->filp);
+        nvmevf_dev->resuming_migf = migf;
+        return migf->filp;
+    }
+
+    if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+        ret = nvmevf_cmd_load_data(nvmevf_dev, nvmevf_dev->resuming_migf);
+        if (ret)
+            return ERR_PTR(ret);
+        nvmevf_disable_fds(nvmevf_dev);
+        return NULL;
+    }
+
+    if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) {
+        nvmevf_cmd_resume_device(nvmevf_dev);
+        return NULL;
+    }
+
+    /* vfio_mig_get_next_state() does not use arcs other than the above */
+    WARN_ON(true);
+    return ERR_PTR(-EINVAL);
+}
+
+static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+again:
+    spin_lock(&nvmevf_dev->reset_lock);
+    if (nvmevf_dev->deferred_reset) {
+        nvmevf_dev->deferred_reset = false;
+        spin_unlock(&nvmevf_dev->reset_lock);
+        nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+        nvmevf_disable_fds(nvmevf_dev);
+        goto again;
+    }
+    mutex_unlock(&nvmevf_dev->state_mutex);
+    spin_unlock(&nvmevf_dev->reset_lock);
+}
+
+static struct file *
+nvmevf_pci_set_device_state(struct vfio_device *vdev, enum vfio_device_mig_state new_state)
+{
+    struct nvmevf_pci_core_device *nvmevf_dev = container_of(vdev,
+            struct nvmevf_pci_core_device, core_device.vdev);
+    enum vfio_device_mig_state next_state;
+    struct file *res = NULL;
+    int ret;
+
+    mutex_lock(&nvmevf_dev->state_mutex);
+    while (new_state != nvmevf_dev->mig_state) {
+        ret = vfio_mig_get_next_state(vdev, nvmevf_dev->mig_state, new_state, &next_state);
+        if (ret) {
+            res = ERR_PTR(-EINVAL);
+            break;
+        }
+
+        res = nvmevf_pci_step_device_state_locked(nvmevf_dev, next_state);
+        if (IS_ERR(res))
+            break;
+        nvmevf_dev->mig_state = next_state;
+        if (WARN_ON(res && new_state != nvmevf_dev->mig_state)) {
+            fput(res);
+            res = ERR_PTR(-EINVAL);
+            break;
+        }
+    }
+    nvmevf_state_mutex_unlock(nvmevf_dev);
+    return res;
+}
+
+static int nvmevf_pci_get_device_state(struct vfio_device *vdev,
+                       enum vfio_device_mig_state *curr_state)
+{
+    struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+            vdev, struct nvmevf_pci_core_device, core_device.vdev);
+
+    mutex_lock(&nvmevf_dev->state_mutex);
+    *curr_state = nvmevf_dev->mig_state;
+    nvmevf_state_mutex_unlock(nvmevf_dev);
+    return 0;
+}

static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
{
-    struct vfio_pci_core_device *vdev =
-        container_of(core_vdev, struct vfio_pci_core_device, vdev);
+    struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+            core_vdev, struct nvmevf_pci_core_device, core_device.vdev);
+    struct vfio_pci_core_device *vdev = &nvmevf_dev->core_device;
    int ret;

    ret = vfio_pci_core_enable(vdev);
    if (ret)
        return ret;

+    if (nvmevf_dev->migrate_cap)
+        nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
    vfio_pci_core_finish_enable(vdev);
    return 0;
}

+static void nvmevf_cmd_close_migratable(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+    if (!nvmevf_dev->migrate_cap)
+        return;
+
+    mutex_lock(&nvmevf_dev->state_mutex);
+    nvmevf_disable_fds(nvmevf_dev);
+    nvmevf_state_mutex_unlock(nvmevf_dev);
+}
+
+static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
+{
+    struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+            core_vdev, struct nvmevf_pci_core_device, core_device.vdev);
+
+    nvmevf_cmd_close_migratable(nvmevf_dev);
+    vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops nvmevf_pci_mig_ops = {
+    .migration_set_state = nvmevf_pci_set_device_state,
+    .migration_get_state = nvmevf_pci_get_device_state,
+};
+
+static int nvmevf_migration_init_dev(struct vfio_device *core_vdev)
+{
+    struct nvmevf_pci_core_device *nvmevf_dev = container_of(core_vdev,
+                    struct nvmevf_pci_core_device, core_device.vdev);
+    struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+    int vf_id;
+    int ret = -1;
+
+    if (!pdev->is_virtfn)
+        return ret;
+
+    nvmevf_dev->migrate_cap = 1;
+
+    vf_id = pci_iov_vf_id(pdev);
+    if (vf_id < 0)
+        return ret;
+    nvmevf_dev->vf_id = vf_id + 1;
+    core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY;
+
+    mutex_init(&nvmevf_dev->state_mutex);
+    spin_lock_init(&nvmevf_dev->reset_lock);
+    core_vdev->mig_ops = &nvmevf_pci_mig_ops;
+
+    return vfio_pci_core_init_dev(core_vdev);
+}
+
static const struct vfio_device_ops nvmevf_pci_ops = {
    .name = "nvme-vfio-pci",
-    .init = vfio_pci_core_init_dev,
+    .init = nvmevf_migration_init_dev,
    .release = vfio_pci_core_release_dev,
    .open_device = nvmevf_pci_open_device,
-    .close_device = vfio_pci_core_close_device,
+    .close_device = nvmevf_pci_close_device,
    .ioctl = vfio_pci_core_ioctl,
    .device_feature = vfio_pci_core_ioctl_feature,
    .read = vfio_pci_core_read,
@@ -47,32 +521,56 @@ static const struct vfio_device_ops nvmevf_pci_ops = {

static int nvmevf_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
-    struct vfio_pci_core_device *vdev;
+    struct nvmevf_pci_core_device *nvmevf_dev;
    int ret;

-    vdev = vfio_alloc_device(vfio_pci_core_device, vdev, &pdev->dev,
-                &nvmevf_pci_ops);
-    if (IS_ERR(vdev))
-        return PTR_ERR(vdev);
+    nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
+                    &pdev->dev, &nvmevf_pci_ops);
+    if (IS_ERR(nvmevf_dev))
+        return PTR_ERR(nvmevf_dev);

-    dev_set_drvdata(&pdev->dev, vdev);
-    ret = vfio_pci_core_register_device(vdev);
+    dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
+    ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
    if (ret)
        goto out_put_dev;
-
    return 0;

out_put_dev:
-    vfio_put_device(&vdev->vdev);
+    vfio_put_device(&nvmevf_dev->core_device.vdev);
    return ret;
+
}

static void nvmevf_pci_remove(struct pci_dev *pdev)
{
-    struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+    struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
+    vfio_put_device(&nvmevf_dev->core_device.vdev);
+}
+
+static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+    struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+    if (!nvmevf_dev->migrate_cap)
+        return;

-    vfio_pci_core_unregister_device(vdev);
-    vfio_put_device(&vdev->vdev);
+    /*
+     * As the higher VFIO layers are holding locks across reset and using
+     * those same locks with the mm_lock we need to prevent ABBA deadlock
+     * with the state_mutex and mm_lock.
+     * In case the state_mutex was taken already we defer the cleanup work
+     * to the unlock flow of the other running context.
+     */
+    spin_lock(&nvmevf_dev->reset_lock);
+    nvmevf_dev->deferred_reset = true;
+    if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
+        spin_unlock(&nvmevf_dev->reset_lock);
+        return;
+    }
+    spin_unlock(&nvmevf_dev->reset_lock);
+    nvmevf_state_mutex_unlock(nvmevf_dev);
}

static const struct pci_device_id nvmevf_pci_table[] = {
@@ -83,12 +581,17 @@ static const struct pci_device_id nvmevf_pci_table[] = {

MODULE_DEVICE_TABLE(pci, nvmevf_pci_table);

+static const struct pci_error_handlers nvmevf_err_handlers = {
+    .reset_done = nvmevf_pci_aer_reset_done,
+    .error_detected = vfio_pci_core_aer_err_detected,
+};
+
static struct pci_driver nvmevf_pci_driver = {
    .name = KBUILD_MODNAME,
    .id_table = nvmevf_pci_table,
    .probe = nvmevf_pci_probe,
    .remove = nvmevf_pci_remove,
-    .err_handler = &vfio_pci_core_err_handlers,
+    .err_handler = &nvmevf_err_handlers,
    .driver_managed_dma = true,
};

@@ -96,4 +599,4 @@ module_pci_driver(nvmevf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lei Rao <lei.rao@xxxxxxxxx>");
-MODULE_DESCRIPTION("NVMe VFIO PCI - Generic VFIO PCI driver for NVMe");
+MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");
diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
new file mode 100644
index 000000000000..c8464554ef53
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ */
+
+#ifndef NVME_VFIO_PCI_H
+#define NVME_VFIO_PCI_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/nvme.h>
+
+struct nvme_live_mig_query_size {
+    __u8    opcode;
+    __u8    flags;
+    __u16    command_id;
+    __u32    rsvd1[9];
+    __u16    vf_index;
+    __u16    rsvd2;
+    __u32    rsvd3[5];
+};
+
+struct nvme_live_mig_suspend {
+    __u8    opcode;
+    __u8    flags;
+    __u16    command_id;
+    __u32    rsvd1[9];
+    __u16    vf_index;
+    __u16    rsvd2;
+    __u32    rsvd3[5];
+};
+
+struct nvme_live_mig_resume {
+    __u8    opcode;
+    __u8    flags;
+    __u16   command_id;
+    __u32   rsvd1[9];
+    __u16   vf_index;
+    __u16   rsvd2;
+    __u32   rsvd3[5];
+};
+
+struct nvme_live_mig_save_data {
+    __u8    opcode;
+    __u8    flags;
+    __u16    command_id;
+    __u32    rsvd1[5];
+    __le64    prp1;
+    __le64    prp2;
+    __u16    vf_index;
+    __u16    rsvd2;
+    __u32    rsvd3[5];
+};

Just noticed that the save_data (similar to READ operation) doesn't have size/length member.
How does it work for you ?

Sorry for late reply due to the CNY.
Before the save_data, it will send the Query command to get the data size, and then the driver
will allocate the data buffer used to receive the device state through DMA. That means both the
controller and Host driver know the device state's size.


BTW,

Are you doing iterative reads from the device to get the state ?

Yes, in theory, iterative reads are needed.

The size of the device state is related to the number of queue pairs.
For NVMe, the states of one queue pair that needs to be migrated should have only dozens of bytes.
As we know, NVMe can support a maximum number of 64K IO queues, but the actual hardware
can support far less than 64K queues. Therefore, without considering DMA dirty pages, the size
of the device state is small. Practically one read should be able to retrieve all the device states.

Actually, this is also related to maximum data transfer size (MDTS). If the device state size
exceeds MDTS, it must be read iteratively. If needed, we can add a command field to support iteration.

Lei.


+
+struct nvme_live_mig_load_data {
+    __u8    opcode;
+    __u8    flags;
+    __u16   command_id;
+    __u32   rsvd1[5];
+    __le64  prp1;
+    __le64  prp2;
+    __u16   vf_index;
+    __u16    rsvd2;
+    __u32    size;
+    __u32   rsvd3[4];
+};
+
+enum nvme_live_mig_admin_opcode {
+    nvme_admin_live_mig_query_data_size    = 0xC4,
+    nvme_admin_live_mig_suspend        = 0xC8,
+    nvme_admin_live_mig_resume        = 0xCC,
+    nvme_admin_live_mig_save_data        = 0xD2,
+    nvme_admin_live_mig_load_data        = 0xD5,
+};
+
+struct nvme_live_mig_command {
+    union {
+        struct nvme_live_mig_query_size query;
+        struct nvme_live_mig_suspend    suspend;
+        struct nvme_live_mig_resume    resume;
+        struct nvme_live_mig_save_data    save;
+        struct nvme_live_mig_load_data    load;
+    };
+};
+
+struct nvmevf_migration_file {
+    struct file *filp;
+    struct mutex lock;
+    bool disabled;
+    u8 *vf_data;
+    size_t total_length;
+};
+
+struct nvmevf_pci_core_device {
+    struct vfio_pci_core_device core_device;
+    int vf_id;
+    u8 migrate_cap:1;
+    u8 deferred_reset:1;
+    /* protect migration state */
+    struct mutex state_mutex;
+    enum vfio_device_mig_state mig_state;
+    /* protect the reset_done flow */
+    spinlock_t reset_lock;
+    struct nvmevf_migration_file *resuming_migf;
+    struct nvmevf_migration_file *saving_migf;
+};
+
+extern int nvme_submit_vf_cmd(struct pci_dev *dev, struct nvme_command *cmd,
+            size_t *result, void *buffer, unsigned int bufflen);
+
+#endif /* NVME_VFIO_PCI_H */