Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.vdpa_state is set EF100_VDPA_STATE_STARTED during DRIVER_OK handling. See my later response for its purpose.
On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@xxxxxxx> wrote:
vDPA config opertions to handle get/set device status and deviceIt looks to me that this duplicates with the DRIVER_OK status bit.
reset have been implemented. Also .suspend config operation is
implemented to support Live Migration.
Signed-off-by: Gautam Dawar <gautam.dawar@xxxxxxx>
---
drivers/net/ethernet/sfc/ef100_vdpa.c | 16 +-
drivers/net/ethernet/sfc/ef100_vdpa.h | 2 +
drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367 ++++++++++++++++++++--
3 files changed, 355 insertions(+), 30 deletions(-)
diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
index c66e5aef69ea..4ba57827a6cd 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa.c
+++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
@@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
static void ef100_vdpa_delete(struct efx_nic *efx)
{
+ struct vdpa_device *vdpa_dev;
+
if (efx->vdpa_nic) {
+ vdpa_dev = &efx->vdpa_nic->vdpa_dev;
+ ef100_vdpa_reset(vdpa_dev);
+
/* replace with _vdpa_unregister_device later */
- put_device(&efx->vdpa_nic->vdpa_dev.dev);
+ put_device(&vdpa_dev->dev);
}
efx_mcdi_free_vis(efx);
}
@@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
}
}
+ rc = devm_add_action_or_reset(&efx->pci_dev->dev,
+ ef100_vdpa_irq_vectors_free,
+ efx->pci_dev);
+ if (rc) {
+ pci_err(efx->pci_dev,
+ "Failed adding devres for freeing irq vectors\n");
+ goto err_put_device;
+ }
+
rc = get_net_config(vdpa_nic);
if (rc)
goto err_put_device;
diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
index 348ca8a7404b..58791402e454 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa.h
+++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
@@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
void ef100_vdpa_irq_vectors_free(void *data);
int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
+void ef100_vdpa_irq_vectors_free(void *data);
+int ef100_vdpa_reset(struct vdpa_device *vdev);
static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
{
diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
index 0051c4c0e47c..95a2177f85a2 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
+++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
@@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
}
-void ef100_vdpa_irq_vectors_free(void *data)
-{
- pci_free_irq_vectors(data);
-}
-
static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
{
struct efx_vring_ctx *vring_ctx;
@@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
vdpa_nic->vring[idx].vring_ctx = NULL;
}
-static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
-{
- vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
- vdpa_nic->vring[idx].vring_state = 0;
- vdpa_nic->vring[idx].last_avail_idx = 0;
- vdpa_nic->vring[idx].last_used_idx = 0;
-}
-
int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
{
u32 offset;
@@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
return false;
}
+static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
+ struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
+
+ devm_free_irq(&pci_dev->dev, vring->irq, vring);
+ vring->irq = -EINVAL;
+}
+
+static irqreturn_t vring_intr_handler(int irq, void *arg)
+{
+ struct ef100_vdpa_vring_info *vring = arg;
+
+ if (vring->cb.callback)
+ return vring->cb.callback(vring->cb.private);
+
+ return IRQ_NONE;
+}
+
+static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
+{
+ int rc;
+
+ rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
+ if (rc < 0)
+ pci_err(pci_dev,
+ "Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
+ return rc;
+}
+
+void ef100_vdpa_irq_vectors_free(void *data)
+{
+ pci_free_irq_vectors(data);
+}
+
+static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
+ struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
+ int irq;
+ int rc;
+
+ snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
+ pci_name(pci_dev), idx);
+ irq = pci_irq_vector(pci_dev, idx);
+ rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
+ vring->msix_name, vring);
+ if (rc)
+ pci_err(pci_dev,
+ "devm_request_irq failed for vring %d, rc %d\n",
+ idx, rc);
+ else
+ vring->irq = irq;
+
+ return rc;
+}
+
+static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ struct efx_vring_dyn_cfg vring_dyn_cfg;
+ int rc;
+
+ if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+ return 0;
+
+ rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
+ &vring_dyn_cfg);
+ if (rc)
+ dev_err(&vdpa_nic->vdpa_dev.dev,
+ "%s: delete vring failed index:%u, err:%d\n",
+ __func__, idx, rc);
+ vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
+ vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
+ vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
+
+ irq_vring_fini(vdpa_nic, idx);
+
+ return rc;
+}
+
+static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
+{
+ struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+ u32 idx_val;
+
+ if (is_qid_invalid(vdpa_nic, idx, __func__))
+ return;
+
+ if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+ return;
+
+ idx_val = idx;
+ _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
+ vdpa_nic->vring[idx].doorbell_offset);
+}
+
+static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
+ vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
+ !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+ return true;
+
+ return false;
+}
+
+static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ struct efx_vring_dyn_cfg vring_dyn_cfg;
+ struct efx_vring_cfg vring_cfg;
+ int rc;
+
+ rc = irq_vring_init(vdpa_nic, idx);
+ if (rc) {
+ dev_err(&vdpa_nic->vdpa_dev.dev,
+ "%s: irq_vring_init failed. index:%u, err:%d\n",
+ __func__, idx, rc);
+ return rc;
+ }
+ vring_cfg.desc = vdpa_nic->vring[idx].desc;
+ vring_cfg.avail = vdpa_nic->vring[idx].avail;
+ vring_cfg.used = vdpa_nic->vring[idx].used;
+ vring_cfg.size = vdpa_nic->vring[idx].size;
+ vring_cfg.features = vdpa_nic->features;
+ vring_cfg.msix_vector = idx;
+ vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
+ vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
+
+ rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
+ &vring_cfg, &vring_dyn_cfg);
+ if (rc) {
+ dev_err(&vdpa_nic->vdpa_dev.dev,
+ "%s: vring_create failed index:%u, err:%d\n",
+ __func__, idx, rc);
+ goto err_vring_create;
+ }
+ vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
+
+ /* A VQ kick allows the device to read the avail_idx, which will be
+ * required at the destination after live migration.
+ */
+ ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
+
+ return 0;
+
+err_vring_create:
+ irq_vring_fini(vdpa_nic, idx);
+ return rc;
+}
+
+static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+ delete_vring(vdpa_nic, idx);
+ vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
+ vdpa_nic->vring[idx].vring_state = 0;
+ vdpa_nic->vring[idx].last_avail_idx = 0;
+ vdpa_nic->vring[idx].last_used_idx = 0;
+}
+
+static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
+{
+ int i;
+
+ WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
+
+ if (!vdpa_nic->status)
+ return;
+
+ vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
+ vdpa_nic->status = 0;
+ vdpa_nic->features = 0;
+ for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
+ reset_vring(vdpa_nic, i);
+ ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
+}
+
+/* May be called under the rtnl lock */
+int ef100_vdpa_reset(struct vdpa_device *vdev)
+{
+ struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+
+ /* vdpa device can be deleted anytime but the bar_config
+ * could still be vdpa and hence efx->state would be STATE_VDPA.
+ * Accordingly, ensure vdpa device exists before reset handling
+ */
+ if (!vdpa_nic)
+ return -ENODEV;
+
+ mutex_lock(&vdpa_nic->lock);
+ ef100_reset_vdpa_device(vdpa_nic);
+ mutex_unlock(&vdpa_nic->lock);
+ return 0;
+}
+
+static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
+{
+ struct efx_nic *efx = vdpa_nic->efx;
+ struct ef100_nic_data *nic_data;
+ int i, j;
+ int rc;
+
+ nic_data = efx->nic_data;
+ rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
+ vdpa_nic->max_queue_pairs * 2);
+ if (rc < 0) {
+ pci_err(efx->pci_dev,
+ "vDPA IRQ alloc failed for vf: %u err:%d\n",
+ nic_data->vf_index, rc);
+ return rc;
+ }
+
+ for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
+ if (can_create_vring(vdpa_nic, i)) {
+ rc = create_vring(vdpa_nic, i);
+ if (rc)
+ goto clear_vring;
+ }
+ }
+
+ vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
+ return 0;It's better not to work around userspace bugs in the kernel. We should
+
+clear_vring:
+ for (j = 0; j < i; j++)
+ delete_vring(vdpa_nic, j);
+
+ ef100_vdpa_irq_vectors_free(efx->pci_dev);
+ return rc;
+}
+
static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
u16 idx, u64 desc_area, u64 driver_area,
u64 device_area)
@@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
mutex_unlock(&vdpa_nic->lock);
}
-static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
-{
- struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
- u32 idx_val;
-
- if (is_qid_invalid(vdpa_nic, idx, __func__))
- return;
-
- if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
- return;
-
- idx_val = idx;
- _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
- vdpa_nic->vring[idx].doorbell_offset);
-}
-
static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
struct vdpa_callback *cb)
{
@@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
bool ready)
{
struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+ int rc;
if (is_qid_invalid(vdpa_nic, idx, __func__))
return;
@@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
if (ready) {
vdpa_nic->vring[idx].vring_state |=
EF100_VRING_READY_CONFIGURED;
+ if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
+ can_create_vring(vdpa_nic, idx)) {
+ rc = create_vring(vdpa_nic, idx);
+ if (rc)
+ /* Rollback ready configuration
+ * So that the above layer driver
+ * can make another attempt to set ready
+ */
+ vdpa_nic->vring[idx].vring_state &=
+ ~EF100_VRING_READY_CONFIGURED;
+ }
} else {
vdpa_nic->vring[idx].vring_state &=
~EF100_VRING_READY_CONFIGURED;
+ delete_vring(vdpa_nic, idx);
}
mutex_unlock(&vdpa_nic->lock);
}
@@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
}
features |= BIT_ULL(VIRTIO_NET_F_MAC);
+ /* As QEMU SVQ doesn't implement the following features,
+ * masking them off to allow Live Migration
+ */
+ features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
+ features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
fix Qemu instead.
Will change.
+This is trigger-able by the userspace. It might be better to use
return features;
}
@@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
return EF100_VDPA_VENDOR_ID;
}
+static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
+{
+ struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+ u8 status;
+
+ mutex_lock(&vdpa_nic->lock);
+ status = vdpa_nic->status;
+ mutex_unlock(&vdpa_nic->lock);
+ return status;
+}
+
+static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
+{
+ struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+ u8 new_status;
+ int rc;
+
+ mutex_lock(&vdpa_nic->lock);
+ if (!status) {
+ dev_info(&vdev->dev,
+ "%s: Status received is 0. Device reset being done\n",
+ __func__);
dev_dbg() instead.
Ok.
+ ef100_reset_vdpa_device(vdpa_nic);Same here.
+ goto unlock_return;
+ }
+ new_status = status & ~vdpa_nic->status;
+ if (new_status == 0) {
+ dev_info(&vdev->dev,
+ "%s: New status same as current status\n", __func__);
vdpa_state helps to ensure correct status transitions in the .set_status callback and safe-guards against incorrect/malicious user-space driver.
+ goto unlock_return;It might be better to explain the reason we need to track another
+ }
+ if (new_status & VIRTIO_CONFIG_S_FAILED) {
+ ef100_reset_vdpa_device(vdpa_nic);
+ goto unlock_return;
+ }
+
+ if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
+ vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
+ new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
+ }
+ if (new_status & VIRTIO_CONFIG_S_DRIVER) {
+ vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
+ new_status &= ~VIRTIO_CONFIG_S_DRIVER;
+ }
+ if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
+ vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
+ vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
state in vdpa_state instead of simply using the device status.
Are you referring to the possibility of updating device configuration (eg. MAC address) using .set_config() after suspend operation? Is there any other user triggered operation that falls in this category?
+ new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;Note that the suspension matters for the whole device. It means the
+ }
+ if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
+ vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
+ vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
+ rc = start_vdpa_device(vdpa_nic);
+ if (rc) {
+ dev_err(&vdpa_nic->vdpa_dev.dev,
+ "%s: vDPA device failed:%d\n", __func__, rc);
+ vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
+ goto unlock_return;
+ }
+ new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
+ }
+ if (new_status) {
+ dev_warn(&vdev->dev,
+ "%s: Mismatch Status: %x & State: %u\n",
+ __func__, new_status, vdpa_nic->vdpa_state);
+ }
+
+unlock_return:
+ mutex_unlock(&vdpa_nic->lock);
+}
+
static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
{
return sizeof(struct virtio_net_config);
@@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
vdpa_nic->mac_configured = true;
}
+static int ef100_vdpa_suspend(struct vdpa_device *vdev)
+{
+ struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+ int i, rc;
+
+ mutex_lock(&vdpa_nic->lock);
+ for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
+ rc = delete_vring(vdpa_nic, i);
config space should not be changed. But the code here only suspends
the vring, is this intende/d?
Could you pls elaborate on the requirement during device reset?
Reset may have the same issue.
[1] https://patchew.org/QEMU/20230213191929.1547497-1-eperezma@xxxxxxxxxx/
Thanks
+ if (rc)
+ break;
+ }
+ mutex_unlock(&vdpa_nic->lock);
+ return rc;
+}
static void ef100_vdpa_free(struct vdpa_device *vdev)
{
struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
@@ -428,9 +733,13 @@ const struct vdpa_config_ops ef100_vdpa_config_ops = {
.get_vq_num_max = ef100_vdpa_get_vq_num_max,
.get_device_id = ef100_vdpa_get_device_id,
.get_vendor_id = ef100_vdpa_get_vendor_id,
+ .get_status = ef100_vdpa_get_status,
+ .set_status = ef100_vdpa_set_status,
+ .reset = ef100_vdpa_reset,
.get_config_size = ef100_vdpa_get_config_size,
.get_config = ef100_vdpa_get_config,
.set_config = ef100_vdpa_set_config,
.get_generation = NULL,
+ .suspend = ef100_vdpa_suspend,
.free = ef100_vdpa_free,
};
--
2.30.1