Re: [RFC 2/2] vhost: IFC VF vdpa layer

From: Zhu, Lingshan
Date: Wed Oct 23 2019 - 05:24:38 EST



On 10/22/2019 2:53 PM, Zhu Lingshan wrote:

On 10/21/2019 6:19 PM, Jason Wang wrote:

On 2019/10/21 5:53, Zhu, Lingshan wrote:

On 10/16/2019 6:19 PM, Jason Wang wrote:

On 2019/10/16 9:30, Zhu Lingshan wrote:
This commit introduces IFC VF operations for vdpa, which comply with the
vhost_mdev interfaces and handle IFC VF initialization,
configuration and removal.

Signed-off-by: Zhu Lingshan <lingshan.zhu@xxxxxxxxx>
---
 drivers/vhost/ifcvf/ifcvf_main.c | 541 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 541 insertions(+)
 create mode 100644 drivers/vhost/ifcvf/ifcvf_main.c

diff --git a/drivers/vhost/ifcvf/ifcvf_main.c b/drivers/vhost/ifcvf/ifcvf_main.c
new file mode 100644
index 000000000000..c48a29969a85
--- /dev/null
+++ b/drivers/vhost/ifcvf/ifcvf_main.c
@@ -0,0 +1,541 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 Intel Corporation.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include "ifcvf_base.h"
+
+#define VERSION_STRING	"0.1"
+#define DRIVER_AUTHOR	"Intel Corporation"
+#define IFCVF_DRIVER_NAME	"ifcvf"
+
+static irqreturn_t ifcvf_intr_handler(int irq, void *arg)
+{
+	struct vring_info *vring = arg;
+
+	if (vring->cb.callback)
+		return vring->cb.callback(vring->cb.private);
+
+	return IRQ_HANDLED;
+}
+
+static u64 ifcvf_mdev_get_features(struct mdev_device *mdev)
+{
+	return IFC_SUPPORTED_FEATURES;


I would expect this to be done by querying the hw. Or can't the IFC VF get any updates through its firmware?

Hi Jason,

Thanks for your comments. For now the driver just supports these features.


OK, it should work but it is less flexible; we can change it in the future.
sure!
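
A possible sketch of querying the device instead (assuming a helper such as ifcvf_get_features() in ifcvf_base that reads the feature bits from the device's common config space; the name is illustrative, not part of this patch):

static u64 ifcvf_mdev_get_features(struct mdev_device *mdev)
{
	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);

	/* read device_feature from the device, then mask with the
	 * driver's supported set so we never advertise more than the
	 * driver can handle
	 */
	return ifcvf_get_features(vf) & IFC_SUPPORTED_FEATURES;
}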





+}
+
+static int ifcvf_mdev_set_features(struct mdev_device *mdev, u64 features)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->req_features = features;
+
+	return 0;
+}
+
+static u64 ifcvf_mdev_get_vq_state(struct mdev_device *mdev, u16 qid)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	return vf->vring[qid].last_avail_idx;


Does this really work? I'd expect it to be fetched from the hw since it's an internal state.
For now it's working; we intend to support LM in the next version of the driver.


I'm not sure I understand here: I don't see any synchronization between the hardware and last_avail_idx, so last_avail_idx should not change.

Btw, what does "LM" mean :) ?

I can add BAR IO operations here. LM = live migration, sorry for the abbreviation.
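
For example, a minimal sketch of what that could look like (the ifcvf_get_vq_state() helper and the idea of a live-migration BAR region are illustrative, not part of this patch):

static u64 ifcvf_mdev_get_vq_state(struct mdev_device *mdev, u16 qid)
{
	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);

	/* the helper would read the avail index for qid from the
	 * device via a BAR access instead of returning the cached
	 * value, which can go stale
	 */
	return ifcvf_get_vq_state(vf, qid);
}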





+}
+
+static int ifcvf_mdev_set_vq_state(struct mdev_device *mdev, u16 qid, u64 num)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->vring[qid].last_used_idx = num;


I fail to understand why last_used_idx is needed. It looks to me the used idx in the used ring is sufficient.
I will remove it.


+	vf->vring[qid].last_avail_idx = num;


Do we need a synchronization with hw immediately here?


+
+	return 0;
+}
+
+static int ifcvf_mdev_set_vq_address(struct mdev_device *mdev, u16 idx,
+				     u64 desc_area, u64 driver_area,
+				     u64 device_area)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->vring[idx].desc = desc_area;
+	vf->vring[idx].avail = driver_area;
+	vf->vring[idx].used = device_area;
+
+	return 0;
+}
+
+static void ifcvf_mdev_set_vq_num(struct mdev_device *mdev, u16 qid, u32 num)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->vring[qid].size = num;
+}
+
+static void ifcvf_mdev_set_vq_ready(struct mdev_device *mdev,
+				    u16 qid, bool ready)
+{
+
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->vring[qid].ready = ready;
+}
+
+static bool ifcvf_mdev_get_vq_ready(struct mdev_device *mdev, u16 qid)
+{
+
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	return vf->vring[qid].ready;
+}
+
+static void ifcvf_mdev_set_vq_cb(struct mdev_device *mdev, u16 idx,
+				 struct virtio_mdev_callback *cb)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->vring[idx].cb = *cb;
+}
+
+static void ifcvf_mdev_kick_vq(struct mdev_device *mdev, u16 idx)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	ifcvf_notify_queue(vf, idx);
+}
+
+static u8 ifcvf_mdev_get_status(struct mdev_device *mdev)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	return vf->status;
+}
+
+static u32 ifcvf_mdev_get_generation(struct mdev_device *mdev)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	return vf->generation;
+}
+
+static int ifcvf_mdev_get_version(struct mdev_device *mdev)
+{
+	return VIRTIO_MDEV_VERSION;
+}
+
+static u32 ifcvf_mdev_get_device_id(struct mdev_device *mdev)
+{
+	return IFCVF_DEVICE_ID;
+}
+
+static u32 ifcvf_mdev_get_vendor_id(struct mdev_device *mdev)
+{
+	return IFCVF_VENDOR_ID;
+}
+
+static u16 ifcvf_mdev_get_vq_align(struct mdev_device *mdev)
+{
+	return IFCVF_QUEUE_ALIGNMENT;
+}
+
+static int ifcvf_start_datapath(void *private)
+{
+	int i, ret;
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(private);
+
+	for (i = 0; i < (IFCVF_MAX_QUEUE_PAIRS * 2); i++) {
+		if (!vf->vring[i].ready)
+			break;


Looks like an error should be returned here?
agreed!


+
+
+		if (!vf->vring[i].size)
+			break;
+
+		if (!vf->vring[i].desc || !vf->vring[i].avail ||
+		    !vf->vring[i].used)
+			break;
+	}
+	vf->nr_vring = i;
+
+	ret = ifcvf_start_hw(vf);
+	return ret;
+}
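
One possible rework of the loop above: keep a not-ready vq as the end-of-list marker, but fail on a ready vq that is not fully configured instead of silently truncating (a sketch only; whether -EINVAL is the right error code is up for discussion):

static int ifcvf_start_datapath(void *private)
{
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(private);
	int i;

	for (i = 0; i < (IFCVF_MAX_QUEUE_PAIRS * 2); i++) {
		/* a not-ready vq marks the end of the configured queues */
		if (!vf->vring[i].ready)
			break;

		/* a ready vq with an incomplete setup is an error */
		if (!vf->vring[i].size || !vf->vring[i].desc ||
		    !vf->vring[i].avail || !vf->vring[i].used)
			return -EINVAL;
	}
	vf->nr_vring = i;

	return ifcvf_start_hw(vf);
}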
+
+static int ifcvf_stop_datapath(void *private)
+{
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(private);
+	int i;
+
+	for (i = 0; i < IFCVF_MAX_QUEUES; i++)
+		vf->vring[i].cb.callback = NULL;


Is any synchronization needed for the vq irq handler?
I think even if we set callback = NULL, the code is still there; on-going routines would not be affected.


OK, I think you mean that when ifcvf_stop_hw() returns, the hardware will not respond to e.g. kicks and other events.
Yes, ifcvf_stop_hw() would disable the hw.




+
+	ifcvf_stop_hw(vf);
+
+	return 0;
+}
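
If the concern is a handler that is already running when the callback is cleared, something like the following sketch could wait for it to finish (vector numbering mirrors ifcvf_init_msix() below; whether this is actually needed depends on how ifcvf_stop_hw() quiesces the device):

static int ifcvf_stop_datapath(void *private)
{
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(private);
	struct ifcvf_adapter *adapter = private;
	struct pci_dev *pdev = to_pci_dev(adapter->dev);
	int i;

	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
		vf->vring[i].cb.callback = NULL;
		/* wait for any in-flight handler on this vq's vector to
		 * finish before the rings are torn down
		 */
		synchronize_irq(pci_irq_vector(pdev, i + IFCVF_MSI_QUEUE_OFF));
	}

	ifcvf_stop_hw(vf);

	return 0;
}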
+
+static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
+{
+	int i;
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		vf->vring[i].last_used_idx = 0;
+		vf->vring[i].last_avail_idx = 0;
+		vf->vring[i].desc = 0;
+		vf->vring[i].avail = 0;
+		vf->vring[i].used = 0;
+		vf->vring[i].ready = 0;
+		vf->vring->cb.callback = NULL;
+		vf->vring->cb.private = NULL;
+	}
+}
+
+static void ifcvf_mdev_set_status(struct mdev_device *mdev, u8 status)
+{
+	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+
+	vf->status = status;
+
+	if (status == 0) {
+		ifcvf_stop_datapath(adapter);
+		ifcvf_reset_vring(adapter);
+		return;
+	}
+
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+		ifcvf_start_datapath(adapter);
+		return;
+	}
+}
+
+static u16 ifcvf_mdev_get_queue_max(struct mdev_device *mdev)
+{
+	return IFCVF_MAX_QUEUES;


The name is confusing; it is used to return the maximum queue size. In the new version of virtio-mdev, the callback was renamed to get_vq_num_max().
will change that.
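
For reference, a sketch of the renamed callback; it should return the maximum ring size rather than the number of virtqueues (IFCVF_QUEUE_MAX is an illustrative constant name for the device's max ring size):

static u16 ifcvf_mdev_get_vq_num_max(struct mdev_device *mdev)
{
	/* max entries per virtqueue, not the number of virtqueues */
	return IFCVF_QUEUE_MAX;
}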


+}
+
+static struct virtio_mdev_device_ops ifc_mdev_ops = {
+	.get_features	= ifcvf_mdev_get_features,
+	.set_features	= ifcvf_mdev_set_features,
+	.get_status	= ifcvf_mdev_get_status,
+	.set_status	= ifcvf_mdev_set_status,
+	.get_queue_max	= ifcvf_mdev_get_queue_max,
+	.get_vq_state	= ifcvf_mdev_get_vq_state,
+	.set_vq_state	= ifcvf_mdev_set_vq_state,
+	.set_vq_cb	= ifcvf_mdev_set_vq_cb,
+	.set_vq_ready	= ifcvf_mdev_set_vq_ready,
+	.get_vq_ready	= ifcvf_mdev_get_vq_ready,
+	.set_vq_num	= ifcvf_mdev_set_vq_num,
+	.set_vq_address	= ifcvf_mdev_set_vq_address,
+	.kick_vq	= ifcvf_mdev_kick_vq,
+	.get_generation	= ifcvf_mdev_get_generation,
+	.get_version	= ifcvf_mdev_get_version,
+	.get_device_id	= ifcvf_mdev_get_device_id,
+	.get_vendor_id	= ifcvf_mdev_get_vendor_id,
+	.get_vq_align	= ifcvf_mdev_get_vq_align,
+};


set_config/get_config are missing. It looks to me they are not hard, just implementing the access to dev_cfg. It's key to making the kernel virtio driver work.

And in the new version of virtio-mdev, features like _F_LOG_ALL should be advertised through get_mdev_features.
IMHO, currently the driver can work without set/get_config; vhost_mdev doesn't call them for now.


Yes, but it is required by virtio_mdev for the host driver to work, and it looks to me it's not hard to add them. If possible please add them and a "virtio" type, then we can use the ops for both the VM and container cases.
sure

Hello Jason,

Just want to double-confirm the implementation of set/get_config: for now, dev_cfg only contains mac[6], status and max_virtqueue_pairs. Is that enough to support virtio_mdev?

Thanks!
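
A rough sketch of the two callbacks, assuming the prototypes of the virtio-mdev RFC and a pair of BAR-access helpers in ifcvf_base (ifcvf_read_net_config() / ifcvf_write_net_config() are illustrative names, not part of this patch):

static void ifcvf_mdev_get_config(struct mdev_device *mdev, unsigned int offset,
				  void *buf, unsigned int len)
{
	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);

	/* bounds-check against the net config (mac, status,
	 * max_virtqueue_pairs), then copy from the device config BAR
	 */
	if (offset + len <= sizeof(struct virtio_net_config))
		ifcvf_read_net_config(vf, offset, buf, len);
}

static void ifcvf_mdev_set_config(struct mdev_device *mdev, unsigned int offset,
				  const void *buf, unsigned int len)
{
	struct ifcvf_adapter *adapter = mdev_get_drvdata(mdev);
	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);

	if (offset + len <= sizeof(struct virtio_net_config))
		ifcvf_write_net_config(vf, offset, buf, len);
}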





+
+static int ifcvf_init_msix(struct ifcvf_adapter *adapter)
+{
+	int vector, i, ret, irq;
+	struct pci_dev *pdev = to_pci_dev(adapter->dev);
+	struct ifcvf_hw *vf = &adapter->vf;
+
+	ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
+			IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		IFC_ERR(adapter->dev, "Failed to alloc irq vectors.\n");
+		return ret;
+	}
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		vector = i + IFCVF_MSI_QUEUE_OFF;
+		irq = pci_irq_vector(pdev, vector);
+		ret = request_irq(irq, ifcvf_intr_handler, 0,
+				  pci_name(pdev), &vf->vring[i]);
+		if (ret) {
+			IFC_ERR(adapter->dev,
+				"Failed to request irq for vq %d.\n", i);
+			return ret;
+		}
+	}


Do we need to provide a fallback when we can't do per-vq MSI-X?
I think it would be very rare that we cannot get enough vectors.


Right.
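
If a fallback ever becomes necessary, one minimal option is to retry with a single shared vector and dispatch to all vrings from one handler (a sketch only, not part of this series):

	ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
				    IFCVF_MAX_INTR, PCI_IRQ_MSIX);
	if (ret < 0)
		/* shared-vector fallback: a single handler would walk
		 * every vring and invoke its callback
		 */
		ret = pci_alloc_irq_vectors(pdev, 1, 1,
					    PCI_IRQ_MSIX | PCI_IRQ_LEGACY);
	if (ret < 0)
		return ret;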




+
+	return 0;
+}
+
+static void ifcvf_destroy_adapter(struct ifcvf_adapter *adapter)
+{
+	int i, vector, irq;
+	struct ifcvf_hw *vf = IFC_PRIVATE_TO_VF(adapter);
+	struct pci_dev *pdev = to_pci_dev(adapter->dev);
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		vector = i + IFCVF_MSI_QUEUE_OFF;
+		irq = pci_irq_vector(pdev, vector);
+		free_irq(irq, &vf->vring[i]);
+	}
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	const char *name = "vhost accelerator (virtio ring compatible)";
+
+	return sprintf(buf, "%s\n", name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VIRTIO_MDEV_DEVICE_API_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static ssize_t available_instances_show(struct kobject *kobj,
+					struct device *dev, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+
+	return sprintf(buf, "%d\n", adapter->mdev_count);
+}
+
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t type_show(struct kobject *kobj,
+			 struct device *dev, char *buf)
+{
+	return sprintf(buf, "%s\n", "net");
+}
+
+MDEV_TYPE_ATTR_RO(type);
+
+
+static struct attribute *mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	&mdev_type_attr_type.attr,
+	NULL,
+};
+
+static struct attribute_group mdev_type_group = {
+	.name = "vdpa_virtio",


To be consistent, it should be "vhost" or "virtio".
agreed!


+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_group,
+	NULL,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+	NULL,
+};
+
+static int ifcvf_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	struct device *dev = mdev_parent_dev(mdev);
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+	int ret = 0;
+
+	mutex_lock(&adapter->mdev_lock);
+
+	if (adapter->mdev_count < 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mdev_set_class_id(mdev, MDEV_ID_VHOST);
+	mdev_set_dev_ops(mdev, &ifc_mdev_ops);
+
+	mdev_set_drvdata(mdev, adapter);
+	mdev_set_iommu_device(mdev_dev(mdev), dev);
+
+	INIT_LIST_HEAD(&adapter->dma_maps);
+	adapter->mdev_count--;
+
+out:
+	mutex_unlock(&adapter->mdev_lock);
+	return ret;
+}
+
+static int ifcvf_mdev_remove(struct mdev_device *mdev)
+{
+	struct device *dev = mdev_parent_dev(mdev);
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+
+	mutex_lock(&adapter->mdev_lock);
+	adapter->mdev_count++;
+	mutex_unlock(&adapter->mdev_lock);
+
+	return 0;
+}
+
+static struct mdev_parent_ops ifcvf_mdev_fops = {
+	.owner			= THIS_MODULE,
+	.supported_type_groups	= mdev_type_groups,
+	.mdev_attr_groups	= mdev_dev_groups,
+	.create			= ifcvf_mdev_create,
+	.remove			= ifcvf_mdev_remove,
+};
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct ifcvf_adapter *adapter;
+	struct ifcvf_hw *vf;
+	int ret, i;
+
+	adapter = kzalloc(sizeof(struct ifcvf_adapter), GFP_KERNEL);
+	if (adapter == NULL) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	mutex_init(&adapter->mdev_lock);
+	adapter->mdev_count = 1;


So this is a per-VF based vDPA implementation, which seems not convenient for management. Anyhow, can we control the creation from the PF?

Thanks
The driver scope for now doesn't support that; we can add these features in the next releases.


Not a must for this series, but it would be better for interaction with management software like libvirt.

Btw, do you have a plan to post the PF driver?

Thanks
Maybe we can work out the PF driver in the next release :)
Thanks,
BR
Zhu Lingshan