Re: [PATCH v5 05/14] vfio/mdev: idxd: add basic mdev registration and helper functions

From: Jason Gunthorpe
Date: Wed Feb 10 2021 - 19:00:41 EST


On Fri, Feb 05, 2021 at 01:53:24PM -0700, Dave Jiang wrote:

> +static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma)
> {
> - /* FIXME: Fill in later */
> + if (vma->vm_end < vma->vm_start)
> + return -EINVAL;

These checks are redundant

> -static int idxd_mdev_host_release(struct idxd_device *idxd)
> +static int idxd_vdcm_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
> +{
> + unsigned int wq_idx, rc;
> + unsigned long req_size, pgoff = 0, offset;
> + pgprot_t pg_prot;
> + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
> + struct idxd_wq *wq = vidxd->wq;
> + struct idxd_device *idxd = vidxd->idxd;
> + enum idxd_portal_prot virt_portal, phys_portal;
> + phys_addr_t base = pci_resource_start(idxd->pdev, IDXD_WQ_BAR);
> + struct device *dev = mdev_dev(mdev);
> +
> + rc = check_vma(wq, vma);
> + if (rc)
> + return rc;
> +
> + pg_prot = vma->vm_page_prot;
> + req_size = vma->vm_end - vma->vm_start;
> + vma->vm_flags |= VM_DONTCOPY;
> +
> + offset = (vma->vm_pgoff << PAGE_SHIFT) &
> + ((1ULL << VFIO_PCI_OFFSET_SHIFT) - 1);
> +
> + wq_idx = offset >> (PAGE_SHIFT + 2);
> + if (wq_idx >= 1) {
> + dev_err(dev, "mapping invalid wq %d off %lx\n",
> + wq_idx, offset);
> + return -EINVAL;
> + }
> +
> + /*
> + * Check and see if the guest wants to map to the limited or unlimited portal.
> + * The driver will allow mapping to unlimited portal only if the the wq is a
> + * dedicated wq. Otherwise, it goes to limited.
> + */
> + virt_portal = ((offset >> PAGE_SHIFT) & 0x3) == 1;
> + phys_portal = IDXD_PORTAL_LIMITED;
> + if (virt_portal == IDXD_PORTAL_UNLIMITED && wq_dedicated(wq))
> + phys_portal = IDXD_PORTAL_UNLIMITED;
> +
> + /* We always map IMS portals to the guest */
> + pgoff = (base + idxd_get_wq_portal_full_offset(wq->id, phys_portal,
> + IDXD_IRQ_IMS)) >> PAGE_SHIFT;
> + dev_dbg(dev, "mmap %lx %lx %lx %lx\n", vma->vm_start, pgoff, req_size,
> + pgprot_val(pg_prot));
> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> + vma->vm_private_data = mdev;

What ensures the mdev pointer is valid strictly longer than the VMA?
This needs refcounting.

> + vma->vm_pgoff = pgoff;
> +
> + return remap_pfn_range(vma, vma->vm_start, pgoff, req_size, pg_prot);

Nothing validated req_size - did you copy this from the Intel RDMA
driver? It had a huge security bug just like this.
> +
> +static int msix_trigger_unregister(struct vdcm_idxd *vidxd, int index)
> +{
> + struct mdev_device *mdev = vidxd->vdev.mdev;
> + struct device *dev = mdev_dev(mdev);
> + struct ims_irq_entry *irq_entry;
> + int rc;
> +
> + if (!vidxd->vdev.msix_trigger[index])
> + return 0;
> +
> + dev_dbg(dev, "disable MSIX trigger %d\n", index);
> + if (index) {
> + u32 auxval;
> +
> + irq_entry = &vidxd->irq_entries[index];
> + if (irq_entry->irq_set) {
> + free_irq(irq_entry->irq, irq_entry);
> + irq_entry->irq_set = false;
> + }
> +
> + auxval = ims_ctrl_pasid_aux(0, false);
> + rc = irq_set_auxdata(irq_entry->irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> + if (rc)
> + return rc;
> + }
> + eventfd_ctx_put(vidxd->vdev.msix_trigger[index]);
> + vidxd->vdev.msix_trigger[index] = NULL;
> +
> + return 0;
> +}
> +
> +static int msix_trigger_register(struct vdcm_idxd *vidxd, u32 fd, int index)
> +{
> + struct mdev_device *mdev = vidxd->vdev.mdev;
> + struct device *dev = mdev_dev(mdev);
> + struct ims_irq_entry *irq_entry;
> + struct eventfd_ctx *trigger;
> + int rc;
> +
> + if (vidxd->vdev.msix_trigger[index])
> + return 0;
> +
> + dev_dbg(dev, "enable MSIX trigger %d\n", index);
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger)) {
> + dev_warn(dev, "eventfd_ctx_fdget failed %d\n", index);
> + return PTR_ERR(trigger);
> + }
> +
> + if (index) {
> + u32 pasid;
> + u32 auxval;
> +
> + irq_entry = &vidxd->irq_entries[index];
> + rc = idxd_mdev_get_pasid(mdev, &pasid);
> + if (rc < 0)
> + return rc;
> +
> + /*
> + * Program and enable the pasid field in the IMS entry. The programmed pasid and
> + * enabled field is checked against the pasid and enable field for the work queue
> + * configuration and the pasid for the descriptor. A mismatch will result in blocked
> + * IMS interrupt.
> + */
> + auxval = ims_ctrl_pasid_aux(pasid, true);
> + rc = irq_set_auxdata(irq_entry->irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> + if (rc < 0)
> + return rc;
> +
> + rc = request_irq(irq_entry->irq, idxd_guest_wq_completion, 0, "idxd-ims",
> + irq_entry);
> + if (rc) {
> + dev_warn(dev, "failed to request ims irq\n");
> + eventfd_ctx_put(trigger);
> + auxval = ims_ctrl_pasid_aux(0, false);
> + irq_set_auxdata(irq_entry->irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> + return rc;
> + }
> + irq_entry->irq_set = true;
> + }
> +
> + vidxd->vdev.msix_trigger[index] = trigger;
> + return 0;
> +}
> +
> +static int vdcm_idxd_set_msix_trigger(struct vdcm_idxd *vidxd,
> + unsigned int index, unsigned int start,
> + unsigned int count, uint32_t flags,
> + void *data)
> +{
> + int i, rc = 0;
> +
> + if (count > VIDXD_MAX_MSIX_ENTRIES - 1)
> + count = VIDXD_MAX_MSIX_ENTRIES - 1;
> +
> + if (count == 0 && (flags & VFIO_IRQ_SET_DATA_NONE)) {
> + /* Disable all MSIX entries */
> + for (i = 0; i < VIDXD_MAX_MSIX_ENTRIES; i++) {
> + rc = msix_trigger_unregister(vidxd, i);
> + if (rc < 0)
> + return rc;
> + }
> + return 0;
> + }
> +
> + for (i = 0; i < count; i++) {
> + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
> + u32 fd = *(u32 *)(data + i * sizeof(u32));
> +
> + rc = msix_trigger_register(vidxd, fd, i);
> + if (rc < 0)
> + return rc;
> + } else if (flags & VFIO_IRQ_SET_DATA_NONE) {
> + rc = msix_trigger_unregister(vidxd, i);
> + if (rc < 0)
> + return rc;
> + }
> + }
> + return rc;
> +}
> +
> +static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags,
> + unsigned int index, unsigned int start,
> + unsigned int count, void *data)
> +{
> + int (*func)(struct vdcm_idxd *vidxd, unsigned int index,
> + unsigned int start, unsigned int count, uint32_t flags,
> + void *data) = NULL;
> + struct mdev_device *mdev = vidxd->vdev.mdev;
> + struct device *dev = mdev_dev(mdev);
> +
> + switch (index) {
> + case VFIO_PCI_INTX_IRQ_INDEX:
> + dev_warn(dev, "intx interrupts not supported.\n");
> + break;
> + case VFIO_PCI_MSI_IRQ_INDEX:
> + dev_dbg(dev, "msi interrupt.\n");
> + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
> + case VFIO_IRQ_SET_ACTION_MASK:
> + case VFIO_IRQ_SET_ACTION_UNMASK:
> + break;
> + case VFIO_IRQ_SET_ACTION_TRIGGER:
> + func = vdcm_idxd_set_msix_trigger;

This would be a good place to insert a common VFIO helper library to
take care of the MSI-X emulation for IMS.

> +int idxd_mdev_host_init(struct idxd_device *idxd)
> +{
> + struct device *dev = &idxd->pdev->dev;
> + int rc;
> +
> + if (!test_bit(IDXD_FLAG_IMS_SUPPORTED, &idxd->flags))
> + return -EOPNOTSUPP;
> +
> + if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)) {
> + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);

Huh. This is the first user of IOMMU_DEV_FEAT_AUX, why has so much
dead-code infrastructure been already merged around this?


> @@ -34,6 +1024,7 @@ static int idxd_mdev_aux_probe(struct auxiliary_device *auxdev,
> return rc;
> }
>
> + set_bit(IDXD_FLAG_MDEV_ENABLED, &idxd->flags);

Something is being done wrong if this flag is needed

> +int vidxd_send_interrupt(struct ims_irq_entry *iie)
> +{
> + /* PLACE HOLDER */
> + return 0;
> +}

Here too, don't structure the patches like this

> diff --git a/drivers/vfio/mdev/idxd/vdev.h b/drivers/vfio/mdev/idxd/vdev.h
> new file mode 100644
> index 000000000000..cc2ba6ccff7b
> +++ b/drivers/vfio/mdev/idxd/vdev.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */
> +
> +#ifndef _IDXD_VDEV_H_
> +#define _IDXD_VDEV_H_
> +
> +#include "mdev.h"
> +
> +int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
> +int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
> +int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count);
> +int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size);
> +void vidxd_mmio_init(struct vdcm_idxd *vidxd);
> +void vidxd_reset(struct vdcm_idxd *vidxd);
> +int vidxd_send_interrupt(struct ims_irq_entry *iie);
> +int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd);
> +void vidxd_free_ims_entries(struct vdcm_idxd *vidxd);

Why are these functions special??

Jason