RE: [RFC 06/20] iommu: Add iommu_device_init[exit]_user_dma interfaces

From: Tian, Kevin
Date: Mon Sep 27 2021 - 05:43:05 EST


> From: Jason Gunthorpe <jgg@xxxxxxxxxx>
> Sent: Wednesday, September 22, 2021 8:40 PM
>
> > > Ie the basic flow would see the driver core doing some:
> >
> > Just double confirm. Is there concern on having the driver core to
> > call iommu functions?
>
> It is always an interesting question, but I'd say iommu is
> foundantional to Linux and if it needs driver core help it shouldn't
> be any different from PM, pinctl, or other subsystems that have
> inserted themselves into the driver core.
>
> Something kind of like the below.
>
> If I recall, once it is done like this then the entire iommu notifier
> infrastructure can be ripped out which is a lot of code.

Currently vfio is the only user of this notifier mechanism. Now
three events are handled in vfio_iommu_group_notifier():

NOTIFY_ADD_DEVICE: this is basically for some sanity check. suppose
not required once we handle it cleanly in the iommu/driver core.

NOTIFY_BOUND_DRIVER: the BUG_ON() logic to be fixed by this change.

NOTIFY_UNBOUND_DRIVER: still needs some thoughts. Based on
the comments the group->unbound_list is used to avoid breaking
group viability check between vfio_unregister_group_dev() and
final dev/drv teardown. within that small window the device is
not tracked by vfio group but is still bound to a driver (e.g. vfio-pci
itself), while an external group user may hold a reference to the
group. Possibly it's not required now with the new mechanism as
we rely on init/exit_user_dma() as the single switch to claim/
withdraw the group ownership. As long as exit_user_dma() is not
called until vfio_group_release(), above small window is covered
thus no need to maintain a unbound_list.

But anyway since this corner case is tricky, will think more in case
of any oversight.

>
>
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 68ea1f949daa90..e39612c99c6123 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -566,6 +566,10 @@ static int really_probe(struct device *dev, struct
> device_driver *drv)
> goto done;
> }
>
> + ret = iommu_set_kernel_ownership(dev);
> + if (ret)
> + return ret;
> +
> re_probe:
> dev->driver = drv;
>
> @@ -673,6 +677,7 @@ static int really_probe(struct device *dev, struct
> device_driver *drv)
> dev->pm_domain->dismiss(dev);
> pm_runtime_reinit(dev);
> dev_pm_set_driver_flags(dev, 0);
> + iommu_release_kernel_ownership(dev);
> done:
> return ret;
> }
> @@ -1214,6 +1219,7 @@ static void __device_release_driver(struct device
> *dev, struct device *parent)
> dev->pm_domain->dismiss(dev);
> pm_runtime_reinit(dev);
> dev_pm_set_driver_flags(dev, 0);
> + iommu_release_kernel_ownership(dev);
>
> klist_remove(&dev->p->knode_driver);
> device_pm_check_callbacks(dev);

I expanded above into below conceptual draft. Please help check whether
it matches your thought:

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 68ea1f9..826a651 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -566,6 +566,10 @@ static int really_probe(struct device *dev, struct device_driver *drv)
goto done;
}

+ ret = iommu_device_set_dma_hint(dev, drv->dma_hint);
+ if (ret)
+ return ret;
+
re_probe:
dev->driver = drv;

@@ -673,6 +677,7 @@ static int really_probe(struct device *dev, struct device_driver *drv)
dev->pm_domain->dismiss(dev);
pm_runtime_reinit(dev);
dev_pm_set_driver_flags(dev, 0);
+ iommu_device_clear_dma_hint(dev);
done:
return ret;
}
@@ -1214,6 +1219,7 @@ static void __device_release_driver(struct device *dev, struct device *parent)
dev->pm_domain->dismiss(dev);
pm_runtime_reinit(dev);
dev_pm_set_driver_flags(dev, 0);
+ iommu_device_clear_dma_hint(dev);

klist_remove(&dev->p->knode_driver);
device_pm_check_callbacks(dev);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3303d70..b12f335 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1064,6 +1064,104 @@ void iommu_group_put(struct iommu_group *group)
}
EXPORT_SYMBOL_GPL(iommu_group_put);

+static int iommu_dev_viable(struct device *dev, void *data)
+{
+ enum dma_hint hint = *data;
+ struct device_driver *drv = READ_ONCE(dev->driver);
+
+ /* no conflict if the new device doesn't do DMA */
+ if (hint == DMA_FOR_NONE)
+ return 0;
+
+ /* no conflict if this device is driver-less, or doesn't do DMA */
+ if (!drv || (drv->dma_hint == DMA_FOR_NONE))
+ return 0;
+
+ /* kernel dma and user dma are exclusive */
+ if (hint != drv->dma_hint)
+ return -EINVAL;
+
+ /*
+ * devices in the group could be bound to different user-dma
+ * drivers (e.g. vfio-pci, vdpa, etc.), or even bound to the
+ * same driver but eventually opened via different mechanisms
+ * (e.g. vfio group vs. nongroup interfaces). We rely on
+ * iommu_{group/device}_init_user_dma() to ensure exclusive
+ * user-dma ownership (iommufd ctx, vfio container ctx, etc.)
+ * in such scenario.
+ */
+ return 0;
+}
+
+static int __iommu_group_viable(struct iommu_group *group, enum dma_hint hint)
+{
+ return (__iommu_group_for_each_dev(group, &hint,
+ iommu_dev_viable) == 0);
+}
+
+int iommu_device_set_dma_hint(struct device *dev, enum dma_hint hint)
+{
+ struct iommu_group *group;
+ int ret;
+
+ group = iommu_group_get(dev);
+ /* not an iommu-probed device */
+ if (!group)
+ return 0;
+
+ mutex_lock(&group->mutex);
+ ret = __iommu_group_viable(group, hint);
+ mutex_unlock(&group->mutex);
+
+ iommu_group_put(group);
+ return ret;
+}
+
+/* any housekeeping? */
+void iommu_device_clear_dma_hint(struct device *dev) {}
+
+/* claim group ownership for user-dma */
+int __iommu_group_init_user_dma(struct iommu_group *group,
+ unsigned long owner)
+{
+ int ret;
+
+ ret = __iommu_group_viable(group, DMA_FOR_USER);
+ if (ret)
+ goto out;
+
+ /* other logic for exclusive user_dma ownership and refcounting */
+out:
+ return ret;
+}
+
+int iommu_group_init_user_dma(struct iommu_group *group,
+ unsigned long owner)
+{
+ int ret;
+
+ mutex_lock(&group->mutex);
+ ret = __iommu_group_init_user_dma(group, owner);
+ mutex_unlock(&group->mutex);
+ return ret;
+}
+
+int iommu_device_init_user_dma(struct device *dev,
+ unsigned long owner)
+{
+ struct iommu_group *group = iommu_group_get(dev);
+ int ret;
+
+ if (!group)
+ return -ENODEV;
+
+ mutex_lock(&group->mutex);
+ ret = __iommu_group_init_user_dma(group, owner);
+ mutex_unlock(&group->mutex);
+ iommu_grou_put(group);
+ return ret;
+}
+
/**
* iommu_group_register_notifier - Register a notifier for group changes
* @group: the group to watch
diff --git a/drivers/pci/pci-stub.c b/drivers/pci/pci-stub.c
index e408099..4568811 100644
--- a/drivers/pci/pci-stub.c
+++ b/drivers/pci/pci-stub.c
@@ -36,6 +36,9 @@ static int pci_stub_probe(struct pci_dev *dev, const struct pci_device_id *id)
.name = "pci-stub",
.id_table = NULL, /* only dynamic id's */
.probe = pci_stub_probe,
+ .driver = {
+ .dma_hint = DMA_FOR_NONE,
+ },
};

static int __init pci_stub_init(void)
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index dcd648e..a613b78 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -678,6 +678,9 @@ static void ifcvf_remove(struct pci_dev *pdev)
.id_table = ifcvf_pci_ids,
.probe = ifcvf_probe,
.remove = ifcvf_remove,
+ .driver = {
+ .dma_hint = DMA_FOR_USER,
+ },
};

module_pci_driver(ifcvf_driver);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index a5ce92b..61c422d 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -193,6 +193,9 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
.remove = vfio_pci_remove,
.sriov_configure = vfio_pci_sriov_configure,
.err_handler = &vfio_pci_core_err_handlers,
+ .driver = {
+ .dma_hint = DMA_FOR_USER,
+ },
};

static void __init vfio_pci_fill_ids(void)
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index a498ebc..6bddfd2 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -48,6 +48,17 @@ enum probe_type {
};

/**
+ * enum dma_hint - device driver dma hint
+ * Device drivers may provide hints for whether dma is
+ * intended for kernel driver, user driver, not not required.
+ */
+enum dma_hint {
+ DMA_FOR_KERNEL,
+ DMA_FOR_USER,
+ DMA_FOR_NONE,
+};
+
+/**
* struct device_driver - The basic device driver structure
* @name: Name of the device driver.
* @bus: The bus which the device of this driver belongs to.
@@ -101,6 +112,7 @@ struct device_driver {

bool suppress_bind_attrs; /* disables bind/unbind via sysfs */
enum probe_type probe_type;
+ enum dma_type dma_type;

const struct of_device_id *of_match_table;
const struct acpi_device_id *acpi_match_table;

Thanks
Kevin