On 9/1/22 6:25 AM, Robin Murphy wrote:
On 2022-08-31 21:12, Matthew Rosato wrote:
With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
calls") s390-iommu is supposed to handle dynamic switching between IOMMU
domains and the DMA API handling. However, this commit does not
sufficiently handle the case where the device is released via a call
to the release_device op as it may occur at the same time as an opposing
attach_dev or detach_dev since the group mutex is not held over
release_device. This was observed when the device was deconfigured within a
small window during vfio-pci initialization and can result in WARNs and
potential kernel panics.
Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
Robin.
So, I generally have seen the issue manifest as one of the calls into the iommu core from __vfio_group_unset_container (e.g. iommu_detach_group via vfio_iommu_type1) failing with a WARN. This happens when the vfio group fd is released, which could be coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. AFAICT there's nothing serializing calls into the iommu core here against a device that is simultaneously going through release_device (because we don't enter release_device with the group mutex held), resulting in unpredictable behavior between the dueling attach_dev/detach_dev and the release_device, for s390-iommu at least.
Handle this by tracking when the device is probed/released via
dev_iommu_priv_set/get(). Ensure that once the device is released only
release_device handles the re-init of the device DMA.
Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
Signed-off-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
---
arch/s390/include/asm/pci.h | 1 +
arch/s390/pci/pci.c | 1 +
drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++---
3 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 7b4cdadbc023..080251e7b275 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -157,6 +157,7 @@ struct zpci_dev {
/* DMA stuff */
unsigned long *dma_table;
spinlock_t dma_table_lock;
+ struct mutex dma_domain_lock; /* protects s390_domain value */
int tlb_refresh;
spinlock_t iommu_bitmap_lock;
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 73cdc5539384..973edd32ecc9 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
kref_init(&zdev->kref);
mutex_init(&zdev->lock);
mutex_init(&zdev->kzdev_lock);
+ mutex_init(&zdev->dma_domain_lock);
rc = zpci_init_iommu(zdev);
if (rc)
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce11..1137d669e849 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
if (!domain_device)
return -ENOMEM;
+ /* Leave now if the device has already been released */
+ mutex_lock(&zdev->dma_domain_lock);
+ if (!dev_iommu_priv_get(dev)) {
+ mutex_unlock(&zdev->dma_domain_lock);
+ kfree(domain_device);
+ return 0;
+ }
+
if (zdev->dma_table && !zdev->s390_domain) {
cc = zpci_dma_exit_device(zdev);
if (cc) {
@@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
goto out_restore;
}
domain_device->zdev = zdev;
- zdev->s390_domain = s390_domain;
list_add(&domain_device->list, &s390_domain->devices);
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+ zdev->s390_domain = s390_domain;
+ mutex_unlock(&zdev->dma_domain_lock);
return 0;
@@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
virt_to_phys(zdev->dma_table));
}
out_free:
+ mutex_unlock(&zdev->dma_domain_lock);
kfree(domain_device);
return rc;
@@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
}
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- if (found && (zdev->s390_domain == s390_domain)) {
+ mutex_lock(&zdev->dma_domain_lock);
+ if (found && (zdev->s390_domain == s390_domain) &&
+ dev_iommu_priv_get(dev)) {
zdev->s390_domain = NULL;
zpci_unregister_ioat(zdev, 0);
zpci_dma_init_device(zdev);
}
+ mutex_unlock(&zdev->dma_domain_lock);
}
static struct iommu_device *s390_iommu_probe_device(struct device *dev)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
+ dev_iommu_priv_set(dev, zdev);
+
return &zdev->iommu_dev;
}
@@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev)
*
* So let's call detach_dev from here if it hasn't been called before.
*/
- if (zdev && zdev->s390_domain) {
+ if (zdev) {
+ /*
+ * Clear priv to block further attaches for this device,
+ * ensure detaches don't init DMA. Hold the domain lock
+ * to ensure that attach/detach get a consistent view of
+ * whether or not the device is released.
+ */
+ mutex_lock(&zdev->dma_domain_lock);
+ dev_iommu_priv_set(dev, NULL);
+ mutex_unlock(&zdev->dma_domain_lock);
+ /* Make sure this device is removed from the domain list */
domain = iommu_get_domain_for_dev(dev);
if (domain)
s390_iommu_detach_device(domain, dev);
+ /* Now ensure DMA is initialized from here */
+ mutex_lock(&zdev->dma_domain_lock);
+ if (zdev->s390_domain) {
+ zdev->s390_domain = NULL;
+ zpci_unregister_ioat(zdev, 0);
+ zpci_dma_init_device(zdev);
+ }
+ mutex_unlock(&zdev->dma_domain_lock);
}
}