Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops

From: Matthew Rosato
Date: Thu Sep 01 2022 - 12:15:01 EST


On 9/1/22 6:25 AM, Robin Murphy wrote:
> On 2022-08-31 21:12, Matthew Rosato wrote:
>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
>> domains and the DMA API handling.  However, this commit does not
>> sufficiently handle the case where the device is released via a call
>> to the release_device op as it may occur at the same time as an opposing
>> attach_dev or detach_dev since the group mutex is not held over
>> release_device.  This was observed if the device is deconfigured during a
>> small window during vfio-pci initialization and can result in WARNs and
>> potential kernel panics.
>
> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
>
> Robin.

So, I generally have seen the issue manifest as one of the calls into the iommu core from __vfio_group_unset_container (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. This happens when the vfio group fd is released, which could be coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. AFAICT there's nothing serializing the notion of calling into the iommu core here against a device that is simultaneously going through release_device (because we don't enter release_device with the group mutex held), resulting in unpredictable behavior between the dueling attach_dev/detach_dev and the release_device for s390-iommu at least.


>
>> Handle this by tracking when the device is probed/released via
>> dev_iommu_priv_set/get().  Ensure that once the device is released only
>> release_device handles the re-init of the device DMA.
>>
>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
>> Signed-off-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
>> ---
>>   arch/s390/include/asm/pci.h |  1 +
>>   arch/s390/pci/pci.c         |  1 +
>>   drivers/iommu/s390-iommu.c  | 39 ++++++++++++++++++++++++++++++++++---
>>   3 files changed, 38 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
>> index 7b4cdadbc023..080251e7b275 100644
>> --- a/arch/s390/include/asm/pci.h
>> +++ b/arch/s390/include/asm/pci.h
>> @@ -157,6 +157,7 @@ struct zpci_dev {
>>       /* DMA stuff */
>>       unsigned long    *dma_table;
>>       spinlock_t    dma_table_lock;
>> +    struct mutex    dma_domain_lock; /* protects s390_domain value */
>>       int        tlb_refresh;
>>         spinlock_t    iommu_bitmap_lock;
>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>> index 73cdc5539384..973edd32ecc9 100644
>> --- a/arch/s390/pci/pci.c
>> +++ b/arch/s390/pci/pci.c
>> @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
>>       kref_init(&zdev->kref);
>>       mutex_init(&zdev->lock);
>>       mutex_init(&zdev->kzdev_lock);
>> +    mutex_init(&zdev->dma_domain_lock);
>>         rc = zpci_init_iommu(zdev);
>>       if (rc)
>> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
>> index c898bcbbce11..1137d669e849 100644
>> --- a/drivers/iommu/s390-iommu.c
>> +++ b/drivers/iommu/s390-iommu.c
>> @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>       if (!domain_device)
>>           return -ENOMEM;
>>   +    /* Leave now if the device has already been released */
>> +    mutex_lock(&zdev->dma_domain_lock);
>> +    if (!dev_iommu_priv_get(dev)) {
>> +        mutex_unlock(&zdev->dma_domain_lock);
>> +        kfree(domain_device);
>> +        return 0;
>> +    }
>> +
>>       if (zdev->dma_table && !zdev->s390_domain) {
>>           cc = zpci_dma_exit_device(zdev);
>>           if (cc) {
>> @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>           goto out_restore;
>>       }
>>       domain_device->zdev = zdev;
>> -    zdev->s390_domain = s390_domain;
>>       list_add(&domain_device->list, &s390_domain->devices);
>>       spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>> +    zdev->s390_domain = s390_domain;
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>         return 0;
>>   @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>                      virt_to_phys(zdev->dma_table));
>>       }
>>   out_free:
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>       kfree(domain_device);
>>         return rc;
>> @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
>>       }
>>       spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>>   -    if (found && (zdev->s390_domain == s390_domain)) {
>> +    mutex_lock(&zdev->dma_domain_lock);
>> +    if (found && (zdev->s390_domain == s390_domain) &&
>> +        dev_iommu_priv_get(dev)) {
>>           zdev->s390_domain = NULL;
>>           zpci_unregister_ioat(zdev, 0);
>>           zpci_dma_init_device(zdev);
>>       }
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>   }
>>     static struct iommu_device *s390_iommu_probe_device(struct device *dev)
>>   {
>>       struct zpci_dev *zdev = to_zpci_dev(dev);
>>   +    dev_iommu_priv_set(dev, zdev);
>> +
>>       return &zdev->iommu_dev;
>>   }
>>   @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev)
>>        *
>>        * So let's call detach_dev from here if it hasn't been called before.
>>        */
>> -    if (zdev && zdev->s390_domain) {
>> +    if (zdev) {
>> +        /*
>> +         * Clear priv to block further attaches for this device,
>> +         * ensure detaches don't init DMA.  Hold the domain lock
>> +         * to ensure that attach/detach get a consistent view of
>> +         * whether or not the device is released.
>> +         */
>> +        mutex_lock(&zdev->dma_domain_lock);
>> +        dev_iommu_priv_set(dev, NULL);
>> +        mutex_unlock(&zdev->dma_domain_lock);
>> +        /* Make sure this device is removed from the domain list */
>>           domain = iommu_get_domain_for_dev(dev);
>>           if (domain)
>>               s390_iommu_detach_device(domain, dev);
>> +        /* Now ensure DMA is initialized from here */
>> +        mutex_lock(&zdev->dma_domain_lock);
>> +        if (zdev->s390_domain) {
>> +            zdev->s390_domain = NULL;
>> +            zpci_unregister_ioat(zdev, 0);
>> +            zpci_dma_init_device(zdev);
>> +        }
>> +        mutex_unlock(&zdev->dma_domain_lock);
>>       }
>>   }
>>