Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops
From: Matthew Rosato
Date: Thu Sep 01 2022 - 12:15:01 EST
On 9/1/22 6:25 AM, Robin Murphy wrote:
> On 2022-08-31 21:12, Matthew Rosato wrote:
>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
>> domains and the DMA API handling. However, this commit does not
>> sufficiently handle the case where the device is released via a call
>> to the release_device op as it may occur at the same time as an opposing
>> attach_dev or detach_dev since the group mutex is not held over
>> release_device. This was observed if the device is deconfigured during a
>> small window during vfio-pci initialization and can result in WARNs and
>> potential kernel panics.
>
> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
>
> Robin.
So, I have generally seen the issue manifest as one of the calls into the iommu core from __vfio_group_unset_container (e.g. iommu_detach_group via vfio_iommu_type1) failing with a WARN. This happens when the vfio group fd is released, which could be coming e.g. from a userspace VFIO_GROUP_UNSET_CONTAINER ioctl. AFAICT there's nothing serializing these calls into the iommu core against a device that is simultaneously going through release_device (because we don't enter release_device with the group mutex held), resulting in unpredictable behavior between the dueling attach_dev/detach_dev and the release_device, for s390-iommu at least.
>
>> Handle this by tracking when the device is probed/released via
>> dev_iommu_priv_set/get(). Ensure that once the device is released only
>> release_device handles the re-init of the device DMA.
>>
>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
>> Signed-off-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
>> ---
>> arch/s390/include/asm/pci.h | 1 +
>> arch/s390/pci/pci.c | 1 +
>> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++---
>> 3 files changed, 38 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
>> index 7b4cdadbc023..080251e7b275 100644
>> --- a/arch/s390/include/asm/pci.h
>> +++ b/arch/s390/include/asm/pci.h
>> @@ -157,6 +157,7 @@ struct zpci_dev {
>> /* DMA stuff */
>> unsigned long *dma_table;
>> spinlock_t dma_table_lock;
>> + struct mutex dma_domain_lock; /* protects s390_domain value */
>> int tlb_refresh;
>> spinlock_t iommu_bitmap_lock;
>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>> index 73cdc5539384..973edd32ecc9 100644
>> --- a/arch/s390/pci/pci.c
>> +++ b/arch/s390/pci/pci.c
>> @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
>> kref_init(&zdev->kref);
>> mutex_init(&zdev->lock);
>> mutex_init(&zdev->kzdev_lock);
>> + mutex_init(&zdev->dma_domain_lock);
>> rc = zpci_init_iommu(zdev);
>> if (rc)
>> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
>> index c898bcbbce11..1137d669e849 100644
>> --- a/drivers/iommu/s390-iommu.c
>> +++ b/drivers/iommu/s390-iommu.c
>> @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>> if (!domain_device)
>> return -ENOMEM;
>> + /* Leave now if the device has already been released */
>> + mutex_lock(&zdev->dma_domain_lock);
>> + if (!dev_iommu_priv_get(dev)) {
>> + mutex_unlock(&zdev->dma_domain_lock);
>> + kfree(domain_device);
>> + return 0;
>> + }
>> +
>> if (zdev->dma_table && !zdev->s390_domain) {
>> cc = zpci_dma_exit_device(zdev);
>> if (cc) {
>> @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>> goto out_restore;
>> }
>> domain_device->zdev = zdev;
>> - zdev->s390_domain = s390_domain;
>> list_add(&domain_device->list, &s390_domain->devices);
>> spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>> + zdev->s390_domain = s390_domain;
>> + mutex_unlock(&zdev->dma_domain_lock);
>> return 0;
>> @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>> virt_to_phys(zdev->dma_table));
>> }
>> out_free:
>> + mutex_unlock(&zdev->dma_domain_lock);
>> kfree(domain_device);
>> return rc;
>> @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
>> }
>> spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>> - if (found && (zdev->s390_domain == s390_domain)) {
>> + mutex_lock(&zdev->dma_domain_lock);
>> + if (found && (zdev->s390_domain == s390_domain) &&
>> + dev_iommu_priv_get(dev)) {
>> zdev->s390_domain = NULL;
>> zpci_unregister_ioat(zdev, 0);
>> zpci_dma_init_device(zdev);
>> }
>> + mutex_unlock(&zdev->dma_domain_lock);
>> }
>> static struct iommu_device *s390_iommu_probe_device(struct device *dev)
>> {
>> struct zpci_dev *zdev = to_zpci_dev(dev);
>> + dev_iommu_priv_set(dev, zdev);
>> +
>> return &zdev->iommu_dev;
>> }
>> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev)
>> *
>> * So let's call detach_dev from here if it hasn't been called before.
>> */
>> - if (zdev && zdev->s390_domain) {
>> + if (zdev) {
>> + /*
>> + * Clear priv to block further attaches for this device,
>> + * ensure detaches don't init DMA. Hold the domain lock
>> + * to ensure that attach/detach get a consistent view of
>> + * whether or not the device is released.
>> + */
>> + mutex_lock(&zdev->dma_domain_lock);
>> + dev_iommu_priv_set(dev, NULL);
>> + mutex_unlock(&zdev->dma_domain_lock);
>> + /* Make sure this device is removed from the domain list */
>> domain = iommu_get_domain_for_dev(dev);
>> if (domain)
>> s390_iommu_detach_device(domain, dev);
>> + /* Now ensure DMA is initialized from here */
>> + mutex_lock(&zdev->dma_domain_lock);
>> + if (zdev->s390_domain) {
>> + zdev->s390_domain = NULL;
>> + zpci_unregister_ioat(zdev, 0);
>> + zpci_dma_init_device(zdev);
>> + }
>> + mutex_unlock(&zdev->dma_domain_lock);
>> }
>> }
>>