[PATCH v1 1/2] iommu: Do not call pci_dev_reset_iommu_done() unless reset succeeds
From: Nicolin Chen
Date: Thu Mar 05 2026 - 00:24:42 EST
IOMMU drivers handle ATC cache maintenance. They may encounter ATC-related
errors (e.g., ATC invalidation request timeout), which are typically sent
to the driver's ISR. To recover from such errors, the driver would need to
initiate a device reset procedure (I/O waiting) in an asynchronous thread.
If somehow the reset procedure fails, the ATC will be out of sync with the
OS, since the memory is already unmapped and could even be re-assigned. In
this case, the device must be kept in the resetting domain, to prevent any
memory corruption.
Yet, currently pci_dev_reset_iommu_done() is called unconditionally:
IOMMU recovery thread():
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Block RID/ATS
__reset(); // Failed (ATC is still stale)
pci_dev_reset_iommu_done(); // Unblock RID/ATS (ah-ha)
The simplest fix is to use pci_dev_reset_iommu_done() only on a successful
reset:
IOMMU recovery thread():
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Block RID/ATS
if (!__reset())
pci_dev_reset_iommu_done(); // Unblock RID/ATS
else
// keep the device blocked by IOMMU
However, this breaks the symmetry (prepare/done pairing) requirement of these
reset APIs, so we have to allow a re-entry to let a second reset attempt pass:
IOMMU recovery thread():
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Block RID/ATS
__reset(); // Failed (ATC is still stale)
// Keep the device blocked by IOMMU
...
Another thread():
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Re-entry (!)
Update the function kdocs and all the existing callers to only unblock ATS
when the reset succeeds. Drop the WARN_ON in pci_dev_reset_iommu_prepare()
to allow re-entries.
Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx>
---
drivers/iommu/iommu.c | 16 +++++++++-----
drivers/pci/pci-acpi.c | 11 +++++++++-
drivers/pci/pci.c | 50 +++++++++++++++++++++++++++++++++++++-----
drivers/pci/quirks.c | 11 +++++++++-
4 files changed, 75 insertions(+), 13 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 35db517809540..40a15c9360bd1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3938,8 +3938,10 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
* IOMMU activity while leaving the group->domain pointer intact. Later when the
* reset is finished, pci_dev_reset_iommu_done() can restore everything.
*
- * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done()
- * before/after the core-level reset routine, to unset the resetting_domain.
+ * Caller must use pci_dev_reset_iommu_done() after a successful PCI-level reset
+ * to unset the resetting_domain. If the reset fails, caller can choose to keep
+ * the device in the resetting_domain to protect system memory using IOMMU from
+ * any bad ATS.
*
* Return: 0 on success or negative error code if the preparation failed.
*
@@ -3961,9 +3963,9 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
guard(mutex)(&group->mutex);
- /* Re-entry is not allowed */
- if (WARN_ON(group->resetting_domain))
- return -EBUSY;
+ /* Already prepared */
+ if (group->resetting_domain)
+ return 0;
ret = __iommu_group_alloc_blocking_domain(group);
if (ret)
@@ -4001,7 +4003,9 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
* re-attaching all RID/PASID of the device's back to the domains retained in
* the core-level structure.
*
- * Caller must pair it with a successful pci_dev_reset_iommu_prepare().
+ * This is a pairing function for pci_dev_reset_iommu_prepare(). Caller should
+ * use it on a successful PCI-level reset. Otherwise, it's suggested for caller
+ * to keep the device in the resetting_domain to protect system memory.
*
* Note that, although unlikely, there is a risk that re-attaching domains might
* fail due to some unexpected happening like OOM.
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 4d0f2cb6c695b..f1a918938242c 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -16,6 +16,7 @@
#include <linux/pci_hotplug.h>
#include <linux/module.h>
#include <linux/pci-acpi.h>
+#include <linux/pci-ats.h>
#include <linux/pci-ecam.h>
#include <linux/pm_runtime.h>
#include <linux/pm_qos.h>
@@ -977,7 +978,15 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
ret = -ENOTTY;
}
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!ret || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return ret;
}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8479c2e1f74f1..80c5cf6eeebdc 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4358,7 +4358,15 @@ int pcie_flr(struct pci_dev *dev)
ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
done:
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!ret || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return ret;
}
EXPORT_SYMBOL_GPL(pcie_flr);
@@ -4436,7 +4444,15 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
done:
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!ret || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return ret;
}
@@ -4490,7 +4506,15 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
pci_dev_d3_sleep(dev);
ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!ret || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return ret;
}
@@ -4933,7 +4957,15 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe)
rc = pci_parent_bus_reset(dev, probe);
done:
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!rc || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return rc;
}
@@ -4978,7 +5010,15 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
reg);
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!rc || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return rc;
}
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 48946cca4be72..d9a03a7772916 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -30,6 +30,7 @@
#include <linux/ktime.h>
#include <linux/mm.h>
#include <linux/nvme.h>
+#include <linux/pci-ats.h>
#include <linux/platform_data/x86/apple.h>
#include <linux/pm_runtime.h>
#include <linux/sizes.h>
@@ -4269,7 +4270,15 @@ static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
}
ret = i->reset(dev, probe);
- pci_dev_reset_iommu_done(dev);
+ /*
+ * The reset might be invoked to recover a serious error. E.g. when the
+ * ATC failed to invalidate its stale entries, which can result in data
+ * corruption. Thus, do not unblock ATS until a successful reset.
+ */
+ if (!ret || !pci_ats_supported(dev))
+ pci_dev_reset_iommu_done(dev);
+ else
+ pci_warn(dev, "Reset failed. Blocking ATS to protect memory\n");
return ret;
}
--
2.43.0