[PATCH v3 02/11] iommu: Pass in reset result to pci_dev_reset_iommu_done()
From: Nicolin Chen
Date: Thu Apr 16 2026 - 19:32:44 EST
IOMMU drivers handle ATC (Address Translation Cache) maintenance. They may
encounter ATC-related errors (e.g., an ATC invalidation request timeout),
indicating that the ATC might hold stale entries that can corrupt memory. In
this case, the IOMMU driver has no choice but to block the device's ATS
function and wait for a device recovery.
The pci_dev_reset_iommu_done() call at the end of a reset function could
serve as a reliable signal to the IOMMU subsystem that the physical device
cache is completely clean. However, the function is called unconditionally,
even if the reset operation has actually failed, which re-attaches the
faulty device to a normal translation domain. This leaves the system
exposed to data corruption:
IOMMU blocks RID/ATS
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Block RID/ATS
__reset(); // Failed (ATC is still stale)
pci_dev_reset_iommu_done(); // Unblock RID/ATS despite the failed reset
Instead, add a @reset_succeeds parameter to pci_dev_reset_iommu_done() and
pass the reset result from each caller:
IOMMU blocks RID/ATS
pci_reset_function():
pci_dev_reset_iommu_prepare(); // Block RID/ATS
rc = __reset();
pci_dev_reset_iommu_done(!rc); // Unblock or quarantine
On a successful reset, pci_dev_reset_iommu_done() restores the device to its
RID/PASID domains and decrements group->recovery_cnt. On a failed reset, the
device remains blocked with group->recovery_cnt left intact, and any domain
attachment attempted in the meantime is rejected until a subsequent
successful reset.
Suggested-by: Kevin Tian <kevin.tian@xxxxxxxxx>
Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx>
---
include/linux/iommu.h | 5 +++--
drivers/iommu/iommu.c | 28 +++++++++++++++++++++++++---
drivers/pci/pci-acpi.c | 2 +-
drivers/pci/pci.c | 10 +++++-----
drivers/pci/quirks.c | 2 +-
5 files changed, 35 insertions(+), 12 deletions(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 54b8b48c762e8..d3685967e960a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1191,7 +1191,7 @@ void iommu_free_global_pasid(ioasid_t pasid);
/* PCI device reset functions */
int pci_dev_reset_iommu_prepare(struct pci_dev *pdev);
-void pci_dev_reset_iommu_done(struct pci_dev *pdev);
+void pci_dev_reset_iommu_done(struct pci_dev *pdev, bool reset_succeeds);
#else /* CONFIG_IOMMU_API */
struct iommu_ops {};
@@ -1521,7 +1521,8 @@ static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
return 0;
}
-static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev,
+ bool reset_succeeds)
{
}
#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ff181db687bbf..28d4c1f143a08 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -80,6 +80,7 @@ struct group_device {
* Device is blocked for a pending recovery while its group->domain is
* retained. This can happen when:
* - Device is undergoing a reset
+ * - Device failed the last reset
*/
bool blocked;
unsigned int reset_depth;
@@ -3971,7 +3972,9 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
* reset is finished, pci_dev_reset_iommu_done() can restore everything.
*
* Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done()
- * before/after the core-level reset routine, to decrement the recovery_cnt.
+ * before/after the core-level reset routine. On a successful reset, done() will
+ * decrement group->recovery_cnt and restore domains. On a failure, recovery_cnt
+ * is left intact and the device stays blocked.
*
* Return: 0 on success or negative error code if the preparation failed.
*
@@ -4000,6 +4003,9 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
if (gdev->reset_depth++)
return 0;
+ /* Device might be already blocked for a quarantine */
+ if (gdev->blocked)
+ return 0;
ret = __iommu_group_alloc_blocking_domain(group);
if (ret)
@@ -4047,18 +4053,22 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
/**
* pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
* @pdev: PCI device that has finished a reset routine
+ * @reset_succeeds: Whether the PCI device reset is successful or not
*
* After a PCIe device finishes a reset routine, it wants to restore its IOMMU
* activity, including new translation and cache invalidation, by re-attaching
* all RID/PASID of the device back to the domains retained in the core-level
* structure.
*
- * Caller must pair it with a successful pci_dev_reset_iommu_prepare().
+ * This is a pairing function for pci_dev_reset_iommu_prepare(). Caller should
+ * pass in the reset state via @reset_succeeds. On a failed reset, the device
+ * remains blocked for a quarantine with the group->recovery_cnt intact, so as
+ * to protect system memory until a subsequent successful reset.
*
* Note that, although unlikely, there is a risk that re-attaching domains might
* fail due to some unexpected happening like OOM.
*/
-void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+void pci_dev_reset_iommu_done(struct pci_dev *pdev, bool reset_succeeds)
{
struct iommu_group *group = pdev->dev.iommu_group;
struct group_device *gdev;
@@ -4083,6 +4093,18 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev)
if (WARN_ON(!group->blocking_domain))
return;
+ /*
+ * A reset failure implies that the device might be unreliable. E.g. its
+ * device cache might retain stale entries, which potentially results in
+ * memory corruption. Thus, do not unblock the device until a successful
+ * reset.
+ */
+ if (!reset_succeeds) {
+ pci_err(pdev,
+ "Reset failed. Keep it blocked to protect memory\n");
+ return;
+ }
+
/* Re-attach RID domain back to group->domain */
if (group->domain != group->blocking_domain) {
WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 4d0f2cb6c695b..9ffd7f013a7d4 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -977,7 +977,7 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
ret = -ENOTTY;
}
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !ret);
return ret;
}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8479c2e1f74f1..d78e724027c78 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4358,7 +4358,7 @@ int pcie_flr(struct pci_dev *dev)
ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
done:
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !ret);
return ret;
}
EXPORT_SYMBOL_GPL(pcie_flr);
@@ -4436,7 +4436,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
done:
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !ret);
return ret;
}
@@ -4490,7 +4490,7 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
pci_dev_d3_sleep(dev);
ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !ret);
return ret;
}
@@ -4933,7 +4933,7 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe)
rc = pci_parent_bus_reset(dev, probe);
done:
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !rc);
return rc;
}
@@ -4978,7 +4978,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
reg);
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !rc);
return rc;
}
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 05ce12b6b2f76..6ce79a25e5c76 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4271,7 +4271,7 @@ static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
}
ret = i->reset(dev, probe);
- pci_dev_reset_iommu_done(dev);
+ pci_dev_reset_iommu_done(dev, !ret);
return ret;
}
--
2.43.0