[PATCH v8 06/16] CXL/PCI: Introduce CXL uncorrectable protocol error 'recovery'

From: Terry Bowman
Date: Wed Mar 26 2025 - 21:53:04 EST


Create cxl_do_recovery() to provide uncorrectable protocol error (UCE)
handling. Follow similar design as found in PCIe error driver,
pcie_do_recovery(). One difference is that cxl_do_recovery() will treat all
UCEs as fatal with a kernel panic. This is to prevent corruption on CXL
memory.

Copy the PCIe error handlers merge_result(). Introduce PCI_ERS_RESULT_PANIC
and add support in the merge_result() routine.

Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the
first device in all cases.

Copy report_error_detected() to cxl_report_error_detected(). Update this
function to populate the CXL error information structure, 'struct
cxl_prot_error_info', before calling the device error handler.

Call panic() to halt the system in the case of uncorrectable errors (UCE)
in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use
if a UCE is not found. In this case the AER status must be cleared and
uses pci_aer_clear_fatal_status().

Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
---
drivers/cxl/core/ras.c | 92 +++++++++++++++++++++++++++++++++++++++++-
drivers/pci/pci.h | 2 -
include/linux/pci.h | 5 +++
3 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index eca8f11a05d9..1f94fc08e72b 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -141,7 +141,97 @@ int cxl_create_prot_err_info(struct pci_dev *_pdev, int severity,
}
EXPORT_SYMBOL_NS_GPL(cxl_create_prot_err_info, "CXL");

-static void cxl_do_recovery(struct pci_dev *pdev) { }
+
+static pci_ers_result_t merge_result(enum pci_ers_result orig,
+ enum pci_ers_result new)
+{
+ if (new == PCI_ERS_RESULT_PANIC)
+ return PCI_ERS_RESULT_PANIC;
+
+ if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
+ return PCI_ERS_RESULT_NO_AER_DRIVER;
+
+ if (new == PCI_ERS_RESULT_NONE)
+ return orig;
+
+ switch (orig) {
+ case PCI_ERS_RESULT_CAN_RECOVER:
+ case PCI_ERS_RESULT_RECOVERED:
+ orig = new;
+ break;
+ case PCI_ERS_RESULT_DISCONNECT:
+ if (new == PCI_ERS_RESULT_NEED_RESET)
+ orig = PCI_ERS_RESULT_NEED_RESET;
+ break;
+ default:
+ break;
+ }
+
+ return orig;
+}
+
+static void cxl_walk_bridge(struct pci_dev *bridge,
+ int (*cb)(struct pci_dev *, void *),
+ void *userdata)
+{
+ if (cb(bridge, userdata))
+ return;
+
+ if (bridge->subordinate)
+ pci_walk_bus(bridge->subordinate, cb, userdata);
+}
+
+
+static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
+{
+ struct cxl_driver *pdrv;
+ pci_ers_result_t vote, *result = data;
+ struct cxl_prot_error_info err_info = { 0 };
+ const struct cxl_error_handlers *cxl_err_handler;
+
+ if (cxl_create_prot_err_info(pdev, AER_FATAL, &err_info))
+ return 0;
+
+ struct device *dev __free(put_device) = get_device(err_info.dev);
+ if (!dev)
+ return 0;
+
+ pdrv = to_cxl_drv(dev->driver);
+ if (!pdrv || !pdrv->err_handler ||
+ !pdrv->err_handler->error_detected)
+ return 0;
+
+ cxl_err_handler = pdrv->err_handler;
+ vote = cxl_err_handler->error_detected(dev, &err_info);
+
+ *result = merge_result(*result, vote);
+
+ return 0;
+}
+
+static void cxl_do_recovery(struct pci_dev *pdev)
+{
+ struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus);
+ pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
+
+ cxl_walk_bridge(pdev, cxl_report_error_detected, &status);
+ if (status == PCI_ERS_RESULT_PANIC)
+ panic("CXL cachemem error.");
+
+ /*
+ * If we have native control of AER, clear error status in the device
+ * that detected the error. If the platform retained control of AER,
+ * it is responsible for clearing this status. In that case, the
+ * signaling device may not even be visible to the OS.
+ */
+ if (host->native_aer) {
+ pcie_clear_device_status(pdev);
+ pci_aer_clear_nonfatal_status(pdev);
+ pci_aer_clear_fatal_status(pdev);
+ }
+
+ pci_info(pdev, "CXL uncorrectable error.\n");
+}

static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
{
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index c32eab22c0b2..1354c7cfedeb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -886,7 +886,6 @@ void pci_no_aer(void);
void pci_aer_init(struct pci_dev *dev);
void pci_aer_exit(struct pci_dev *dev);
extern const struct attribute_group aer_stats_attr_group;
-void pci_aer_clear_fatal_status(struct pci_dev *dev);
int pci_aer_clear_status(struct pci_dev *dev);
int pci_aer_raw_clear_status(struct pci_dev *dev);
void pci_save_aer_state(struct pci_dev *dev);
@@ -895,7 +894,6 @@ void pci_restore_aer_state(struct pci_dev *dev);
static inline void pci_no_aer(void) { }
static inline void pci_aer_init(struct pci_dev *d) { }
static inline void pci_aer_exit(struct pci_dev *d) { }
-static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
static inline void pci_save_aer_state(struct pci_dev *dev) { }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 56015721be22..0aee5846b95c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -862,6 +862,9 @@ enum pci_ers_result {

/* No AER capabilities registered for the driver */
PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6,
+
+ /* System is unstable, panic */
+ PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7,
};

/* PCI bus error event callbacks */
@@ -1864,8 +1867,10 @@ static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; }

#ifdef CONFIG_PCIEAER
bool pci_aer_available(void);
+void pci_aer_clear_fatal_status(struct pci_dev *dev);
#else
static inline bool pci_aer_available(void) { return false; }
+void pci_aer_clear_fatal_status(struct pci_dev *dev) { };
#endif

bool pci_ats_disabled(void);
--
2.34.1