[PATCH v17 09/11] cxl: Update Endpoint AER uncorrectable handler

From: Terry Bowman

Date: Tue May 05 2026 - 13:35:13 EST


The CXL cxl_core driver now implements protocol RAS support. PCI
uncorrectable (UCE) protocol errors, however, continue to be reported via
the AER capability and must still be handled by a PCI error recovery callback.
UCE handling is required to provide direction for recovery.

Replace the existing cxl_error_detected() callback in cxl/pci.c with a new
cxl_pci_error_detected() implementation that handles uncorrectable AER PCI
protocol errors.

The handler decides solely based on the pci_channel_state_t parameter and
does not access PCIe AER capability registers from .error_detected, matching
the pattern used by other drivers including the NVMe and ixgbe drivers.
CXL.cachemem-corrupting protocol errors are routed separately through the
AER-CXL kfifo to cxl_handle_proto_error(), so cxl_pci does not need to
second-guess the AER core's classification.

claude-opus-4.7 was used for research on PCI error state transitions and
requirements.

Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>

---

Changes in v16->v17:
- Rename pci_error_handlers struct instance to cxl_pci_error_handlers to
avoid shadowing the struct type tag.
- Restore scoped_guard(device) and dev->driver check around AER read.
- NULL-check find_cxl_port_by_dev() before deref of port->uport_dev.
- Updated commit message. (Terry)
- Add scope cleanup for port variable in cxl_pci_error_detected() (Terry)
- Drop cxl_uncor_aer_present(), rely on AER state

Changes in v15->v16:
- Update commit message (DaveJ)
- s/cxl_handle_aer()/cxl_uncor_aer_present()/g (Jonathan)
- cxl_uncor_aer_present(): Leave original result calculation based on
if a UCE is present and the provided state (Terry)
- Add call to pci_print_aer(). AER fails to log because is upstream
link (Terry)

Changes in v14->v15:
- Update commit message and title. Added Bjorn's ack.
- Move CE and UCE handling logic here

Changes in v13->v14:
- Add Dave Jiang's review-by
- Update commit message & headline (Bjorn)
- Refactor cxl_port_error_detected()/cxl_port_cor_error_detected() to
one line (Jonathan)
- Remove cxl_walk_port() (Dan)
- Remove cxl_pci_drv_bound(). Check for 'is_cxl' parent port is
sufficient (Dan)
- Remove device_lock_if()
- Combined CE and UCE here (Terry)

Changes in v12->v13:
- Move get_pci_cxl_host_dev() and cxl_handle_proto_error() to Dequeue
patch (Terry)
- Remove EP case in cxl_get_ras_base(), not used. (Terry)
- Remove check for dport->dport_dev (Dave)
- Remove whitespace (Terry)

Changes in v11->v12:
- Add call to cxl_pci_drv_bound() in cxl_handle_proto_error() and
pci_to_cxl_dev()
- Change cxl_error_detected() -> cxl_cor_error_detected()
- Remove NULL variable assignments
- Replace bus_find_device() with find_cxl_port_by_uport() for upstream
port searches.

Changes in v10->v11:
- None
---
drivers/cxl/core/ras.c | 43 ++++++++++++++++--------------------------
drivers/cxl/cxlpci.h | 8 ++++----
drivers/cxl/pci.c | 6 +++---
3 files changed, 23 insertions(+), 34 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 5cc4087c2807..a98ce0f412ad 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -253,38 +253,27 @@ bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
return true;
}

-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
{
- struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct cxl_memdev *cxlmd = cxlds->cxlmd;
- struct device *dev = &cxlmd->dev;
- bool ue;
+ struct cxl_dport *dport;
+ struct cxl_port *port __free(put_cxl_port) =
+ find_cxl_port_by_dev(&pdev->dev, &dport);
+ struct cxl_memdev *cxlmd;
+ struct device *dev;

- scoped_guard(device, dev) {
- if (!dev->driver) {
- dev_warn(&pdev->dev,
- "%s: memdev disabled, abort error handling\n",
- dev_name(dev));
- return PCI_ERS_RESULT_DISCONNECT;
- }
+ if (!port)
+ return PCI_ERS_RESULT_DISCONNECT;

- /*
- * A frozen channel indicates an impending reset which is fatal to
- * CXL.mem operation, and will likely crash the system. On the off
- * chance the situation is recoverable dump the status of the RAS
- * capability registers and bounce the active state of the memdev.
- */
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, pci_get_dsn(pdev),
- cxlmd->endpoint->regs.ras);
- }
+ cxlmd = to_cxl_memdev(port->uport_dev);
+ dev = &cxlmd->dev;

switch (state) {
case pci_channel_io_normal:
- if (ue) {
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- }
+ /*
+ * Non-fatal CXL protocol errors are handled asynchronously
+ * by the AER-CXL kfifo worker (cxl_proto_err_work_fn).
+ */
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
dev_warn(&pdev->dev,
@@ -299,7 +288,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
}
return PCI_ERS_RESULT_NEED_RESET;
}
-EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_pci_error_detected, "CXL");

static void cxl_handle_proto_error(struct pci_dev *pdev, struct cxl_port *port,
struct cxl_dport *dport, int severity)
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 06c46adcf0f6..8aeb80a4e573 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -89,13 +89,13 @@ struct cxl_dev_state;
void read_cdat_data(struct cxl_port *port);

#ifdef CONFIG_CXL_RAS
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state);
+pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
void devm_cxl_port_ras_setup(struct cxl_port *port);
#else
-static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+static inline pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
{
return PCI_ERS_RESULT_NONE;
}
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 5eb64ced0de5..6459f94f8fa8 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1000,8 +1000,8 @@ static void cxl_reset_done(struct pci_dev *pdev)
}
}

-static const struct pci_error_handlers cxl_error_handlers = {
- .error_detected = cxl_error_detected,
+static const struct pci_error_handlers cxl_pci_error_handlers = {
+ .error_detected = cxl_pci_error_detected,
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
.reset_done = cxl_reset_done,
@@ -1011,7 +1011,7 @@ static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
- .err_handler = &cxl_error_handlers,
+ .err_handler = &cxl_pci_error_handlers,
.dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
--
2.34.1