Re: [PATCH v17 07/11] PCI/CXL: Add RCH support to CXL handlers
From: Dave Jiang
Date: Tue May 05 2026 - 20:00:33 EST
On 5/5/26 10:30 AM, Terry Bowman wrote:
> Restricted CXL Host (RCH) error handling is a separate path from the
> new CXL Port error handling flow. Fold RCH error handling into the
> Port flow so both share a common entry point.
>
> Update cxl_rch_handle_error_iter() to forward RCH protocol errors
> through the AER-CXL kfifo.
>
> Update cxl_handle_proto_error() to dispatch RCH errors via
> cxl_handle_rdport_errors(). cxl_handle_rdport_errors() handles both
> correctable and uncorrectable RCH protocol errors.
>
> Behavior change: an RCD uncorrectable CXL RAS error now panics via
> cxl_do_recovery(). Before this patch, the RCH path returned
> PCI_ERS_RESULT_NEED_RESET via cxl_pci's err_handler for the same
> condition. The new behavior matches the panic policy added in the
> common CXL Port protocol error flow: CXL.cachemem traffic cannot be
> safely recovered from an uncorrectable protocol error in software.
>
> Change cxl_handle_rdport_errors() to take a PCI device instead of a
> CXL device state, matching the new caller context. The error trace events
> emitted from this path now report device=<PCI BDF> instead of device=<memN>,
> matching the rest of the unified CXL trace events. Userspace consumers keyed
> off the memdev name need to map the PCI BDF back to a memdev.
>
> Include the RCD Endpoint serial number in RCH log messages so the RCH
> can be associated with its RCD.
>
> Remove the cxlds->rcd check from cxl_cor_error_detected() and
> cxl_error_detected(). RCH errors are now forwarded by
> cxl_rch_handle_error_iter() through the AER-CXL kfifo to
> cxl_handle_proto_error(), so cxl_pci's err_handler no longer sees
> them.
>
> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>
> ---
>
> Changes in v16->v17:
> - Drop now-dead cxlds->rcd branches from cxl_{cor_,}error_detected().
> - Drop duplicate subject line from commit body.
> - Document panic-on-uncorrectable behavior change for RCD path.
> - Document trace event device-name change (memN -> PCI BDF) for RCH path.
> - Rewrite cxl_handle_proto_error() RC_END comment to clarify RCD/RCH shared
> interrupt relationship.
> - Rewrite commit message.
>
> Changes in v16:
> - New commit
> ---
> drivers/cxl/core/core.h | 4 ++--
> drivers/cxl/core/ras.c | 14 +++++++++-----
> drivers/cxl/core/ras_rch.c | 8 +++-----
> drivers/pci/pcie/aer_cxl_rch.c | 17 +----------------
> 4 files changed, 15 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index bc36cd1575a4..2c7387506dfb 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -188,7 +188,7 @@ void cxl_handle_cor_ras(struct device *dev, u64 serial,
> void __iomem *ras_base);
> void cxl_dport_map_rch_aer(struct cxl_dport *dport);
> void cxl_disable_rch_root_ints(struct cxl_dport *dport);
> -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
> +void cxl_handle_rdport_errors(struct pci_dev *pdev);
> void devm_cxl_dport_ras_setup(struct cxl_dport *dport);
> #else
> static inline int cxl_ras_init(void)
> @@ -205,7 +205,7 @@ static inline void cxl_handle_cor_ras(struct device *dev, u64 serial,
> void __iomem *ras_base) { }
> static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
> static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
> -static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
> +static inline void cxl_handle_rdport_errors(struct pci_dev *pdev) { }
> static inline void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { }
> #endif /* CONFIG_CXL_RAS */
>
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 0a552d5a236e..1f1dd20623f6 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -267,9 +267,6 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
> return;
> }
>
> - if (cxlds->rcd)
> - cxl_handle_rdport_errors(cxlds);
> -
> cxl_handle_cor_ras(&cxlds->cxlmd->dev, pci_get_dsn(pdev),
> cxlmd->endpoint->regs.ras);
> }
> @@ -292,8 +289,6 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> return PCI_ERS_RESULT_DISCONNECT;
> }
>
> - if (cxlds->rcd)
> - cxl_handle_rdport_errors(cxlds);
> /*
> * A frozen channel indicates an impending reset which is fatal to
> * CXL.mem operation, and will likely crash the system. On the off
> @@ -329,6 +324,15 @@ EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
> static void cxl_handle_proto_error(struct pci_dev *pdev, struct cxl_port *port,
> struct cxl_dport *dport, int severity)
> {
> + /*
> + * An RC_END device is an RCD (Restricted CXL Device). Its AER
> + * interrupt is shared with the RCH Downstream Port, so handle RCH
> + * Downstream Port protocol errors first before processing the RCD's
> + * own errors. See CXL spec r3.1 s12.2.
> + */
> + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)
May as well use is_cxl_restricted(pdev) here instead of open-coding the pci_pcie_type() check.
DJ
> + cxl_handle_rdport_errors(pdev);
> +
> if (severity == AER_CORRECTABLE) {
> cxl_handle_cor_ras(&pdev->dev, pci_get_dsn(pdev),
> to_ras_base(port, dport));
> diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c
> index 61835fbafc0f..cbd02cabefbc 100644
> --- a/drivers/cxl/core/ras_rch.c
> +++ b/drivers/cxl/core/ras_rch.c
> @@ -1,7 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright(c) 2025 AMD Corporation. All rights reserved. */
>
> -#include <linux/types.h>
> #include <linux/aer.h>
> #include "cxl.h"
> #include "core.h"
> @@ -95,9 +94,8 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
> return false;
> }
>
> -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> +void cxl_handle_rdport_errors(struct pci_dev *pdev)
> {
> - struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> struct aer_capability_regs aer_regs;
> struct cxl_dport *dport;
> int severity;
> @@ -115,9 +113,9 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
>
> pci_print_aer(pdev, severity, &aer_regs);
> if (severity == AER_CORRECTABLE)
> - cxl_handle_cor_ras(&cxlds->cxlmd->dev, pci_get_dsn(pdev),
> + cxl_handle_cor_ras(&pdev->dev, pci_get_dsn(pdev),
> dport->regs.ras);
> else
> - cxl_handle_ras(&cxlds->cxlmd->dev, pci_get_dsn(pdev),
> + cxl_handle_ras(&pdev->dev, pci_get_dsn(pdev),
> dport->regs.ras);
> }
> diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c
> index e471eefec9c4..83142eac0cab 100644
> --- a/drivers/pci/pcie/aer_cxl_rch.c
> +++ b/drivers/pci/pcie/aer_cxl_rch.c
> @@ -37,26 +37,11 @@ static bool cxl_error_is_native(struct pci_dev *dev)
> static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> {
> struct aer_err_info *info = (struct aer_err_info *)data;
> - const struct pci_error_handlers *err_handler;
>
> if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
> return 0;
>
> - guard(device)(&dev->dev);
> -
> - err_handler = dev->driver ? dev->driver->err_handler : NULL;
> - if (!err_handler)
> - return 0;
> -
> - if (info->severity == AER_CORRECTABLE) {
> - if (err_handler->cor_error_detected)
> - err_handler->cor_error_detected(dev);
> - } else if (err_handler->error_detected) {
> - if (info->severity == AER_NONFATAL)
> - err_handler->error_detected(dev, pci_channel_io_normal);
> - else if (info->severity == AER_FATAL)
> - err_handler->error_detected(dev, pci_channel_io_frozen);
> - }
> + cxl_forward_error(dev, info);
> return 0;
> }
>