Re: [PATCH v16 07/10] cxl: Update error handlers to support CXL Port devices
From: Bowman, Terry
Date: Wed Mar 11 2026 - 11:39:13 EST
On 3/9/2026 9:05 AM, Jonathan Cameron wrote:
> On Mon, 2 Mar 2026 14:36:45 -0600
> Terry Bowman <terry.bowman@xxxxxxx> wrote:
>
>> CXL Protocol trace logging is called for Endpoints in cxl_handle_ras() and
>> cxl_handle_cor_ras(). Trace logging support for CXL Port devices is missing.
>>
>> CXL Endpoint trace logging utilizes a separate trace routine than CXL Port
>> device handling. Using is_cxl_memdev(), determine if the device is a CXL EP
>> or one of the CXL Port devices.
>>
>> Update cxl_handle_ras() and cxl_handle_cor_ras() to call the CXL Port trace
>> logging function. Change cxl_handle_ras() return values to be pci_ers_result_t
>> type.
>
> Why this last bit?
>
You requested in previous review this should return a value more meaningful than bool.
I changed to return pci_ers_result_t.
https://lore.kernel.org/linux-cxl/20260205171346.00001e6b@xxxxxxxxxx/
>>
>> Check for invalid ras_base and add log messages if NULL.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>
> A few comments inline.
>
> Thanks,
>
> Jonathan
>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 48d3ef7cbb92..254144d19764 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -291,15 +291,22 @@ void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
>> void __iomem *addr;
>> u32 status;
>>
>> - if (!ras_base)
>> + if (!ras_base) {
>> + pr_err_ratelimited("%s: CXL RAS registers aren't mapped\n",
>> + dev_name(dev));
>
> This print isn't mentioned in the commit message. Probably needs some comment
> on why all paths that get here are error paths.
>
Good idea. I'll update with those details.
-Terry
>> return;
>> + }
>>
>> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>> status = readl(addr);
>> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK))
>> + return;
>> +
>> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>> + if (is_cxl_memdev(dev))
>> trace_cxl_aer_correctable_error(dev, status, serial);
>> - }
>> + else
>> + trace_cxl_port_aer_correctable_error(dev, status);
>> }
>>
>> /* CXL spec rev3.0 8.2.4.16.1 */
>> @@ -321,22 +328,26 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
>>
>> /*
>> * Log the state of the RAS status registers and prepare them to log the
>> - * next error status. Return 1 if reset needed.
>> + * next error status. Return PCI_ERS_RESULT_PANIC if reset needed.
>
> This seems odd as normally PANIC implies more than reset. I guess system reset,
> kind of...
>
>> */
>> -bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
>> +pci_ers_result_t
>> +cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
>> {
>> u32 hl[CXL_HEADERLOG_SIZE_U32];
>> void __iomem *addr;
>> u32 status;
>> u32 fe;
>>
>> - if (!ras_base)
>> - return false;
>> + if (!ras_base) {
>> + pr_err_ratelimited("%s: CXL RAS registers aren't mapped\n",
>> + dev_name(dev));
>> + return PCI_ERS_RESULT_NONE;
>> + }
>>
>> addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>> status = readl(addr);
>> if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>> - return false;
>> + return PCI_ERS_RESULT_NONE;
>>
>> /* If multiple errors, log header points to first error from ctrl reg */
>> if (hweight32(status) > 1) {
>> @@ -350,10 +361,13 @@ bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
>> }
>>
>> header_log_copy(ras_base, hl);
>> - trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
>> + if (is_cxl_memdev(dev))
>> + trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
>> + else
>> + trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl);
>> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>>
>> - return true;
>> + return PCI_ERS_RESULT_PANIC;
>> }
>>
>> void cxl_cor_error_detected(struct pci_dev *pdev)
>