Re: [PATCH v16 07/10] cxl: Update error handlers to support CXL Port devices

From: Bowman, Terry

Date: Mon Mar 30 2026 - 12:35:49 EST


On 3/29/2026 8:07 PM, Dan Williams wrote:
> Terry Bowman wrote:
>> CXL Protocol trace logging is called for Endpoints in cxl_handle_ras() and
>> cxl_handle_cor_ras(). Trace logging support for CXL Port devices is missing.
>>
>> CXL Endpoint trace logging utilizes a separate trace routine than CXL Port
>> device handling. Using is_cxl_memdev(), determine if the device is a CXL EP
>> or one of the CXL Port devices.
>>
>> Update cxl_handle_ras() and cxl_handle_cor_ras() to call the CXL Port trace
>> logging function. Change cxl_handle_ras() return values to be pci_ers_result_t
>> type.
>>
>> Check for invalid ras_base and add log messages if NULL.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>>
>> ---
>>
>> Changes in v15 -> v16:
>> - New commit
>> ---
>> drivers/cxl/core/core.h | 10 ++++++----
>> drivers/cxl/core/ras.c | 36 +++++++++++++++++++++++++-----------
>> 2 files changed, 31 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
>> index 76d2593e68c6..984cc37be186 100644
>> --- a/drivers/cxl/core/core.h
>> +++ b/drivers/cxl/core/core.h
>> @@ -6,6 +6,7 @@
>>
>> #include <cxl/mailbox.h>
>> #include <linux/rwsem.h>
>> +#include <linux/pci.h>
>>
>> extern const struct device_type cxl_nvdimm_bridge_type;
>> extern const struct device_type cxl_nvdimm_type;
>> @@ -181,7 +182,8 @@ static inline struct device *dport_to_host(struct cxl_dport *dport)
>> #ifdef CONFIG_CXL_RAS
>> int cxl_ras_init(void);
>> void cxl_ras_exit(void);
>> -bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base);
>> +pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial,
>> + void __iomem *ras_base);
>
> This change is unrelated to the conversion to call different tracepoint
> handlers.
>
>> void cxl_handle_cor_ras(struct device *dev, u64 serial,
>> void __iomem *ras_base);
>> void cxl_dport_map_rch_aer(struct cxl_dport *dport);
>> @@ -195,10 +197,10 @@ static inline int cxl_ras_init(void)
>> return 0;
>> }
>> static inline void cxl_ras_exit(void) { }
>> -static inline bool cxl_handle_ras(struct device *dev, u64 serial,
>> - void __iomem *ras_base)
>> +static inline pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial,
>> + void __iomem *ras_base)
>> {
>> - return false;
>> + return PCI_ERS_RESULT_NONE;
>> }
>> static inline void cxl_handle_cor_ras(struct device *dev, u64 serial,
>> void __iomem *ras_base) { }
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 48d3ef7cbb92..254144d19764 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -291,15 +291,22 @@ void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
>> void __iomem *addr;
>> u32 status;
>>
>> - if (!ras_base)
>> + if (!ras_base) {
>> + pr_err_ratelimited("%s: CXL RAS registers aren't mapped\n",
>> + dev_name(dev));
>
> What does this new error print have to do with trace logging?
>


The error print will be logged when the trace is absent. This is an indication
to the user why the expected trace is missing and will help in debugging where
there is no other details otherwise.


>> return;
>> + }
>>
>> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>> status = readl(addr);
>> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK))
>> + return;
>
> No need for this thrash if the tracepoint is kept unified... more below.
>
>> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>> + if (is_cxl_memdev(dev))
>
> Wait, the whole point of patch 3 was to make this tracepoint generic.
>
> The format string in tracepoints are not supposed to be ABI. Incremental
> fixup below. Now, there is a difference between "not supposed to be ABI"
> and "whoops someone had a dependency". We can always come back and
> restore the old format string if someone screams. It would also be nice
> to delete trace_cxl_port_aer_correctable_error(). That one might be more
> dicey from an ABI regression standpoint, but worth it if you only need
> one tracepoint for all CXL protocol errors regardless of source.
>

Would you like that logic moved to the trace routine? The trace routine would
require additional changes to determine if the device is a PCI device. This is
required inorder to format the displayed strings. The Endpoint displays 'memdev'
which doesn't apply to non-Endpoint devices.

Terry

> ---
> drivers/cxl/core/trace.h | 14 +++++++-------
> drivers/cxl/core/ras.c | 25 ++++++-------------------
> 2 files changed, 13 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
> index 5f630543b720..70c9e65bfa08 100644
> --- a/drivers/cxl/core/trace.h
> +++ b/drivers/cxl/core/trace.h
> @@ -145,22 +145,22 @@ TRACE_EVENT(cxl_port_aer_correctable_error,
> );
>
> TRACE_EVENT(cxl_aer_correctable_error,
> - TP_PROTO(const struct device *cxlmd, u32 status, u64 serial),
> - TP_ARGS(cxlmd, status, serial),
> + TP_PROTO(const struct device *dev, u32 status, u64 serial),
> + TP_ARGS(dev, status, serial),
> TP_STRUCT__entry(
> - __string(memdev, dev_name(cxlmd))
> - __string(host, dev_name(cxlmd->parent))
> + __string(dev, dev_name(dev))
> + __string(host, dev_name(dev->parent))
> __field(u64, serial)
> __field(u32, status)
> ),
> TP_fast_assign(
> - __assign_str(memdev);
> + __assign_str(dev);
> __assign_str(host);
> __entry->serial = serial;
> __entry->status = status;
> ),
> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
> - __get_str(memdev), __get_str(host), __entry->serial,
> + TP_printk("device=%s host=%s serial=%lld: status: '%s'",
> + __get_str(dev), __get_str(host), __entry->serial,
> show_ce_errs(__entry->status)
> )
> );
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 254144d19764..7a1319ce0f84 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -291,22 +291,15 @@ void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
> void __iomem *addr;
> u32 status;
>
> - if (!ras_base) {
> - pr_err_ratelimited("%s: CXL RAS registers aren't mapped\n",
> - dev_name(dev));
> + if (!ras_base)
> return;
> - }
>
> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
> status = readl(addr);
> - if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK))
> - return;
> -
> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
> - if (is_cxl_memdev(dev))
> + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
> trace_cxl_aer_correctable_error(dev, status, serial);
> - else
> - trace_cxl_port_aer_correctable_error(dev, status);
> + }
> }
>
> /* CXL spec rev3.0 8.2.4.16.1 */
> @@ -338,11 +331,8 @@ cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
> u32 status;
> u32 fe;
>
> - if (!ras_base) {
> - pr_err_ratelimited("%s: CXL RAS registers aren't mapped\n",
> - dev_name(dev));
> + if (!ras_base)
> return PCI_ERS_RESULT_NONE;
> - }
>
> addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
> status = readl(addr);
> @@ -361,10 +351,7 @@ cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
> }
>
> header_log_copy(ras_base, hl);
> - if (is_cxl_memdev(dev))
> - trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
> - else
> - trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl);
> + trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>
> return PCI_ERS_RESULT_PANIC;