RE: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
From: Shiju Jose
Date: Fri Jun 27 2025 - 08:22:57 EST
>-----Original Message-----
>From: Terry Bowman <terry.bowman@xxxxxxx>
>Sent: 26 June 2025 23:43
>To: dave@xxxxxxxxxxxx; Jonathan Cameron <jonathan.cameron@xxxxxxxxxx>;
>dave.jiang@xxxxxxxxx; alison.schofield@xxxxxxxxx; dan.j.williams@xxxxxxxxx;
>bhelgaas@xxxxxxxxxx; Shiju Jose <shiju.jose@xxxxxxxxxx>;
>ming.li@xxxxxxxxxxxx; Smita.KoralahalliChannabasappa@xxxxxxx;
>rrichter@xxxxxxx; dan.carpenter@xxxxxxxxxx;
>PradeepVineshReddy.Kodamati@xxxxxxx; lukas@xxxxxxxxx;
>Benjamin.Cheatham@xxxxxxx;
>sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx; terry.bowman@xxxxxxx;
>linux-cxl@xxxxxxxxxxxxxxx
>Cc: linux-kernel@xxxxxxxxxxxxxxx; linux-pci@xxxxxxxxxxxxxxx
>Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>and CXL Ports
>
>CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>errors. This is inconvenient for the user because they must enable
>2 sets of trace routines. Make updates to the trace logging such that a single
>trace routine logs both CXL Endpoint and CXL Port protocol errors.
>
>Keep the trace log fields 'memdev' and 'host'. While these are not accurate for
>non-Endpoints the fields will remain as-is to prevent breaking userspace RAS
>trace consumers.
>
>Add serial number parameter to the trace logging. This is used for EPs and 0 is
>provided for CXL port devices without a serial number.
>
>Below is output of correctable and uncorrectable protocol error logging.
>CXL Root Port and CXL Endpoint examples are included below.
>
>Root Port:
>cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>Error'
>
>Endpoint:
>cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status:
>'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>
>Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>---
> drivers/cxl/core/pci.c | 19 ++++-----
> drivers/cxl/core/ras.c | 14 ++++---
> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> 3 files changed, 37 insertions(+), 80 deletions(-)
>
[...]
>
> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>*data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>25ebfbc1616c..494d6db461a7 100644
>--- a/drivers/cxl/core/trace.h
>+++ b/drivers/cxl/core/trace.h
>@@ -48,49 +48,22 @@
> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> )
>
>-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>- TP_ARGS(dev, status, fe, hl),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- __field(u32, first_error)
>- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- __entry->first_error = fe;
>- /*
>- * Embed the 512B headerlog data for user app retrieval and
>- * parsing, but no need to print this in the trace buffer.
>- */
>- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>- ),
>- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>- __get_str(device), __get_str(host),
>- show_uc_errs(__entry->status),
>- show_uc_errs(__entry->first_error)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_uncorrectable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>*hl),
>- TP_ARGS(cxlmd, status, fe, hl),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>+ u32 *hl),
>+ TP_ARGS(dev, serial, status, fe, hl),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Hi Terry,
Thanks for considering the feedback given in v9 regarding the compatibility issue
with the rasdaemon.
https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@xxxxxxxxxx/
Probably some confusion w.r.t the feedback.
Unfortunately TP_printk(...) is not an ABI that we need to keep stable,
it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
> __field(u64, serial)
> __field(u32, status)
> __field(u32, first_error)
> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> __entry->first_error = fe;
> /*
>@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> */
> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>'%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("memdev=%s host=%s serial=%lld status='%s'
>first_error='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_uc_errs(__entry->status),
> show_uc_errs(__entry->first_error)
> )
>@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>} \
> )
>
>-TRACE_EVENT(cxl_port_aer_correctable_error,
>- TP_PROTO(struct device *dev, u32 status),
>- TP_ARGS(dev, status),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- ),
>- TP_printk("device=%s host=%s status='%s'",
>- __get_str(device), __get_str(host),
>- show_ce_errs(__entry->status)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_correctable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>- TP_ARGS(cxlmd, status),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status),
>+ TP_ARGS(dev, serial, status),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Same as above.
> __field(u64, serial)
> __field(u32, status)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("memdev=%s host=%s serial=%lld status='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_ce_errs(__entry->status)
> )
> );
>--
>2.34.1
Thanks,
Shiju