Re: [PATCH v8 02/16] PCI/AER: Modify AER driver logging to report CXL or PCIe bus error type

From: Bowman, Terry
Date: Thu Mar 27 2025 - 13:15:40 EST




On 3/27/2025 11:48 AM, Bjorn Helgaas wrote:
> On Wed, Mar 26, 2025 at 08:47:03PM -0500, Terry Bowman wrote:
>> The AER service driver and aer_event tracing currently log 'PCIe Bus Type'
>> for all errors. Update the driver and aer_event tracing to log 'CXL Bus
>> Type' for CXL device errors.
>>
>> This requires the AER can identify and distinguish between PCIe errors and
>> CXL errors.
>>
>> Introduce boolean 'is_cxl' to 'struct aer_err_info'. Add assignment in
>> aer_get_device_error_info() and pci_print_aer().
>>
>> Update the aer_event trace routine to accept a bus type string parameter.
>> +++ b/drivers/pci/pci.h
>> @@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
>> struct aer_err_info {
>> struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
>> int error_dev_num;
>> + bool is_cxl;
>>
>> unsigned int id:16;
>>
>> @@ -549,6 +550,11 @@ struct aer_err_info {
>> struct pcie_tlp_log tlp; /* TLP Header */
>> };
>>
>> +static inline const char *aer_err_bus(struct aer_err_info *info)
>> +{
>> + return info->is_cxl ? "CXL" : "PCIe";
> I don't really see the point in adding struct aer_err_info.is_cxl.
> Every place where we call aer_err_bus() to look at it, we also have
> the struct pci_dev pointer, so we could just as easily use
> pcie_is_cxl() here.

Hi Bjorn,

My understanding is Dan wanted the decorated aer_err_info::is_cxl to capture
the type of error (CXL or PCIe) and cache it because it could change. That is,
the CXL device's alternate protocol could transition down before handling but
the CXL driver must keep knowledge of the original state for reporting. But, the
alternate protocol transition is not currently detected and used to update
pci_dev::is_cxl. pci_dev::is_cxl is currently only assigned during device creation
at the moment.

DanW, please add detail and/or correct me.

Regards,
Terry
>> +}
>> +
>> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
>> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 508474e17183..83f2069f111e 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev,
>>
>> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>> {
>> + const char *bus_type = aer_err_bus(info);
>> int layer, agent;
>> int id = pci_dev_id(dev);
>> const char *level;
>>
>> if (!info->status) {
>> - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
>> - aer_error_severity_string[info->severity]);
>> + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
>> + bus_type, aer_error_severity_string[info->severity]);
>> goto out;
>> }
>>
>> @@ -709,8 +710,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>>
>> level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
>>
>> - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
>> - aer_error_severity_string[info->severity],
>> + pci_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n",
>> + bus_type, aer_error_severity_string[info->severity],
>> aer_error_layer[layer], aer_agent_string[agent]);
>>
>> pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
>> @@ -725,7 +726,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>> if (info->id && info->error_dev_num > 1 && info->id == id)
>> pci_err(dev, " Error of this Agent is reported first\n");
>>
>> - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
>> + trace_aer_event(dev_name(&dev->dev), bus_type, (info->status & ~info->mask),
>> info->severity, info->tlp_header_valid, &info->tlp);
>> }
>>
>> @@ -759,6 +760,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>> void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> struct aer_capability_regs *aer)
>> {
>> + const char *bus_type;
>> int layer, agent, tlp_header_valid = 0;
>> u32 status, mask;
>> struct aer_err_info info;
>> @@ -780,6 +782,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> info.status = status;
>> info.mask = mask;
>> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>> + info.is_cxl = pcie_is_cxl(dev);
>> +
>> + bus_type = aer_err_bus(&info);
>>
>> pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>> __aer_print_error(dev, &info);
>> @@ -793,7 +798,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> if (tlp_header_valid)
>> pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
>>
>> - trace_aer_event(dev_name(&dev->dev), (status & ~mask),
>> + trace_aer_event(dev_name(&dev->dev), bus_type, (status & ~mask),
>> aer_severity, tlp_header_valid, &aer->header_log);
>> }
>> EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
>> @@ -1211,6 +1216,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
>> /* Must reset in this function */
>> info->status = 0;
>> info->tlp_header_valid = 0;
>> + info->is_cxl = pcie_is_cxl(dev);
>>
>> /* The device might not support AER */
>> if (!aer)
>> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
>> index e5f7ee0864e7..1bf8e7050ba8 100644
>> --- a/include/ras/ras_event.h
>> +++ b/include/ras/ras_event.h
>> @@ -297,15 +297,17 @@ TRACE_EVENT(non_standard_event,
>>
>> TRACE_EVENT(aer_event,
>> TP_PROTO(const char *dev_name,
>> + const char *bus_type,
>> const u32 status,
>> const u8 severity,
>> const u8 tlp_header_valid,
>> struct pcie_tlp_log *tlp),
>>
>> - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
>> + TP_ARGS(dev_name, bus_type, status, severity, tlp_header_valid, tlp),
>>
>> TP_STRUCT__entry(
>> __string( dev_name, dev_name )
>> + __string( bus_type, bus_type )
>> __field( u32, status )
>> __field( u8, severity )
>> __field( u8, tlp_header_valid)
>> @@ -314,6 +316,7 @@ TRACE_EVENT(aer_event,
>>
>> TP_fast_assign(
>> __assign_str(dev_name);
>> + __assign_str(bus_type);
>> __entry->status = status;
>> __entry->severity = severity;
>> __entry->tlp_header_valid = tlp_header_valid;
>> @@ -325,8 +328,8 @@ TRACE_EVENT(aer_event,
>> }
>> ),
>>
>> - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
>> - __get_str(dev_name),
>> + TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n",
>> + __get_str(dev_name), __get_str(bus_type),
>> __entry->severity == AER_CORRECTABLE ? "Corrected" :
>> __entry->severity == AER_FATAL ?
>> "Fatal" : "Uncorrected, non-fatal",
>> --
>> 2.34.1
>>