Re: [PATCH v17 10/11] PCI/CXL: Mask/Unmask CXL protocol errors

From: Bowman, Terry

Date: Mon May 11 2026 - 17:04:29 EST


On 5/6/2026 1:00 PM, Dave Jiang wrote:
>
>
> On 5/5/26 10:30 AM, Terry Bowman wrote:
>> CXL protocol errors are not enabled for all CXL devices after boot. They
>> must be enabled in order to process CXL protocol errors. Provide matching
>> teardown helpers so the masks are restored when a CXL Port or Downstream
>> Port goes away.
>>
>> Add pci_aer_mask_internal_errors() as the symmetric counterpart to
>> pci_aer_unmask_internal_errors() and export both for the cxl_core module.
>>
>> Introduce cxl_unmask_proto_interrupts() and cxl_mask_proto_interrupts()
>> in cxl_core to wrap the PCI helpers with the dev_is_pci() and
>> pcie_aer_is_native() gating CXL needs. Both helpers tolerate a NULL
>> @dev so teardown callers do not have to special-case it.
>>
>> Wire cxl_unmask_proto_interrupts() into the success path of
>> cxl_dport_map_ras() and devm_cxl_port_ras_setup() so the unmask only
>> runs when the RAS register block was actually mapped. Pair each unmask
>> with a devm_add_action_or_reset() registration of
>> cxl_mask_proto_interrupts() scoped to the cxl_port device. The mask is
>> then restored when the cxl_port device releases its devres. This
>> applies to Endpoints, Upstream Switch Ports, Downstream Switch Ports,
>> and Root Ports.
>>
>> Co-developed-by: Dan Williams <djbw@xxxxxxxxxx>
>> Signed-off-by: Dan Williams <djbw@xxxxxxxxxx>
>> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>
> Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx>
>
> I do wonder if we should save the original mask values and write those back rather than blindly remask everything when we are done.
>
>

Hi Dave,

This is only masking/unmasking the internal error bit. The other mask bits are not
modified.

- Terry


>>
>> ---
>>
>> Changes in v16->v17:
>> - Drop redundant cxl_mask_proto_interrupts() calls from unregister_port()
>> and cxl_dport_remove(); the devres action registered alongside the unmask
>> is the sole mask path.
>> - Update title
>> - Remove unnecessary check for aer_capabilities
>> - Gate cxl_unmask_proto_interrupts() on pcie_aer_is_native()
>> - Add pci_aer_mask_internal_errors() and cxl_mask_proto_interrupts()
>> - Only unmask on successful cxl_map_component_regs()
>> - NULL-check @dev in cxl_{un,}mask_proto_interrupts()
>> - Drop static and declare in core/core.h
>>
>> Change in v15 -> v16:
>> - None
>>
>> Change in v14 -> v15:
>> - None
>>
>> Changes in v13->v14:
>> - Update commit title's prefix (Bjorn)
>>
>> Changes in v12->v13:
>> - Add dev and dev_is_pci() NULL checks in cxl_unmask_proto_interrupts() (Terry)
>> - Add Dave Jiang's and Ben's review-by
>>
>> Changes in v11->v12:
>> - None
>> ---
>> drivers/cxl/core/core.h | 4 +++
>> drivers/cxl/core/ras.c | 63 ++++++++++++++++++++++++++++++++++++++---
>> drivers/pci/pcie/aer.c | 25 ++++++++++++++++
>> include/linux/aer.h | 2 ++
>> 4 files changed, 90 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
>> index 2c7387506dfb..ff39985d363f 100644
>> --- a/drivers/cxl/core/core.h
>> +++ b/drivers/cxl/core/core.h
>> @@ -190,6 +190,8 @@ void cxl_dport_map_rch_aer(struct cxl_dport *dport);
>> void cxl_disable_rch_root_ints(struct cxl_dport *dport);
>> void cxl_handle_rdport_errors(struct pci_dev *pdev);
>> void devm_cxl_dport_ras_setup(struct cxl_dport *dport);
>> +void cxl_unmask_proto_interrupts(struct device *dev);
>> +void cxl_mask_proto_interrupts(struct device *dev);
>> #else
>> static inline int cxl_ras_init(void)
>> {
>> @@ -207,6 +209,8 @@ static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
>> static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
>> static inline void cxl_handle_rdport_errors(struct pci_dev *pdev) { }
>> static inline void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { }
>> +static inline void cxl_unmask_proto_interrupts(struct device *dev) { }
>> +static inline void cxl_mask_proto_interrupts(struct device *dev) { }
>> #endif /* CONFIG_CXL_RAS */
>>
>> int cxl_gpf_port_setup(struct cxl_dport *dport);
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index a98ce0f412ad..b45e2b539b5f 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -66,16 +66,59 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>> }
>> static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>>
>> +void cxl_unmask_proto_interrupts(struct device *dev)
>> +{
>> + struct pci_dev *pdev;
>> +
>> + if (!dev || !dev_is_pci(dev))
>> + return;
>> +
>> + pdev = to_pci_dev(dev);
>> + if (!pcie_aer_is_native(pdev))
>> + return;
>> +
>> + pci_aer_unmask_internal_errors(pdev);
>> +}
>> +
>> +void cxl_mask_proto_interrupts(struct device *dev)
>> +{
>> + struct pci_dev *pdev;
>> +
>> + if (!dev || !dev_is_pci(dev))
>> + return;
>> +
>> + pdev = to_pci_dev(dev);
>> + if (!pcie_aer_is_native(pdev))
>> + return;
>> +
>> + pci_aer_mask_internal_errors(pdev);
>> +}
>> +
>> +static void cxl_mask_proto_irqs(void *dev)
>> +{
>> + cxl_mask_proto_interrupts(dev);
>> +}
>> +
>> static void cxl_dport_map_ras(struct cxl_dport *dport)
>> {
>> struct cxl_register_map *map = &dport->reg_map;
>> struct device *dev = dport->dport_dev;
>>
>> - if (!map->component_map.ras.valid)
>> + if (!map->component_map.ras.valid) {
>> dev_dbg(dev, "RAS registers not found\n");
>> - else if (cxl_map_component_regs(map, &dport->regs.component,
>> - BIT(CXL_CM_CAP_CAP_ID_RAS)))
>> + return;
>> + }
>> +
>> + if (cxl_map_component_regs(map, &dport->regs.component,
>> + BIT(CXL_CM_CAP_CAP_ID_RAS))) {
>> dev_dbg(dev, "Failed to map RAS capability.\n");
>> + return;
>> + }
>> +
>> + cxl_unmask_proto_interrupts(dev);
>> + if (devm_add_action_or_reset(dport_to_host(dport),
>> + cxl_mask_proto_irqs, dev))
>> + dev_warn(dev, "failed to register CXL proto-irq mask cleanup\n");
>> }
>>
>> /**
>> @@ -109,6 +152,7 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL");
>> void devm_cxl_port_ras_setup(struct cxl_port *port)
>> {
>> struct cxl_register_map *map = &port->reg_map;
>> + struct device *dev;
>>
>> if (!map->component_map.ras.valid) {
>> dev_dbg(&port->dev, "RAS registers not found\n");
>> @@ -117,8 +161,19 @@ void devm_cxl_port_ras_setup(struct cxl_port *port)
>>
>> map->host = &port->dev;
>> if (cxl_map_component_regs(map, &port->regs,
>> - BIT(CXL_CM_CAP_CAP_ID_RAS)))
>> + BIT(CXL_CM_CAP_CAP_ID_RAS))) {
>> dev_dbg(&port->dev, "Failed to map RAS capability\n");
>> + return;
>> + }
>> +
>> + dev = is_cxl_endpoint(port) ? port->uport_dev->parent : port->uport_dev;
>> + if (!dev_is_pci(dev))
>> + return;
>> +
>> + cxl_unmask_proto_interrupts(dev);
>> + if (devm_add_action_or_reset(&port->dev, cxl_mask_proto_irqs, dev))
>> + dev_warn(&port->dev,
>> + "Failed to register CXL proto-irq mask cleanup\n");
>> }
>> EXPORT_SYMBOL_NS_GPL(devm_cxl_port_ras_setup, "CXL");
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index b9c6c7b97217..eaa36fe0eb31 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -1151,6 +1151,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>> */
>> EXPORT_SYMBOL_FOR_MODULES(pci_aer_unmask_internal_errors, "cxl_core");
>>
>> +/**
>> + * pci_aer_mask_internal_errors - mask internal errors
>> + * @dev: pointer to the pci_dev data structure
>> + *
>> + * Mask internal errors in the Uncorrectable and Correctable Error
>> + * Mask registers.
>> + *
>> + * Note: AER must be enabled and supported by the device which must be
>> + * checked in advance, e.g. with pcie_aer_is_native().
>> + */
>> +void pci_aer_mask_internal_errors(struct pci_dev *dev)
>> +{
>> + int aer = dev->aer_cap;
>> + u32 mask;
>> +
>> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
>> + mask |= PCI_ERR_UNC_INTN;
>> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
>> +
>> + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
>> + mask |= PCI_ERR_COR_INTERNAL;
>> + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>> +}
>> +EXPORT_SYMBOL_FOR_MODULES(pci_aer_mask_internal_errors, "cxl_core");
>> +
>> /**
>> * pci_aer_handle_error - handle logging error into an event log
>> * @dev: pointer to pci_dev data structure of error source device
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 979ed2f9fd38..c52db62d4c7e 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -71,6 +71,7 @@ int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>> void pci_aer_clear_fatal_status(struct pci_dev *dev);
>> int pcie_aer_is_native(struct pci_dev *dev);
>> void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>> +void pci_aer_mask_internal_errors(struct pci_dev *dev);
>> #else
>> static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>> {
>> @@ -79,6 +80,7 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>> static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>> static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>> static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { }
>> +static inline void pci_aer_mask_internal_errors(struct pci_dev *dev) { }
>> #endif
>>
>> #ifdef CONFIG_CXL_RAS
>