[PATCH 5/5] cxl/pci: Log CXL device's PCIe AER and CXL RAS error information

From: Terry Bowman
Date: Fri Oct 21 2022 - 14:57:39 EST


The CXL downport PCIe AER and CXL RAS capability information needs to be
logged during PCIe AER error handling.

The existing PCIe AER error handler logs native AER errors but does not
log the upport/downport AER capability residing in the RCRB. The CXL 1.1
RCRB does not have a BDF and is not enumerable. The existing error handler
logic does not display CXL RAS details either.

Add a CXL error handler to the existing PCI error handlers. Add a call
to the CXL error handler within the PCIe AER error handler. Implement the
driver's CXL callback to log downport PCIe AER and CXL RAS capability
information.

Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
---
drivers/cxl/pci.c | 76 ++++++++++++++++++++++++++++++++++++++++++
drivers/pci/pcie/aer.c | 45 ++++++++++++++++++++++++-
include/linux/pci.h | 4 +++
3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 80a01b304efe..dceda9f9fc60 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -7,6 +7,7 @@
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/aer.h>
#include <linux/pci.h>
#include <linux/pci-doe.h>
#include <linux/io.h>
@@ -14,6 +15,9 @@
#include "cxlpci.h"
#include "cxl.h"

+extern void cxl_print_aer(struct pci_dev *dev, int aer_severity,
+ struct aer_capability_regs *aer); /* NOTE(review): defined and exported in drivers/pci/pcie/aer.c; an extern prototype in a .c file presumably belongs in a shared header - confirm */
+
/**
* DOC: cxl pci
*
@@ -744,9 +748,80 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);

+ dev_set_drvdata(&pdev->dev, cxlmd);
+
return rc;
}

+struct ras_cap { /* register overlay; cxl_error_report() reads each field with readl() */
+ u32 uc_error_status; /* presumably uncorrectable error status - confirm against the CXL spec */
+ u32 uc_error_mask;
+ u32 uc_error_severity;
+ u32 c_error_status; /* presumably correctable error status - confirm against the CXL spec */
+ u32 c_error_mask;
+ u32 ctrl;
+ u32 log[]; /* flexible array; cxl_error_report() prints only its first dword */
+};
+
+/*
+ * cxl_error_report() - Log the CXL downport AER and RAS capability registers.
+ * @cxlmd: memory device whose device state (cxlmd->cxlds) holds the AER and
+ *         RAS register mappings. The caller must ensure cxlmd->cxlds is set.
+ *
+ * A capability whose mapping base is NULL is skipped; its section header is
+ * still printed.
+ */
+static void cxl_error_report(struct cxl_memdev *cxlmd)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlmd->cxlds->dev);
+	void __iomem *aer_base = (void __iomem *)cxlmd->cxlds->aer_map.base;
+	struct ras_cap __iomem *ras_cap =
+		(struct ras_cap __iomem *)cxlmd->cxlds->ras_map.base;
+	struct aer_capability_regs aer_regs;
+	u32 *aer_buf = (u32 *)&aer_regs;
+	unsigned int i;
+
+	pci_err(pdev, "CXL Error Report\n");
+	pci_err(pdev, "AER Errors:\n");
+	if (aer_base) {
+		/*
+		 * cxl_print_aer() takes a plain (non-__iomem) struct pointer,
+		 * so hand it a snapshot taken with 32-bit MMIO reads rather
+		 * than the mapping itself.
+		 */
+		for (i = 0; i < sizeof(aer_regs) / sizeof(u32); i++)
+			aer_buf[i] = readl(aer_base + i * sizeof(u32));
+
+		/* Same snapshot, decoded once per severity class. */
+		cxl_print_aer(pdev, AER_CORRECTABLE, &aer_regs);
+		cxl_print_aer(pdev, AER_NONFATAL, &aer_regs);
+		cxl_print_aer(pdev, AER_FATAL, &aer_regs);
+	}
+
+	pci_err(pdev, "RAS Errors:\n");
+	if (ras_cap) {
+		pci_err(pdev, "RAS: uc_error_status = %X\n", readl(&ras_cap->uc_error_status));
+		pci_err(pdev, "RAS: uc_error_mask = %X\n", readl(&ras_cap->uc_error_mask));
+		pci_err(pdev, "RAS: uc_error_severity = %X\n", readl(&ras_cap->uc_error_severity));
+		pci_err(pdev, "RAS: c_error_status = %X\n", readl(&ras_cap->c_error_status));
+		pci_err(pdev, "RAS: c_error_mask = %X\n", readl(&ras_cap->c_error_mask));
+		pci_err(pdev, "RAS: ras_caps->ctrl = %X\n", readl(&ras_cap->ctrl));
+		/* Only the first dword of the log is reported. */
+		pci_err(pdev, "RAS: log = %X\n", readl(&ras_cap->log[0]));
+	}
+}
+
+/*
+ * cxl_error_detected() - CXL error callback for the cxl_pci driver.
+ * @pdev: PCI device the error is being reported for.
+ *
+ * Validates the device and its driver data, then logs the AER and RAS
+ * state through cxl_error_report(). Each failed check logs a message and
+ * returns without reporting.
+ *
+ * NOTE(review): is_cxl_memdev() is applied to the PCI device's struct
+ * device rather than to the memdev stored in drvdata - confirm this check
+ * can succeed for a PCI device.
+ */
+static void cxl_error_detected(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct cxl_memdev *memdev;
+
+	if (!is_cxl_memdev(dev)) {
+		pci_err(pdev, "CXL memory device is invalid\n");
+		return;
+	}
+
+	memdev = dev_get_drvdata(dev);
+	if (memdev == NULL) {
+		pci_err(pdev, "CXL memory device is NULL\n");
+		return;
+	}
+
+	if (memdev->cxlds == NULL) {
+		pci_err(pdev, "CXL device state object is NULL\n");
+		return;
+	}
+
+	cxl_error_report(memdev);
+}
+
+/*
+ * Error callbacks for the cxl_pci driver. Only the CXL-specific callback is
+ * provided. The table is never modified, so keep it const.
+ */
+static const struct pci_error_handlers cxl_error_handlers = {
+	.cxl_error_detected = cxl_error_detected,
+};
+
static const struct pci_device_id cxl_mem_pci_tbl[] = {
/* PCI class code for CXL.mem Type-3 Devices */
{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
@@ -761,6 +836,7 @@ static struct pci_driver cxl_pci_driver = {
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
+ .err_handler = &cxl_error_handlers,
};

MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e2d8a74f83c3..dea04d412406 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -811,6 +811,13 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
}
#endif

+/**
+ * cxl_print_aer - print AER capability registers on behalf of a CXL driver
+ * @dev: device the registers are reported against
+ * @aer_severity: AER severity class to decode (e.g. AER_CORRECTABLE)
+ * @aer: snapshot of the AER capability registers
+ *
+ * Thin exported wrapper around cper_print_aer(). That function is defined
+ * inside a conditionally compiled section of this file, so the call is
+ * compiled out when the option is disabled instead of leaving an undefined
+ * reference.
+ *
+ * NOTE(review): the guarding option is assumed to be
+ * CONFIG_ACPI_APEI_PCIEAER - confirm against the #ifdef above
+ * cper_print_aer().
+ */
+void cxl_print_aer(struct pci_dev *dev, int aer_severity,
+		   struct aer_capability_regs *aer)
+{
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))
+		return;
+
+	cper_print_aer(dev, aer_severity, aer);
+}
+EXPORT_SYMBOL_GPL(cxl_print_aer);
+
/**
* add_error_device - list device to be handled
* @e_info: pointer to error info
@@ -1169,6 +1176,40 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
}
}

+/*
+ * report_cxl_errors_iter() - Invoke a device driver's CXL error callback.
+ * @pdev: device currently visited by the bus/RCEC walk
+ * @data: walk cookie, unused
+ *
+ * Devices without a bound driver, without error handlers, or without a
+ * cxl_error_detected callback are skipped.
+ *
+ * Return: always 0.
+ */
+static int report_cxl_errors_iter(struct pci_dev *pdev, void *data)
+{
+	struct pci_driver *pdrv;
+
+	/* Hold the device lock so the driver cannot unbind under the callback. */
+	device_lock(&pdev->dev);
+	pdrv = pdev->driver;
+	if (pdrv && pdrv->err_handler && pdrv->err_handler->cxl_error_detected)
+		pdrv->err_handler->cxl_error_detected(pdev);
+	device_unlock(&pdev->dev);
+
+	return 0;
+}
+
+/*
+ * report_cxl_errors() - Give CXL drivers below a root port a chance to log.
+ * @rpc: AER root port context; rpc->rpd is the reporting device
+ * @e_src: error source being processed (currently unused)
+ *
+ * If the reporting device has any AER uncorrectable or correctable status
+ * bit set, call the cxl_error_detected callback of every device associated
+ * with it, then write the observed status bits back.
+ *
+ * NOTE(review): aer_isr() calls this before aer_isr_one_error() for the same
+ * source, so the status write-back below happens before the standard handler
+ * runs - confirm the standard handler does not need these bits.
+ */
+static void report_cxl_errors(struct aer_rpc *rpc,
+			      struct aer_err_source *e_src)
+{
+	struct pci_dev *pdev = rpc->rpd;
+	int aer = pdev->aer_cap;
+	u32 uncor_status, cor_status;
+
+	/* Without an AER capability the offsets below would hit other registers. */
+	if (!aer)
+		return;
+
+	/* Do not act on (or write back) values from a failed config read. */
+	if (pci_read_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, &uncor_status) ||
+	    pci_read_config_dword(pdev, aer + PCI_ERR_COR_STATUS, &cor_status))
+		return;
+
+	if (!uncor_status && !cor_status)
+		return;
+
+	/* The iterator ignores its data argument, so no cookie is passed. */
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(pdev, report_cxl_errors_iter, NULL);
+	else if (pdev->subordinate)
+		pci_walk_bus(pdev->subordinate, report_cxl_errors_iter, NULL);
+
+	/* Write back exactly the bits that were observed above. */
+	pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, uncor_status);
+	pci_write_config_dword(pdev, aer + PCI_ERR_COR_STATUS, cor_status);
+}
+
/**
* aer_isr - consume errors detected by root port
* @irq: IRQ assigned to Root Port
@@ -1185,8 +1226,10 @@ static irqreturn_t aer_isr(int irq, void *context)
if (kfifo_is_empty(&rpc->aer_fifo))
return IRQ_NONE;

- while (kfifo_get(&rpc->aer_fifo, &e_src))
+ while (kfifo_get(&rpc->aer_fifo, &e_src)) {
+ report_cxl_errors(rpc, &e_src);
aer_isr_one_error(rpc, &e_src);
+ }
return IRQ_HANDLED;
}

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2bda4a4e47e8..4f4b3a8f5454 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -827,6 +827,10 @@ enum pci_ers_result {

/* PCI bus error event callbacks */
struct pci_error_handlers {
+
+ /* CXL error detected on this device */
+ void (*cxl_error_detected)(struct pci_dev *dev);
+
/* PCI bus error detected on this device */
pci_ers_result_t (*error_detected)(struct pci_dev *dev,
pci_channel_state_t error);
--
2.34.1