From 5fc7b1a9e0f0bcfa14068c6358019ed1e3ffc6c6 Mon Sep 17 00:00:00 2001 From: "dio.sun" Date: Wed, 26 Feb 2025 08:54:49 +0000 Subject: [PATCH] AER: PCIE CTO recovery handle fix - A non-fatal PCIe CTO is reported to the PCIe RC and is converted to AdvNonFatalErr automatically - according to PCIe spec 6.2.3.2.4.4, Requester with Completion Timeout (If the severity of the CTO is non-fatal, and the Requester elects to attempt recovery by issuing a new request, the Requester must first handle the current error case as an Advisory Non-Fatal Error.). - The current kernel code does nothing when receiving an AdvNonFatalErr (Correctable Error), so the device driver has no chance to handle this error. - In this situation, the system sometimes hangs when more AdvNonFatalErr errors arrive. Signed-off-by: dio.sun --- drivers/pci/pcie/aer.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 508474e17183..5ddc990c6f42 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1154,7 +1154,21 @@ static void aer_recover_work_func(struct work_struct *work) ghes_estatus_pool_region_free((unsigned long)entry.regs, sizeof(struct aer_capability_regs)); - if (entry.severity == AER_NONFATAL) + if (entry.severity == AER_CORRECTABLE) { + if (entry.regs->cor_status & PCI_ERR_COR_ADV_NFAT) { + pci_err(pdev, "%04x:%02x:%02x:%x advisory non-fatal error\n", + entry.domain, entry.bus, PCI_SLOT(entry.devfn), + PCI_FUNC(entry.devfn)); + if (entry.regs->uncor_status & PCI_ERR_UNC_COMP_TIME) { + pci_err(pdev, "%04x:%02x:%02x:%x completion timeout\n", + entry.domain, entry.bus, + PCI_SLOT(entry.devfn), + PCI_FUNC(entry.devfn)); + pcie_do_recovery(pdev, pci_channel_io_frozen, + aer_root_reset); + } + } + } else if (entry.severity == AER_NONFATAL) pcie_do_recovery(pdev, pci_channel_io_normal, aer_root_reset); else if (entry.severity == AER_FATAL) -- 2.37.3