[PATCH] i40e: Disable i40e PCIe AER on system reboot

From: Yue Zhao
Date: Thu Dec 26 2024 - 22:55:25 EST


Disable PCIe AER on the i40e device on system reboot on a limited
list of Dell PowerEdge systems. This prevents a fatal PCIe AER event
on the i40e device during the ACPI _PTS (prepare to sleep) method for
S5 on those systems. The _PTS is invoked by acpi_enter_sleep_state_prep()
as part of the kernel's reboot sequence as a result of commit
38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot").

We first noticed this abnormal reboot issue on the tg3 device, and there
is a similar patch that disables PCIe AER to fix the hardware error during
reboot. The hardware error on the tg3 device went away after we applied
the patch below.

https://lore.kernel.org/lkml/20241129203640.54492-1-lszubowi@xxxxxxxxxx/T/

So disable PCIe AER on the i40e device in a similar way.

hardware crash dmesg log:

ACPI: PM: Preparing to enter system sleep state S5
{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 5
{1}[Hardware Error]: event severity: fatal
{1}[Hardware Error]: Error 0, type: fatal
{1}[Hardware Error]: section_type: PCIe error
{1}[Hardware Error]: port_type: 0, PCIe end point
{1}[Hardware Error]: version: 3.0
{1}[Hardware Error]: command: 0x0006, status: 0x0010
{1}[Hardware Error]: device_id: 0000:05:00.1
{1}[Hardware Error]: slot: 0
{1}[Hardware Error]: secondary_bus: 0x00
{1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x1572
{1}[Hardware Error]: class_code: 020000
{1}[Hardware Error]: aer_uncor_status: 0x00100000, aer_uncor_mask: 0x00018000
{1}[Hardware Error]: aer_uncor_severity: 0x000ef030
{1}[Hardware Error]: TLP Header: 40000001 0000000f 90028090 00000000
Kernel panic - not syncing: Fatal hardware error!
Hardware name: Dell Inc. PowerEdge C4140/08Y2GR, BIOS 2.21.1 12/12/2023
Call Trace:
<NMI>
dump_stack_lvl+0x48/0x70
dump_stack+0x10/0x20
panic+0x1b4/0x3a0
__ghes_panic+0x6c/0x70
ghes_in_nmi_queue_one_entry.constprop.0+0x1ee/0x2c0
ghes_notify_nmi+0x5e/0xe0
nmi_handle+0x62/0x160
default_do_nmi+0x4c/0x150
exc_nmi+0x140/0x1f0
end_repeat_nmi+0x16/0x67
RIP: 0010:intel_idle_irq+0x70/0xf0
</NMI>
<TASK>
cpuidle_enter_state+0x91/0x6f0
cpuidle_enter+0x2e/0x50
call_cpuidle+0x23/0x60
cpuidle_idle_call+0x11d/0x190
do_idle+0x82/0xf0
cpu_startup_entry+0x2a/0x30
rest_init+0xc2/0xf0
arch_call_rest_init+0xe/0x30
start_kernel+0x34f/0x440
x86_64_start_reservations+0x18/0x30
x86_64_start_kernel+0xbf/0x110
secondary_startup_64_no_verify+0x18f/0x19b
</TASK>

Fixes: 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot")
Signed-off-by: Yue Zhao <yue.zhao@xxxxxxxxxx>
---
drivers/net/ethernet/intel/i40e/i40e_main.c | 64 +++++++++++++++++++++
1 file changed, 64 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0e1d9e2fbf38..80e66e4e90f7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <net/pkt_cls.h>
#include <net/xdp_sock_drv.h>
+#include <linux/dmi.h>

/* Local includes */
#include "i40e.h"
@@ -16608,6 +16609,56 @@ static void i40e_pci_error_resume(struct pci_dev *pdev)
i40e_io_resume(pf);
}

+/* DMI quirk list: systems where the ACPI _PTS (Prepare To Sleep) method
+ * for S5 results in a fatal PCIe AER event on the i40e device if the device
+ * is not, or cannot be, powered down. Terminated by an empty entry.
+ */
+static const struct dmi_system_id i40e_restart_aer_quirk_table[] = {
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge C4140"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R540"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R640"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R650"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R740"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R750"),
+ },
+ },
+ {}
+};
+
/**
* i40e_shutdown - PCI callback for shutting down
* @pdev: PCI device information struct
@@ -16654,6 +16705,19 @@ static void i40e_shutdown(struct pci_dev *pdev)
i40e_clear_interrupt_scheme(pf);
rtnl_unlock();

+ if (system_state == SYSTEM_RESTART &&
+ dmi_first_match(i40e_restart_aer_quirk_table) &&
+ pdev->current_state <= PCI_D3hot) {
+ /* Disable PCIe AER on the i40e to avoid a fatal
+ * error during this system restart.
+ */
+ pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL,
+ PCI_EXP_DEVCTL_CERE |
+ PCI_EXP_DEVCTL_NFERE |
+ PCI_EXP_DEVCTL_FERE |
+ PCI_EXP_DEVCTL_URRE);
+ }
+
if (system_state == SYSTEM_POWER_OFF) {
pci_wake_from_d3(pdev, pf->wol_en);
pci_set_power_state(pdev, PCI_D3hot);
--
2.39.5 (Apple Git-154)