Re: Crash during resume of pcie bridge due to infinite loop in ACPICA
From: Rafael J. Wysocki
Date: Tue Dec 02 2025 - 14:00:18 EST
On Fri, Nov 28, 2025 at 9:47 PM Bert Karwatzki <spasswolf@xxxxxx> wrote:
>
> This is not an ACPICA problem after all:
>
> I did some more monitoring:
> https://gitlab.freedesktop.org/spasswolf/linux-stable/-/commits/amdgpu_suspend_resume?ref_type=heads
> and I still get a crash, but, perhaps due to the delays the printk()s caused, I actually get a helpful error message in netconsole:
>
> T5971;ACPI BIOS Error (bug): Could not resolve symbol [\x5cM013.VARR], AE_NOT_FOUND (20240827/psargs-332)
> T5971;acpi_ps_complete_op returned 0x5
> T5971;acpi_ps_parse_aml_debug: parse loop returned = 0x5
> T5971;ACPI Error: Aborting method \x5cM013 due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;ACPI Error: Aborting method \x5cM017 due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;ACPI Error: Aborting method \x5cM019 due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;ACPI Error: Aborting method \x5c_SB.PCI0.GPP0.M439 due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;ACPI Error: Aborting method \x5c_SB.PCI0.GPP0.M241 due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;ACPI Error: Aborting method \x5c_SB.PCI0.GPP0.M237._ON due to previous error (AE_NOT_FOUND) (20240827/psparse-935)
> T5971;acpi_ps_parse_aml_debug: after walk loop
> T5971;acpi_ps_execute_method_debug 331
> T5971;acpi_ns_evaluate_debug 475 METHOD
> T5971;acpi_evaluate_object_debug 255
> T5971;__acpi_power_on_debug 369
> T5971;acpi_power_on_unlocked_debug 442
> T5971;acpi_power_on_unlocked_debug 446
> T5971;acpi_power_on_debug 471
> T5971;acpi_power_on_list_debug 649: result = -19
> T5971;pcieport 0000:00:01.1: pci_pm_default_resume_early 568#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
> T5971;pcieport 0000:00:01.1: broken device, retraining non-functional downstream link at 2.5GT/s#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
> T5971;pcieport 0000:00:01.1: retraining failed#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
> T5971;pcieport 0000:00:01.1: Data Link Layer Link Active not set in 1000 msec#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:00:01.1
> T5971;pcieport 0000:01:00.0: Unable to change power state from D3cold to D0, device inaccessible#012 SUBSYSTEM=pci#012 DEVICE=+pci:0000:01:00.0
>
> This shows that there seems to be no problem with ACPICA: acpi_power_on_list(_debug)() returns -ENODEV,
> and the crash occurs later.
>
> This leaves two questions:
> 1. Is this crash avoidable by different error handling in the pci code?
> 2. If the crash is not avoidable, can we at least modify the error handling in such a way that
> we get an error message through netconsole by default? (perhaps a little delay will suffice)
I'm not sure how far this is going to get you, but you may try the
attached patch.
---
drivers/pci/pci-driver.c | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -555,11 +555,16 @@ static void pci_pm_default_resume(struct
pci_enable_wake(pci_dev, PCI_D0, false);
}
-static void pci_pm_default_resume_early(struct pci_dev *pci_dev)
+static int pci_pm_default_resume_early(struct pci_dev *pci_dev)
{
pci_pm_power_up_and_verify_state(pci_dev);
+ /* Bail out if the device is not accessible. */
+ if (pci_dev->current_state == PCI_D3cold)
+ return -ENODEV;
+
pci_restore_state(pci_dev);
pci_pme_restore(pci_dev);
+ return 0;
}
static void pci_pm_bridge_power_up_actions(struct pci_dev *pci_dev)
@@ -958,8 +963,11 @@ static int pci_pm_resume_noirq(struct de
* configuration here and attempting to put them into D0 again is
* pointless, so avoid doing that.
*/
- if (!(skip_bus_pm && pm_suspend_no_platform()))
- pci_pm_default_resume_early(pci_dev);
+ if (!(skip_bus_pm && pm_suspend_no_platform())) {
+ int error = pci_pm_default_resume_early(pci_dev);
+ if (error)
+ return error;
+ }
pci_fixup_device(pci_fixup_resume_early, pci_dev);
pcie_pme_root_status_cleanup(pci_dev);
@@ -1221,8 +1229,12 @@ static int pci_pm_restore_noirq(struct d
{
struct pci_dev *pci_dev = to_pci_dev(dev);
const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+ int error;
+
+ error = pci_pm_default_resume_early(pci_dev);
+ if (error)
+ return error;
- pci_pm_default_resume_early(pci_dev);
pci_fixup_device(pci_fixup_resume_early, pci_dev);
if (pci_has_legacy_pm_support(pci_dev))
@@ -1339,14 +1351,17 @@ static int pci_pm_runtime_resume(struct
struct pci_dev *pci_dev = to_pci_dev(dev);
const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
pci_power_t prev_state = pci_dev->current_state;
- int error = 0;
+ int error;
/*
* Restoring config space is necessary even if the device is not bound
* to a driver because although we left it in D0, it may have gone to
* D3cold when the bridge above it runtime suspended.
*/
- pci_pm_default_resume_early(pci_dev);
+ error = pci_pm_default_resume_early(pci_dev);
+ if (error)
+ return error;
+
pci_resume_ptm(pci_dev);
if (!pci_dev->driver)