[RFC PATCH 3/4] acpi: apei: Do not panic() in NMI because of GHES messages

From: Alexandru Gagniuc
Date: Tue Apr 03 2018 - 13:09:18 EST


BIOSes like to send NMIs for a number of silly reasons often deemed
to be "fatal". For example, pin bounce during a PCIe hotplug/unplug
might cause the link to go down and retrain, with fatal PCI errors
being generated while the link is retraining.

Instead of panic()ing in NMI context, pass fatal errors down to IRQ
context to see if they can be resolved.

With this change, PCIe errors are handled by AER. Other far less
common errors, such as machine check exceptions, still cause a panic()
in their respective handlers.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@xxxxxxxxx>
---
drivers/acpi/apei/ghes.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 2c998125b1d5..7243a99ea57e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -428,8 +428,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
* GHES_SEV_RECOVERABLE -> AER_NONFATAL
* GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
* These both need to be reported and recovered from by the AER driver.
- * GHES_SEV_PANIC does not make it to this handling since the kernel must
- * panic.
+ * GHES_SEV_PANIC -> AER_FATAL
*/
static bool ghes_handle_aer(struct acpi_hest_generic_data *gdata)
{
@@ -899,6 +898,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
struct ghes_estatus_node *estatus_node;
struct acpi_hest_generic *generic;
struct acpi_hest_generic_status *estatus;
+ int corrected_sev;
u32 len, node_len;

llnode = llist_del_all(&ghes_estatus_llist);
@@ -914,7 +914,14 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
len = cper_estatus_len(estatus);
node_len = GHES_ESTATUS_NODE_LEN(len);
- ghes_do_proc(estatus_node->ghes, estatus);
+ corrected_sev = ghes_do_proc(estatus_node->ghes, estatus);
+
+ if (corrected_sev >= GHES_SEV_PANIC) {
+ oops_begin();
+ ghes_print_queued_estatus();
+ __ghes_panic(estatus_node->ghes);
+ }
+
if (!ghes_estatus_cached(estatus)) {
generic = estatus_node->generic;
if (ghes_print_estatus(NULL, generic, estatus))
@@ -955,7 +962,7 @@ static void __process_error(struct ghes *ghes)
static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
struct ghes *ghes;
- int sev, ret = NMI_DONE;
+ int ret = NMI_DONE;

if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
return ret;
@@ -968,13 +975,6 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
ret = NMI_HANDLED;
}

- sev = ghes_severity(ghes->estatus->error_severity);
- if (sev >= GHES_SEV_PANIC) {
- oops_begin();
- ghes_print_queued_estatus();
- __ghes_panic(ghes);
- }
-
if (!(ghes->flags & GHES_TO_CLEAR))
continue;

--
2.14.3