[PATCH v8] x86: mce: kexec: switch MCE handler for kexec/kdump

From: Naoya Horiguchi
Date: Tue Apr 07 2015 - 04:03:42 EST


kexec disables (or "shoots down") all CPUs other than a crashing CPU before
entering the 2nd kernel. But the MCE handler is still enabled after that,
so if MCE happens and broadcasts over the CPUs after the main thread starts
the 2nd kernel (which might not initialize its MCE handler yet, or might decide
not to enable it,) MCE handler runs only on the other CPUs (not on the main
thread,) leading to kernel panic with MCE synchronization. The user-visible
effect of this bug is kdump failure.

Our standard MCE handler do_machine_check() assumes some about system's
status and it's hard to alter it to cover kexec/kdump context, so let's add
another kdump-specific one and switch to it.

Note that this problem exists since current MCE handler was implemented in
2.6.32, and recently commit 716079f66eac ("mce: Panic when a core has reached
a timeout") made it more visible by changing the default behavior of the
synchronization timeout from "ignore" to "panic".

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
ChangeLog: v7 -> v8
- rebase onto tip/master (resolve conflict with mce_severity rework)
- fix printk messages and comments
- s/switch_mce_handler_for_kdump/kdump_setup_mce/g
- print out msg in non-fatal MCE case too

ChangeLog: v6 -> v7
- use rdmsrl() instead of rdmsrl_safe() with mca_cfg.disabled check

ChangeLog v5 -> v6:
- drop "CC stable" tag
- stop using/exporting mce_gather_info(), mce_(rd|wr)msrl(), and mce_panic()
- drop quirk_no_way_out() part, because quirk_sandybridge_ifu() (only possible
callback) could just change a MCE_PANIC_SEVERITY case to a MCE_AR_SEVERITY
case, which doesn't affect the panic/return decision.

ChangeLog v4 -> v5:
- drop MCE_UC/AR_SEVERITY re-ordering
- move most of code to arch/x86/kernel/crash.c
- export some MCE internal variables/routines via arch/x86/include/asm/mce.h

ChangeLog v3 -> v4:
- fixed AR and UC order in enum severity_level because UC is severer than AR
by definition. Current code is not affected by this wrong order by chance.
- check severity in machine_check_under_kdump(), and call mce_panic() if the
resultant severity is as bad as or worse than MCE_AR_SEVERITY.
- use static global variable kdump_cpu instead of mca_cfg->kdump_cpu
- reduce "#ifdef CONFIG_KEXEC"
- add "#ifdef CONFIG_X86_MCE" for declaration of machine_check_under_kdump()
in mce.h
- update comment on switch_mce_handler_for_kdump()

ChangeLog v2 -> v3
- go to "switch MCE handler" approach

ChangeLog v1 -> v2
- clear MSR_IA32_MCG_CTL, MSR_IA32_MCx_CTL, and CR4.MCE instead of using
global flag to ignore MCE events.
- fixed the description of the problem
---
arch/x86/include/asm/mce.h | 15 ++++++
arch/x86/kernel/cpu/mcheck/mce-internal.h | 13 -----
arch/x86/kernel/crash.c | 87 +++++++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1f5a86d518db..d35ea7e8a764 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -255,4 +255,19 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);

+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg,
+ bool is_excp);
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fe32074b865b..47d1e5218fb5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,18 +1,6 @@
#include <linux/device.h>
#include <asm/mce.h>

-enum severity_level {
- MCE_NO_SEVERITY,
- MCE_DEFERRED_SEVERITY,
- MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
- MCE_KEEP_SEVERITY,
- MCE_SOME_SEVERITY,
- MCE_AO_SEVERITY,
- MCE_UC_SEVERITY,
- MCE_AR_SEVERITY,
- MCE_PANIC_SEVERITY,
-};
-
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */

@@ -24,7 +12,6 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};

-extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);

extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index b71ff16cbdb0..0b12eb5452f6 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -34,6 +34,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/mce.h>

/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
@@ -76,6 +77,90 @@ struct crash_memmap_data {

int in_crash_kexec;

+#ifdef CONFIG_X86_MCE
+static int kdump_cpu;
+
+/*
+ * kdump-specific machine check handler
+ *
+ * When kexec/kdump is running, what the MCE handler is expected to do
+ * changes depending on whether the CPU is running the main thread or not.
+ *
+ * The crashing CPU, controlling the whole system exclusively, should try to
+ * get kdump as hard as possible even if an MCE happens concurrently, because
+ * some types of MCEs (for example, uncorrected errors like SRAO,)
+ * are not fatal or don't ruin reliablility of the kdump (consider that an
+ * MCE can hit the other CPU, in which case corrupted data is never consumed.)
+ * If an MCE critically breaks the kdump operation, we are unlucky so let's
+ * accept the fate of whatever HW causes, hoping a dying message reaches admins.
+ *
+ * The other CPUs are supposed to be quiet during kexec/kdump, so after the
+ * crashing CPU shot them down, they should not do anything except clearing
+ * MCG_STATUS (without this the system is reset, which is undesirable.)
+ * Note that this is also true after the crashing CPU enter the 2nd kernel.
+ */
+static void machine_check_under_kdump(struct pt_regs *regs, long error_code)
+{
+ struct mce m = {};
+ char *msg = NULL;
+ char *nmsg = NULL;
+ int i;
+ int worst = 0;
+ int severity;
+
+ if (mca_cfg.disabled)
+ return;
+ if (!mca_cfg.banks)
+ goto out;
+ if (kdump_cpu != smp_processor_id())
+ goto clear_mcg_status;
+
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ if (regs && m.mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+ m.cs = regs->cs;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ rdmsrl(MSR_IA32_MCx_STATUS(i), m.status);
+ severity = mce_severity(&m, mca_cfg.tolerant, &nmsg, true);
+ if (severity > worst) {
+ worst = severity;
+ msg = nmsg;
+ }
+ }
+
+ if (worst >= MCE_UC_SEVERITY)
+ panic("kdump: Fatal machine check: %s", msg);
+
+ pr_emerg("kdump: Non-fatal MCE detected: %s - kernel dump might be unreliable.\n", msg);
+
+clear_mcg_status:
+ wrmsrl(MSR_IA32_MCG_STATUS, 0);
+out:
+ sync_core();
+}
+
+/*
+ * Switch the MCE handler to kdump-specific one
+ *
+ * Standard MCE handler do_machine_check() is not designed for kexec/kdump
+ * context, where we can't expect MCE's recovering and logging to work fine
+ * because the kernel might be unstable (all CPUs except one must be idle.)
+ *
+ * In such situations, getting a kernel dump is more important than handling
+ * MCEs because what the users are really interested in is to find out what
+ * caused the crash.
+ *
+ * So let's switch MCE handler to the one suitable for kexec/kdump situation.
+ */
+void kdump_setup_mce(void)
+{
+ kdump_cpu = smp_processor_id();
+ machine_check_vector = machine_check_under_kdump;
+}
+#else
+static inline void kdump_setup_mce(void) {}
+#endif
+
/*
* This is used to VMCLEAR all VMCSs loaded on the
* processor. And when loading kvm_intel module, the
@@ -166,6 +251,8 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();

+ kdump_setup_mce();
+
kdump_nmi_shootdown_cpus();

/*
--
2.1.0
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/