[PATCH v3 1/2] x86: mce: kexec: switch MCE handler for kexec/kdump

From: Naoya Horiguchi
Date: Tue Mar 03 2015 - 04:04:56 EST


kexec disables (or "shoots down") all CPUs other than the crashing CPU before
entering the 2nd kernel. But the MCE handler is still enabled after that,
so if an MCE happens and is broadcast to all CPUs after the main thread starts
the 2nd kernel (which might not have initialized the MCE device yet, or might
decide not to enable it), the MCE handler runs only on the other CPUs (not on
the main thread), leading to a kernel panic in MCE synchronization. The
user-visible effect of this bug is kdump failure.

Our standard MCE handler do_machine_check() makes some assumptions about the
system's state, and it's hard to alter it to cover the kexec/kdump context, so
let's add another, kdump-specific handler and switch to it.

Note that this problem has existed since the current MCE handler was
implemented in 2.6.32, and commit 716079f66eac ("mce: Panic when a core has
reached a timeout") recently made it more visible by changing the default
behavior of the synchronization timeout from "ignore" to "panic".

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx> [2.6.32+]
---
ChangeLog v2 -> v3
- go to "switch MCE handler" approach

ChangeLog v1 -> v2
- clear MSR_IA32_MCG_CTL, MSR_IA32_MCx_CTL, and CR4.MCE instead of using
global flag to ignore MCE events.
- fixed the description of the problem
---
arch/x86/include/asm/mce.h | 6 +++++
arch/x86/kernel/cpu/mcheck/mce.c | 47 ++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/crash.c | 3 +++
3 files changed, 56 insertions(+)

diff --git v3.19.orig/arch/x86/include/asm/mce.h v3.19/arch/x86/include/asm/mce.h
index 51b26e895933..8010d4b77183 100644
--- v3.19.orig/arch/x86/include/asm/mce.h
+++ v3.19/arch/x86/include/asm/mce.h
@@ -114,6 +114,9 @@ struct mca_config {
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
+#ifdef CONFIG_KEXEC
+ int kdump_cpu;
+#endif
};

extern struct mca_config mca_cfg;
@@ -175,6 +178,9 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif

int mce_available(struct cpuinfo_x86 *c);
+#ifdef CONFIG_KEXEC
+void switch_mce_handler_for_kdump(void);
+#endif

DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git v3.19.orig/arch/x86/kernel/cpu/mcheck/mce.c v3.19/arch/x86/kernel/cpu/mcheck/mce.c
index 3112b79ace8e..6e7730a72b79 100644
--- v3.19.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ v3.19/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1219,6 +1219,36 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
EXPORT_SYMBOL_GPL(do_machine_check);

+#ifdef CONFIG_KEXEC
+/*
+ * kdump-specific machine check handler
+ *
+ * While kexec/kdump is running, what the MCE handler is expected to do
+ * depends on whether the CPU is running the main (crashing) thread or not.
+ *
+ * The crashing CPU, controlling the whole system exclusively, should try as
+ * hard as possible to get a kdump even if an MCE happens concurrently, because
+ * some types of MCEs (for example, uncorrected errors like SRAO and SRAR)
+ * are not fatal or don't ruin the reliability of the kdump (consider that an
+ * MCE can hit another CPU, in which case corrupted data is never consumed).
+ * If an MCE critically breaks the kdump operation, we are unlucky, so let's
+ * accept whatever fate the HW causes, hoping a dying message reaches admins.
+ *
+ * The other CPUs are supposed to be quiet during kexec/kdump, so after the
+ * crashing CPU has shot them down, they should not do anything except clear
+ * MCG_STATUS (without this the system is reset, which is undesirable).
+ * Note that this is also true after the crashing CPU enters the 2nd kernel.
+ */
+static void machine_check_under_kdump(struct pt_regs *regs, long error_code)
+{
+ if (mca_cfg.kdump_cpu == smp_processor_id())
+ pr_emerg("MCE triggered when kdumping. If you are lucky enough, you will have a kdump. Otherwise, this is a dying message.\n");
+
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ sync_core();
+}
+#endif
+
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
@@ -2104,6 +2134,23 @@ static void mce_syscore_shutdown(void)
mce_disable_error_reporting();
}

+#ifdef CONFIG_KEXEC
+/*
+ * Called in kdump entering code to switch the MCE handler to a primitive and
+ * kdump-specific one.
+ *
+ * In kexec/kdump context, getting a kdump takes priority over handling MCEs,
+ * because what users are really interested in is what caused the crash,
+ * not what caused the kdump to fail. So the kdump-specific MCE handler
+ * does very little, so as not to disrupt kdumping.
+ */
+void switch_mce_handler_for_kdump(void)
+{
+ mca_cfg.kdump_cpu = smp_processor_id();
+ machine_check_vector = machine_check_under_kdump;
+}
+#endif
+
/*
* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
* Only one CPU is active at this time, the others get re-added later using
diff --git v3.19.orig/arch/x86/kernel/crash.c v3.19/arch/x86/kernel/crash.c
index 6f3baedcb6f6..273805e772f6 100644
--- v3.19.orig/arch/x86/kernel/crash.c
+++ v3.19/arch/x86/kernel/crash.c
@@ -34,6 +34,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/mce.h>

/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
@@ -166,6 +167,8 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();

+ switch_mce_handler_for_kdump();
+
kdump_nmi_shootdown_cpus();

/*
--
1.9.3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/