[tip:ras/core] x86/MCE: Dump MCE to dmesg if no consumers

From: tip-bot for Borislav Petkov
Date: Tue Nov 08 2016 - 11:20:14 EST


Commit-ID: cd9c57cad3fe89ea949b9266cddc947c0838f7af
Gitweb: http://git.kernel.org/tip/cd9c57cad3fe89ea949b9266cddc947c0838f7af
Author: Borislav Petkov <bp@xxxxxxx>
AuthorDate: Tue, 1 Nov 2016 12:52:27 +0100
Committer: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
CommitDate: Tue, 8 Nov 2016 17:10:13 +0100

x86/MCE: Dump MCE to dmesg if no consumers

When there are no error record consumers registered with the kernel, the
only thing that appears in dmesg is something like:

[ 300.000326] mce: [Hardware Error]: Machine check events logged

and the error records are gone. Which is seriously counterproductive.

So let's dump them to dmesg instead, in such a case.

Requested-by: Eric Morton <Eric.Morton@xxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Link: http://lkml.kernel.org/r/20161101120911.13163-4-bp@xxxxxxxxx
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/mcheck/mce.c | 52 +++++++++++++++++++++++++++++++++++-----
1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf45..4ca0047 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -207,8 +207,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

+static atomic_t num_notifiers;
+
void mce_register_decode_chain(struct notifier_block *nb)
{
+ atomic_inc(&num_notifiers);
+
/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
@@ -219,6 +223,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
+ atomic_dec(&num_notifiers);
+
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -270,17 +276,17 @@ struct mca_msr_regs msr_ops = {
.misc = misc_reg
};

-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
{
- int ret = 0;
-
- pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);

if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ m->cs, m->ip);

if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
@@ -308,6 +314,13 @@ static void print_mce(struct mce *m)
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+ int ret = 0;
+
+ __print_mce(m);

/*
* Print out human-readable details about the MCE error,
@@ -569,6 +582,32 @@ static struct notifier_block mce_srao_nb = {
.priority = INT_MAX,
};

+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ /*
+ * Run the default notifier if we have only the SRAO
+ * notifier and us registered.
+ */
+ if (atomic_read(&num_notifiers) > 2)
+ return NOTIFY_DONE;
+
+ __print_mce(m);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = 0,
+};
+
/*
* Read ADDR and MISC registers.
*/
@@ -2138,6 +2177,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();

INIT_WORK(&mce_work, mce_process_work);