Re: [RFC][PATCH 6/6] x86/mce: Dont use noinstr for now

From: Borislav Petkov
Date: Thu Jan 07 2021 - 05:09:01 EST


On Wed, Jan 06, 2021 at 03:57:55PM +0100, Boris Petkov wrote:
> Another thing that we could do is carve out only the stuff which needs
> to be noinstr into a separate compilation unit and disable tracing
> only for that while keeping the rest traceable. Need to try it to see
> how ugly it'll get...

Something like the below; it barely builds.

I haven't found out whether I can even do

ccflags-remove

on a per-file basis; I guess I cannot, so that's not there yet.
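
(Untested sketch of what I'd try next, assuming kbuild's per-object
CFLAGS_REMOVE_<object>.o form applies here and that $(CC_FLAGS_FTRACE)
covers all the flags which need stripping:

	CFLAGS_REMOVE_core_noinstr.o := $(CC_FLAGS_FTRACE)

i.e. drop the tracing flags only for core_noinstr.o instead of for the
whole directory.)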

core_noinstr.c ended up containing all the code needed by the #MC
handler, so that should be ok-ish, carve-out-wise.

Also, I've exported a bunch of functions which are in mce/core.c through
the internal.h header so that core_noinstr.c can call them. There are
no more objtool warnings, but if it turns out that we have to move those
functions:

+/* core_noinstr.c */
+bool mce_check_crashing_cpu(void);
+void print_mce(struct mce *m);
+void mce_reset(void);
+bool whole_page(struct mce *m);
+u64 mce_rdmsrl(u32 msr);
+void mce_wrmsrl(u32 msr, u64 v);
+void mce_read_aux(struct mce *m, int i);
+void mce_gather_info(struct mce *m, struct pt_regs *regs);

to core_noinstr.c after all, then we can do your solution directly.

Ok, gnight. :-)

---
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c994154..2fa36118a05f 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -1,5 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y = core.o severity.o genpool.o
+# No instrumentation for #MC handler code
+KASAN_SANITIZE_core_noinstr.o := n
+UBSAN_SANITIZE_core_noinstr.o := n
+KCOV_INSTRUMENT_core_noinstr.o := n
+
+obj-y = core.o core_noinstr.o severity.o genpool.o

obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e133ce1e562b..f5db9b98664d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -31,7 +31,6 @@
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
-#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
@@ -47,11 +46,10 @@

#include <asm/intel-family.h>
#include <asm/processor.h>
-#include <asm/traps.h>
#include <asm/tlbflush.h>
+#include <asm/traps.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/reboot.h>

#include "internal.h"

@@ -61,17 +59,11 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

-#define SPINUNIT 100 /* 100ns */
-
DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

-struct mce_bank {
- u64 ctl; /* subevents to enable */
- bool init; /* initialise bank? */
-};
-static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
@@ -97,9 +89,7 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};

-static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
-static int cpu_missing;

/*
* MCA banks polled by the period polling timer for corrected events.
@@ -121,7 +111,7 @@ mce_banks_t mce_banks_ce_disabled;
static struct work_struct mce_work;
static struct irq_work mce_irq_work;

-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
* CPU/chipset specific EDAC code can register a notifier call here to print
@@ -266,7 +256,7 @@ static void __print_mce(struct mce *m)
m->microcode);
}

-static void print_mce(struct mce *m)
+void print_mce(struct mce *m)
{
__print_mce(m);

@@ -274,86 +264,6 @@ static void print_mce(struct mce *m)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

-#define PANIC_TIMEOUT 5 /* 5 seconds */
-
-static atomic_t mce_panicked;
-
-static int fake_panic;
-static atomic_t mce_fake_panicked;
-
-/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
-{
- long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
-
- preempt_disable();
- local_irq_enable();
- while (timeout-- > 0)
- udelay(1);
- if (panic_timeout == 0)
- panic_timeout = mca_cfg.panic_timeout;
- panic("Panicing machine check CPU died");
-}
-
-static void mce_panic(const char *msg, struct mce *final, char *exp)
-{
- int apei_err = 0;
- struct llist_node *pending;
- struct mce_evt_llist *l;
-
- if (!fake_panic) {
- /*
- * Make sure only one CPU runs in machine check panic
- */
- if (atomic_inc_return(&mce_panicked) > 1)
- wait_for_panic();
- barrier();
-
- bust_spinlocks(1);
- console_verbose();
- } else {
- /* Don't log too much for fake panic */
- if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
- }
- pending = mce_gen_pool_prepare_records();
- /* First print corrected ones that are still unlogged */
- llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
- if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- /* Now print uncorrected but with the final one last */
- llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || mce_cmp(m, final)) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- if (final) {
- print_mce(final);
- if (!apei_err)
- apei_err = apei_write_mce(final);
- }
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
- if (exp)
- pr_emerg(HW_ERR "Machine check: %s\n", exp);
- if (!fake_panic) {
- if (panic_timeout == 0)
- panic_timeout = mca_cfg.panic_timeout;
- panic(msg);
- } else
- pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
-}
-
/* Support code for software error injection */

static int msr_to_offset(u32 msr)
@@ -392,7 +302,7 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
}

/* MSR access wrappers used for error injection */
-static noinstr u64 mce_rdmsrl(u32 msr)
+u64 mce_rdmsrl(u32 msr)
{
DECLARE_ARGS(val, low, high);

@@ -446,7 +356,7 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
return true;
}

-static noinstr void mce_wrmsrl(u32 msr, u64 v)
+void mce_wrmsrl(u32 msr, u64 v)
{
u32 low, high;

@@ -479,7 +389,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
mce_setup(m);

@@ -586,7 +496,7 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

-static bool whole_page(struct mce *m)
+bool whole_page(struct mce *m)
{
if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
return true;
@@ -682,7 +592,7 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
@@ -837,608 +747,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
}
EXPORT_SYMBOL_GPL(machine_check_poll);

-/*
- * Do a quick check if any of the events requires a panic.
- * This decides if we keep the events around or clear them.
- */
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
- struct pt_regs *regs)
-{
- char *tmp = *msg;
- int i;
-
- for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(msr_ops.status(i));
- if (!(m->status & MCI_STATUS_VAL))
- continue;
-
- __set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
-
- m->bank = i;
- if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
- mce_read_aux(m, i);
- *msg = tmp;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * Variable to establish order between CPUs while scanning.
- * Each CPU spins initially until executing is equal its number.
- */
-static atomic_t mce_executing;
-
-/*
- * Defines order of CPUs on entry. First CPU becomes Monarch.
- */
-static atomic_t mce_callin;
-
-/*
- * Check if a timeout waiting for other CPUs happened.
- */
-static int mce_timed_out(u64 *t, const char *msg)
-{
- /*
- * The others already did panic for some reason.
- * Bail out like in a timeout.
- * rmb() to tell the compiler that system_state
- * might have been modified by someone else.
- */
- rmb();
- if (atomic_read(&mce_panicked))
- wait_for_panic();
- if (!mca_cfg.monarch_timeout)
- goto out;
- if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1)
- mce_panic(msg, NULL, NULL);
- cpu_missing = 1;
- return 1;
- }
- *t -= SPINUNIT;
-out:
- touch_nmi_watchdog();
- return 0;
-}
-
-/*
- * The Monarch's reign. The Monarch is the CPU who entered
- * the machine check handler first. It waits for the others to
- * raise the exception too and then grades them. When any
- * error is fatal panic. Only then let the others continue.
- *
- * The other CPUs entering the MCE handler will be controlled by the
- * Monarch. They are called Subjects.
- *
- * This way we prevent any potential data corruption in a unrecoverable case
- * and also makes sure always all CPU's errors are examined.
- *
- * Also this detects the case of a machine check event coming from outer
- * space (not detected by any CPUs) In this case some external agent wants
- * us to shut down, so panic too.
- *
- * The other CPUs might still decide to panic if the handler happens
- * in a unrecoverable place, but in this case the system is in a semi-stable
- * state and won't corrupt anything by itself. It's ok to let the others
- * continue for a bit first.
- *
- * All the spin loops have timeouts; when a timeout happens a CPU
- * typically elects itself to be Monarch.
- */
-static void mce_reign(void)
-{
- int cpu;
- struct mce *m = NULL;
- int global_worst = 0;
- char *msg = NULL;
-
- /*
- * This CPU is the Monarch and the other CPUs have run
- * through their handlers.
- * Grade the severity of the errors of all the CPUs.
- */
- for_each_possible_cpu(cpu) {
- struct mce *mtmp = &per_cpu(mces_seen, cpu);
-
- if (mtmp->severity > global_worst) {
- global_worst = mtmp->severity;
- m = &per_cpu(mces_seen, cpu);
- }
- }
-
- /*
- * Cannot recover? Panic here then.
- * This dumps all the mces in the log buffer and stops the
- * other CPUs.
- */
- if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
- /* call mce_severity() to get "msg" for panic */
- mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
- mce_panic("Fatal machine check", m, msg);
- }
-
- /*
- * For UC somewhere we let the CPU who detects it handle it.
- * Also must let continue the others, otherwise the handling
- * CPU could deadlock on a lock.
- */
-
- /*
- * No machine check event found. Must be some external
- * source or one CPU is hung. Panic.
- */
- if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal machine check from unknown source", NULL, NULL);
-
- /*
- * Now clear all the mces_seen so that they don't reappear on
- * the next mce.
- */
- for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
-}
-
-static atomic_t global_nwo;
-
-/*
- * Start of Monarch synchronization. This waits until all CPUs have
- * entered the exception handler and then determines if any of them
- * saw a fatal event that requires panic. Then it executes them
- * in the entry order.
- * TBD double check parallel CPU hotunplug
- */
-static int mce_start(int *no_way_out)
-{
- int order;
- int cpus = num_online_cpus();
- u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- return -1;
-
- atomic_add(*no_way_out, &global_nwo);
- /*
- * Rely on the implied barrier below, such that global_nwo
- * is updated before mce_callin.
- */
- order = atomic_inc_return(&mce_callin);
-
- /*
- * Wait for everyone.
- */
- while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout,
- "Timeout: Not all CPUs entered broadcast exception handler")) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
-
- /*
- * mce_callin should be read before global_nwo
- */
- smp_rmb();
-
- if (order == 1) {
- /*
- * Monarch: Starts executing now, the others wait.
- */
- atomic_set(&mce_executing, 1);
- } else {
- /*
- * Subject: Now start the scanning loop one by one in
- * the original callin order.
- * This way when there are any shared banks it will be
- * only seen by one CPU before cleared, avoiding duplicates.
- */
- while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout,
- "Timeout: Subject CPUs unable to finish machine check processing")) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
- }
-
- /*
- * Cache the global no_way_out state.
- */
- *no_way_out = atomic_read(&global_nwo);
-
- return order;
-}
-
-/*
- * Synchronize between CPUs after main scanning loop.
- * This invokes the bulk of the Monarch processing.
- */
-static int mce_end(int order)
-{
- int ret = -1;
- u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- goto reset;
- if (order < 0)
- goto reset;
-
- /*
- * Allow others to run.
- */
- atomic_inc(&mce_executing);
-
- if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
- /*
- * Monarch: Wait for everyone to go through their scanning
- * loops.
- */
- while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout,
- "Timeout: Monarch CPU unable to finish machine check processing"))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- mce_reign();
- barrier();
- ret = 0;
- } else {
- /*
- * Subject: Wait for Monarch to finish.
- */
- while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout,
- "Timeout: Monarch CPU did not finish machine check processing"))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- /*
- * Don't reset anything. That's done by the Monarch.
- */
- return 0;
- }
-
- /*
- * Reset all global state.
- */
-reset:
- atomic_set(&global_nwo, 0);
- atomic_set(&mce_callin, 0);
- barrier();
-
- /*
- * Let others run again.
- */
- atomic_set(&mce_executing, 0);
- return ret;
-}
-
-static void mce_clear_state(unsigned long *toclear)
-{
- int i;
-
- for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(msr_ops.status(i), 0);
- }
-}
-
-/*
- * Cases where we avoid rendezvous handler timeout:
- * 1) If this CPU is offline.
- *
- * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
- * skip those CPUs which remain looping in the 1st kernel - see
- * crash_nmi_callback().
- *
- * Note: there still is a small window between kexec-ing and the new,
- * kdump kernel establishing a new #MC handler where a broadcasted MCE
- * might not get handled properly.
- */
-static noinstr bool mce_check_crashing_cpu(void)
-{
- unsigned int cpu = smp_processor_id();
-
- if (arch_cpu_is_offline(cpu) ||
- (crashing_cpu != -1 && crashing_cpu != cpu)) {
- u64 mcgstatus;
-
- mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
- if (mcgstatus & MCG_STATUS_LMCES)
- return false;
- }
-
- if (mcgstatus & MCG_STATUS_RIPV) {
- __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
- return true;
- }
- }
- return false;
-}
-
-static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks,
- int no_way_out, int *worst)
-{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- struct mca_config *cfg = &mca_cfg;
- int severity, i;
-
- for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- __clear_bit(i, toclear);
- if (!test_bit(i, valid_banks))
- continue;
-
- if (!mce_banks[i].ctl)
- continue;
-
- m->misc = 0;
- m->addr = 0;
- m->bank = i;
-
- m->status = mce_rdmsrl(msr_ops.status(i));
- if (!(m->status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Corrected or non-signaled errors are handled by
- * machine_check_poll(). Leave them alone, unless this panics.
- */
- if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /* Set taint even when machine check was not enabled. */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
-
- /*
- * When machine check was for corrected/deferred handler don't
- * touch, unless we're panicking.
- */
- if ((severity == MCE_KEEP_SEVERITY ||
- severity == MCE_UCNA_SEVERITY) && !no_way_out)
- continue;
-
- __set_bit(i, toclear);
-
- /* Machine check event was not enabled. Clear, but ignore. */
- if (severity == MCE_NO_SEVERITY)
- continue;
-
- mce_read_aux(m, i);
-
- /* assuming valid severity level != 0 */
- m->severity = severity;
-
- mce_log(m);
-
- if (severity > *worst) {
- *final = *m;
- *worst = severity;
- }
- }
-
- /* mce_clear_state will clear *final, save locally for use later */
- *m = *final;
-}
-
-static void kill_me_now(struct callback_head *ch)
-{
- force_sig(SIGBUS);
-}
-
-static void kill_me_maybe(struct callback_head *cb)
-{
- struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
- int flags = MF_ACTION_REQUIRED;
-
- pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
-
- if (!p->mce_ripv)
- flags |= MF_MUST_KILL;
-
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
- !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
- sync_core();
- return;
- }
-
- if (p->mce_vaddr != (void __user *)-1l) {
- force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
- } else {
- pr_err("Memory error not recovered");
- kill_me_now(cb);
- }
-}
-
-static void queue_task_work(struct mce *m, int kill_current_task)
-{
- current->mce_addr = m->addr;
- current->mce_kflags = m->kflags;
- current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
- current->mce_whole_page = whole_page(m);
-
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
-
- task_work_add(current, &current->mce_kill_me, TWA_RESUME);
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- *
- * On Intel systems this is entered on all CPUs in parallel through
- * MCE broadcast. However some CPUs might be broken beyond repair,
- * so be always careful when synchronizing with others.
- *
- * Tracing and kprobes are disabled: if we interrupted a kernel context
- * with IF=1, we need to minimize stack usage. There are also recursion
- * issues: if the machine check was due to a failure of the memory
- * backing the user stack, tracing that reads the user stack will cause
- * potentially infinite recursion.
- */
-noinstr void do_machine_check(struct pt_regs *regs)
-{
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- struct mca_config *cfg = &mca_cfg;
- struct mce m, *final;
- char *msg = NULL;
- int worst = 0;
-
- /*
- * Establish sequential order between the CPUs entering the machine
- * check handler.
- */
- int order = -1;
-
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
-
- /*
- * If kill_current_task is not set, there might be a way to recover from this
- * error.
- */
- int kill_current_task = 0;
-
- /*
- * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
- * on Intel.
- */
- int lmce = 1;
-
- this_cpu_inc(mce_exception_count);
-
- mce_gather_info(&m, regs);
- m.tsc = rdtsc();
-
- final = this_cpu_ptr(&mces_seen);
- *final = m;
-
- memset(valid_banks, 0, sizeof(valid_banks));
- no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
-
- barrier();
-
- /*
- * When no restart IP might need to kill or panic.
- * Assume the worst for now, but if we find the
- * severity is MCE_AR_SEVERITY we have other options.
- */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
- /*
- * Check if this MCE is signaled to only this logical processor,
- * on Intel, Zhaoxin only.
- */
- if (m.cpuvendor == X86_VENDOR_INTEL ||
- m.cpuvendor == X86_VENDOR_ZHAOXIN)
- lmce = m.mcgstatus & MCG_STATUS_LMCES;
-
- /*
- * Local machine check may already know that we have to panic.
- * Broadcast machine check begins rendezvous in mce_start()
- * Go through all banks in exclusion of the other CPUs. This way we
- * don't report duplicated events on shared banks because the first one
- * to see it will clear it.
- */
- if (lmce) {
- if (no_way_out && cfg->tolerant < 3)
- mce_panic("Fatal local machine check", &m, msg);
- } else {
- order = mce_start(&no_way_out);
- }
-
- __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
-
- if (!no_way_out)
- mce_clear_state(toclear);
-
- /*
- * Do most of the synchronization with other CPUs.
- * When there's any problem use only local no_way_out state.
- */
- if (!lmce) {
- if (mce_end(order) < 0) {
- if (!no_way_out)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
-
- if (no_way_out && cfg->tolerant < 3)
- mce_panic("Fatal machine check on current CPU", &m, msg);
- }
- } else {
- /*
- * If there was a fatal machine check we should have
- * already called mce_panic earlier in this function.
- * Since we re-read the banks, we might have found
- * something new. Check again to see if we found a
- * fatal error. We call "mce_severity()" again to
- * make sure we have the right "msg".
- */
- if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
- mce_severity(&m, regs, cfg->tolerant, &msg, true);
- mce_panic("Local fatal machine check!", &m, msg);
- }
- }
-
- if (worst != MCE_AR_SEVERITY && !kill_current_task)
- goto out;
-
- /* Fault was in user mode and we need to take some action */
- if ((m.cs & 3) == 3) {
- /* If this triggers there is no way to recover. Die hard. */
- BUG_ON(!on_thread_stack() || !user_mode(regs));
-
- queue_task_work(&m, kill_current_task);
-
- } else {
- /*
- * Handle an MCE which has happened in kernel space but from
- * which the kernel can recover: ex_has_fault_handler() has
- * already verified that the rIP at which the error happened is
- * a rIP from which the kernel can recover (by jumping to
- * recovery code specified in _ASM_EXTABLE_FAULT()) and the
- * corresponding exception handler which would do that is the
- * proper one.
- */
- if (m.kflags & MCE_IN_KERNEL_RECOV) {
- if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
- }
-
- if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_current_task);
- }
-out:
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-}
-EXPORT_SYMBOL_GPL(do_machine_check);
-
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
@@ -2706,15 +2014,6 @@ struct dentry *mce_get_debugfs_dir(void)
return dmce;
}

-static void mce_reset(void)
-{
- cpu_missing = 0;
- atomic_set(&mce_fake_panicked, 0);
- atomic_set(&mce_executing, 0);
- atomic_set(&mce_callin, 0);
- atomic_set(&global_nwo, 0);
-}
-
static int fake_panic_get(void *data, u64 *val)
{
*val = fake_panic;
diff --git a/arch/x86/kernel/cpu/mce/core_noinstr.c b/arch/x86/kernel/cpu/mce/core_noinstr.c
new file mode 100644
index 000000000000..ac03af804726
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/core_noinstr.c
@@ -0,0 +1,712 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/signal.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+
+#include <asm/set_memory.h>
+#include <asm/sync_core.h>
+#include <asm/extable.h>
+#include <asm/reboot.h>
+#include <asm/delay.h>
+#include <asm/traps.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+/* 5 seconds */
+#define PANIC_TIMEOUT 5
+/* 100ns */
+#define SPINUNIT 100
+
+static atomic_t global_nwo;
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+static atomic_t mce_panicked;
+static atomic_t mce_fake_panicked;
+static int cpu_missing;
+
+int fake_panic;
+
+static DEFINE_PER_CPU(struct mce, mces_seen);
+
+void mce_reset(void)
+{
+ cpu_missing = 0;
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+}
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static void mce_panic(const char *msg, struct mce *final, char *exp)
+{
+ int apei_err = 0;
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
+
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_panicked) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
+ return;
+ }
+ pending = mce_gen_pool_prepare_records();
+ /* First print corrected ones that are still unlogged */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
+ if (!(m->status & MCI_STATUS_UC)) {
+ print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ /* Now print uncorrected but with the final one last */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || mce_cmp(m, final)) {
+ print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ if (final) {
+ print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(final);
+ }
+ if (cpu_missing)
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+ if (exp)
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic(msg);
+ } else
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+}
+
+static void kill_me_now(struct callback_head *ch)
+{
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+
+ if (!p->mce_ripv)
+ flags |= MF_MUST_KILL;
+
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
+ !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+ sync_core();
+ return;
+ }
+
+ if (p->mce_vaddr != (void __user *)-1l) {
+ force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
+ } else {
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+ }
+}
+
+static void queue_task_work(struct mce *m, int kill_current_task)
+{
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+
+ if (kill_current_task)
+ current->mce_kill_me.func = kill_me_now;
+ else
+ current->mce_kill_me.func = kill_me_maybe;
+
+ task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+}
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
+{
+ char *tmp = *msg;
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
+
+ m->bank = i;
+ if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(m, i);
+ *msg = tmp;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t, const char *msg)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ if (mca_cfg.tolerant <= 1)
+ mce_panic(msg, NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mce *mtmp = &per_cpu(mces_seen, cpu);
+
+ if (mtmp->severity > global_worst) {
+ global_worst = mtmp->severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ /* call mce_severity() to get "msg" for panic */
+ mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
+ mce_panic("Fatal machine check", m, msg);
+ }
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int *no_way_out)
+{
+ int order;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ return -1;
+
+ atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = atomic_inc_return(&mce_callin);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = atomic_read(&global_nwo);
+
+ return order;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+ int ret = -1;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ return 0;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+ return ret;
+}
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+bool noinstr mce_check_crashing_cpu(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (arch_cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+
+static void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (test_bit(i, toclear))
+ mce_wrmsrl(msr_ops.status(i), 0);
+ }
+}
+
+static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks,
+ int no_way_out, int *worst)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ __clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ __set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(m, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ mce_log(m);
+
+ if (severity > *worst) {
+ *final = *m;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *m = *final;
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
+ */
+void do_machine_check(struct pt_regs *regs)
+{
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ struct mca_config *cfg = &mca_cfg;
+ struct mce m, *final;
+ char *msg = NULL;
+ int worst = 0;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+
+ /*
+ * If kill_current_task is not set, there might be a way to recover from this
+ * error.
+ */
+ int kill_current_task = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ int lmce = 1;
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&m, regs);
+ m.tsc = rdtsc();
+
+ final = this_cpu_ptr(&mces_seen);
+ *final = m;
+
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel, Zhaoxin only.
+ */
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start()
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out && cfg->tolerant < 3)
+ mce_panic("Fatal local machine check", &m, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out && cfg->tolerant < 3)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ }
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ mce_severity(&m, regs, cfg->tolerant, &msg, true);
+ mce_panic("Local fatal machine check!", &m, msg);
+ }
+ }
+
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
+ goto out;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m.cs & 3) == 3) {
+ /* If this triggers there is no way to recover. Die hard. */
+ BUG_ON(!on_thread_stack() || !user_mode(regs));
+
+ queue_task_work(&m, kill_current_task);
+
+ } else {
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &m, msg);
+ }
+
+ if (m.kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&m, kill_current_task);
+ }
+out:
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..d662f0246e46 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -11,6 +11,15 @@
/* Pointer to the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *);

+struct mce_bank {
+ u64 ctl; /* subevents to enable */
+ bool init; /* initialise bank? */
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+extern int fake_panic;
+extern void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -196,4 +205,14 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
unsigned long error_code,
unsigned long fault_addr);

+/* core_noinstr.c */
+bool mce_check_crashing_cpu(void);
+void print_mce(struct mce *m);
+void mce_reset(void);
+bool whole_page(struct mce *m);
+u64 mce_rdmsrl(u32 msr);
+void mce_wrmsrl(u32 msr, u64 v);
+void mce_read_aux(struct mce *m, int i);
+void mce_gather_info(struct mce *m, struct pt_regs *regs);
+
#endif /* __X86_MCE_INTERNAL_H__ */

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette