[GIT pull] RAS updates for 4.13

From: Thomas Gleixner
Date: Mon Jul 03 2017 - 04:37:16 EST


Linus,

please pull the latest ras-core-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git ras-core-for-linus

The RAS updates for the 4.13 merge window:

- Cleanup of the MCE injection facility (Borsilav Petkov)

- Rework of the AMD/SMCA handling (Yazen Ghannam)

- Enhancements for ACPI/APEI to handle new notitication types (Shiju
Jose)

- atomic_t to refcount_t conversion (Elena Reshetova)

- A few fixes and enhancements all over the place

Thanks,

tglx

------------------>
Borislav Petkov (4):
x86/mce: Merge mce_amd_inj into mce-inject
x86/mce: Get rid of register_mce_write_callback()
x86/mce: Clean up include files
x86/mce/mce-inject: Preset the MCE injection struct

Christophe JAILLET (1):
RAS/CEC: Check the correct variable in the debugfs error handling

Elena Reshetova (1):
x86/mce: Convert threshold_bank.cpus from atomic_t to refcount_t

Juergen Gross (1):
x86/MCE, xen/mcelog: Make /dev/mcelog registration messages more precise

Shiju Jose (1):
ACPI/APEI: Handle GSIV and GPIO notification types

Wei Yongjun (1):
RAS: Make local function parse_ras_param() static

Yazen Ghannam (7):
x86/mce/AMD: Redo error logging from APIC LVT interrupt handlers
x86/mce/AMD: Carve out SMCA bank configuration
x86/mce/AMD: Use msr_stat when clearing MCA_STATUS
x86/mce/AMD: Use saved threshold block info in interrupt handler
x86/mce: Don't disable MCA banks when offlining a CPU on AMD
x86/mce: Update bootlog description to reflect behavior on AMD
x86/mce: Always save severity in machine_check_poll()


Documentation/x86/x86_64/boot-options.txt | 3 +-
arch/x86/Kconfig | 2 +-
arch/x86/Makefile | 2 -
arch/x86/include/asm/amd_nb.h | 3 +-
arch/x86/include/asm/mce.h | 4 -
arch/x86/include/asm/processor.h | 5 +
arch/x86/kernel/cpu/mcheck/dev-mcelog.c | 55 ++-
arch/x86/kernel/cpu/mcheck/mce-inject.c | 569 +++++++++++++++++++++++++++---
arch/x86/kernel/cpu/mcheck/mce-internal.h | 6 +-
arch/x86/kernel/cpu/mcheck/mce.c | 17 +-
arch/x86/kernel/cpu/mcheck/mce_amd.c | 273 +++++++-------
arch/x86/ras/Kconfig | 11 -
arch/x86/ras/Makefile | 2 -
arch/x86/ras/mce_amd_inj.c | 492 --------------------------
drivers/acpi/apei/ghes.c | 39 +-
drivers/ras/cec.c | 2 +-
drivers/ras/ras.c | 2 +-
drivers/xen/mcelog.c | 2 +
18 files changed, 759 insertions(+), 730 deletions(-)
delete mode 100644 arch/x86/ras/Makefile
delete mode 100644 arch/x86/ras/mce_amd_inj.c

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 61b611e9eeaf..b297c48389b9 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -36,7 +36,8 @@ Machine check
to broadcast MCEs.
mce=bootlog
Enable logging of machine checks left over from booting.
- Disabled by default on AMD because some BIOS leave bogus ones.
+ Disabled by default on AMD Fam10h and older because some BIOS
+ leave bogus ones.
If your BIOS doesn't do that it's a good idea to enable though
to make sure you log even machine check events that result
in a reboot. On Intel systems it is enabled by default.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0efb4c9497bc..4371b6b5cbe4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1082,7 +1082,7 @@ config X86_MCE_THRESHOLD
def_bool y

config X86_MCE_INJECT
- depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
+ depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index bf240b920473..ad2db82e9953 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -257,8 +257,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/

drivers-$(CONFIG_FB) += arch/x86/video/

-drivers-$(CONFIG_RAS) += arch/x86/ras/
-
####
# boot loader support. Several targets are kept for legacy purposes

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 00c88a01301d..da181ad1d5f8 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,6 +3,7 @@

#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/refcount.h>

struct amd_nb_bus_dev_range {
u8 bus;
@@ -55,7 +56,7 @@ struct threshold_bank {
struct threshold_block *blocks;

/* initialized to the number of CPUs on the node sharing this bank */
- atomic_t cpus;
+ refcount_t cpus;
};

struct amd_northbridge {
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 3f9a3d2a5209..181264989db5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -285,10 +285,6 @@ int mce_notify_irq(void);

DECLARE_PER_CPU(struct mce, injectm);

-extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off));
-
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..71f6fba95aa6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -901,8 +901,13 @@ static inline int mpx_disable_management(void)
}
#endif /* CONFIG_X86_INTEL_MPX */

+#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
+#else
+static inline u16 amd_get_nb_id(int cpu) { return 0; }
+static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+#endif

static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
index 9c632cb88546..10cec43aac38 100644
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -17,6 +17,8 @@

#include "mce-internal.h"

+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
static DEFINE_MUTEX(mce_chrdev_read_mutex);

static char mce_helper[128];
@@ -345,24 +347,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
}
}

-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);

-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
+void mce_unregister_injector_chain(struct notifier_block *nb)
{
- mce_write = fn;
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);

static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffie or two later everywhere.
+ */
+ schedule_timeout(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
}

static const struct file_operations mce_chrdev_ops = {
@@ -388,9 +415,15 @@ static __init int dev_mcelog_init_device(void)
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err) {
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
return err;
}
+
mce_register_decode_chain(&dev_mcelog_nb);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 99165b206df3..231ad23b24a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -10,23 +10,105 @@
* Authors:
* Andi Kleen
* Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@xxxxxxxxx>
+ * Advanced Micro Devices Inc.
*/
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/timer.h>
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
+#include <linux/module.h>
#include <linux/notifier.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/mce.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd_nb.h>
#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "mce-internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+static u8 n_banks;
+
+#define MAX_FLAG_OPT_SIZE 3
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+}

/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -143,7 +225,7 @@ static int raise_local(void)
return ret;
}

-static void raise_mce(struct mce *m)
+static void __maybe_unused raise_mce(struct mce *m)
{
int context = MCJ_CTX(m->inject_flags);

@@ -198,55 +280,454 @@ static void raise_mce(struct mce *m)
}
}

-/* Error injection interface */
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
{
- struct mce m;
+ struct mce *m = (struct mce *)data;

- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * There are some cases where real MSR reads could slip
- * through.
- */
- if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
- return -EIO;
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to be make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);

- if ((unsigned long)usize > sizeof(struct mce))
- usize = sizeof(struct mce);
- if (copy_from_user(&m, ubuf, usize))
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;

- if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 cores_per_node;
+
+ cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
+
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ } else {
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ u64 mcg_status = 0;
+ unsigned int cpu = i_mce.extcpu;
+ u8 b = i_mce.bank;
+
+ rdtscll(i_mce.tsc);
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ mce_inject_log(&i_mce);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;

/*
- * Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
*/
- schedule_timeout(2);
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ }

- mutex_lock(&mce_inject_mutex);
- raise_mce(&m);
- mutex_unlock(&mce_inject_mutex);
- return usize;
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+ cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ }
+
+ get_online_cpus();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ put_online_cpus();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= n_banks) {
+ pr_err("Non-existent MCE bank: %llu\n", val);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+ do_inject();
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ struct dentry *d;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static int __init debugfs_init(void)
+{
+ unsigned int i;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+ if (!dfs_inj)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+ dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+ dfs_fls[i].perm,
+ dfs_inj,
+ &i_mce,
+ dfs_fls[i].fops);
+
+ if (!dfs_fls[i].d)
+ goto err_dfs_add;
+ }
+
+ return 0;
+
+err_dfs_add:
+ while (i-- > 0)
+ debugfs_remove(dfs_fls[i].d);
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+
+ return -ENODEV;
}

-static int inject_init(void)
+static int __init inject_init(void)
{
+ int err;
+
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
+
+ err = debugfs_init();
+ if (err) {
+ free_cpumask_var(mce_inject_cpumask);
+ return err;
+ }
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
pr_info("Machine check injector initialized\n");
- register_mce_write_callback(mce_write);
- register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
- "mce_notify");
+
return 0;
}

+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
module_init(inject_init);
-/*
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
+module_exit(inject_exit);
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 654ad0668d72..098530a93bb7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -100,7 +100,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
extern struct device_attribute dev_attr_trigger;

#ifdef CONFIG_X86_MCELOG_LEGACY
-extern void mce_work_trigger(void);
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
#else
static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5cfbaeb6529a..6dde0497efc7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -673,7 +673,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
bool error_seen = false;
struct mce m;
- int severity;
int i;

this_cpu_inc(mce_poll_count);
@@ -710,11 +709,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)

mce_read_aux(&m, i);

- severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
- if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
- if (m.status & MCI_STATUS_ADDRV)
- m.severity = severity;
+ m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

/*
* Don't get the IP here because it's unlikely to
@@ -1550,7 +1545,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 < 17 && cfg->bootlog < 0) {
+ if (c->x86 < 0x11 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
@@ -1832,7 +1827,8 @@ void mce_disable_bank(int bank)
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine
* check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ and older.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
* mce=recovery force enable memcpy_mcsafe()
@@ -1912,12 +1908,13 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+ * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
* Disabling them for just a single offlined CPU is bad, since it will
* inhibit reporting for all shared resources on the socket like the
* last level cache (LLC), the integrated memory controller (iMC), etc.
*/
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;

mce_disable_error_reporting();
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047e4b68..9e314bcf67cc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;

-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank, unsigned int cpu)
{
- unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ unsigned int i, hwid_mcatype;
struct smca_hwid *s_hwid;
- u32 high, instance_id;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }

/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;

- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
smca_get_name(s_hwid->bank_type));

smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = instance_id;
+ smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high, smca_addr;
+ u32 smca_low, smca_high;
struct threshold_block b;
int new;

@@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
goto set_offset;
}

- smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
- if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
- /*
- * OS is required to set the MCAX bit to acknowledge that it is
- * now using the new MSR ranges and new registers under each
- * bank. It also means that the OS will configure deferred
- * errors in the new MCx_CONFIG register. If the bit is not set,
- * uncorrectable errors will cause a system panic.
- *
- * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
- */
- smca_high |= BIT(0);
-
- /*
- * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
- * registers with the option of additionally logging to
- * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
- *
- * This bit is usually set by BIOS to retain the old behavior
- * for OSes that don't use the new registers. Linux supports the
- * new registers so let's disable that additional logging here.
- *
- * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
- * portion of the MSR).
- */
- smca_high &= ~BIT(2);
-
- /*
- * SMCA sets the Deferred Error Interrupt type per bank.
- *
- * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
- * if the DeferredIntType bit field is available.
- *
- * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
- * high portion of the MSR). OS should set this to 0x1 to enable
- * APIC based interrupt. First, check that no interrupt has been
- * set.
- */
- if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
- smca_high |= BIT(5);
-
- wrmsr(smca_addr, smca_low, smca_high);
- }
-
/* Gather LVT offset for thresholding: */
if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
goto out;
@@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)

for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
- get_smca_bank_info(bank);
+ smca_configure(bank, cpu);

for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(cpu, address, low, high, bank, block);
@@ -755,37 +741,19 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);

-static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- u32 msr_status = msr_ops.status(bank);
- u32 msr_addr = msr_ops.addr(bank);
struct mce m;
- u64 status;
-
- WARN_ON_ONCE(deferred_err && threshold_err);
-
- if (deferred_err && mce_flags.smca) {
- msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
- msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
- }
-
- rdmsrl(msr_status, status);
-
- if (!(status & MCI_STATUS_VAL))
- return;

mce_setup(&m);

m.status = status;
+ m.misc = misc;
m.bank = bank;
m.tsc = rdtsc();

- if (threshold_err)
- m.misc = misc;
-
if (m.status & MCI_STATUS_ADDRV) {
- rdmsrl(msr_addr, m.addr);
+ m.addr = addr;

/*
* Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
}

mce_log(&m);
-
- wrmsrl(msr_status, 0);
}

static inline void __smp_deferred_error_interrupt(void)
@@ -832,87 +798,126 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
exiting_ack_irq();
}

-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
- unsigned int bank;
- u32 msr_status;
- u64 status;
+ u64 status, addr = 0;

- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
- : msr_ops.status(bank);
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;

- rdmsrl(msr_status, status);
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);

- if (!(status & MCI_STATUS_VAL) ||
- !(status & MCI_STATUS_DEFERRED))
- continue;
+ __log_error(bank, status, addr, misc);

- __log_error(bank, true, false, 0);
- break;
- }
+ wrmsrl(msr_stat, 0);
+
+ return status & MCI_STATUS_DEFERRED;
}

/*
- * APIC Interrupt Handler
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
*/
+static void log_error_deferred(unsigned int bank)
+{
+ bool defrd;

-/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
- */
+ defrd = _log_error_bank(bank, msr_ops.status(bank),
+ msr_ops.addr(bank), 0);

-static void amd_threshold_interrupt(void)
+ if (!mce_flags.smca)
+ return;
+
+ /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+ if (defrd) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return;
+ }
+
+ /*
+ * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+ * for a valid error.
+ */
+ _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+ MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
{
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block, cpu = smp_processor_id();
- struct thresh_restart tr;
+ unsigned int bank;

- /* assume first bank caused it */
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(cpu, address, low, high, bank, block);
- if (!address)
- break;
+ for (bank = 0; bank < mca_cfg.banks; ++bank)
+ log_error_deferred(bank);
+}

- if (rdmsr_safe(address, &low, &high))
- break;
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+ _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}

- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
+static void log_and_reset_block(struct threshold_block *block)
+{
+ struct thresh_restart tr;
+ u32 low = 0, high = 0;

- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
+ if (!block)
+ return;

- /*
- * Log the machine check that caused the threshold
- * event.
- */
- if (high & MASK_OVERFLOW_HI)
- goto log;
- }
- }
- return;
+ if (rdmsr_safe(block->address, &low, &high))
+ return;
+
+ if (!(high & MASK_OVERFLOW_HI))
+ return;

-log:
- __log_error(bank, false, true, ((u64)high << 32) | low);
+ /* Log the MCE which caused the threshold event. */
+ log_error_thresholding(block->bank, ((u64)high << 32) | low);

/* Reset threshold block after logging error. */
memset(&tr, 0, sizeof(tr));
- tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ tr.b = block;
threshold_restart_bank(&tr);
}

/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ unsigned int bank, cpu = smp_processor_id();
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+
+ first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ if (!first_block)
+ continue;
+
+ /*
+ * The first block is also the head of the list. Check it first
+ * before iterating over the rest.
+ */
+ log_and_reset_block(first_block);
+ list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+ log_and_reset_block(block);
+ }
+}
+
+/*
* Sysfs Interface
*/

@@ -1202,7 +1207,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;

per_cpu(threshold_banks, cpu)[bank] = b;
- atomic_inc(&b->cpus);
+ refcount_inc(&b->cpus);

err = __threshold_add_blocks(b);

@@ -1225,7 +1230,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;

if (is_shared_bank(bank)) {
- atomic_set(&b->cpus, 1);
+ refcount_set(&b->cpus, 1);

/* nb is already initialized, see above */
if (nb) {
@@ -1289,7 +1294,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
goto free_out;

if (is_shared_bank(bank)) {
- if (!atomic_dec_and_test(&b->cpus)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 2a2d89d39af6..bb026699ad19 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -1,13 +1,3 @@
-config MCE_AMD_INJ
- tristate "Simple MCE injection interface for AMD processors"
- depends on RAS && X86_MCE && DEBUG_FS && AMD_NB
- default n
- help
- This is a simple debugfs interface to inject MCEs and test different
- aspects of the MCE handling code.
-
- WARNING: Do not even assume this interface is staying stable!
-
config RAS_CEC
bool "Correctable Errors Collector"
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
@@ -20,4 +10,3 @@ config RAS_CEC

Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
-
diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile
deleted file mode 100644
index 5f94546db280..000000000000
--- a/arch/x86/ras/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o
-
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
deleted file mode 100644
index 8730c2882fff..000000000000
--- a/arch/x86/ras/mce_amd_inj.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * A simple MCE injection facility for testing different aspects of the RAS
- * code. This driver should be built as module so that it can be loaded
- * on production kernels for testing purposes.
- *
- * This file may be distributed under the terms of the GNU General Public
- * License version 2.
- *
- * Copyright (c) 2010-15: Borislav Petkov <bp@xxxxxxxxx>
- * Advanced Micro Devices Inc.
- */
-
-#include <linux/kobject.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/pci.h>
-
-#include <asm/mce.h>
-#include <asm/smp.h>
-#include <asm/amd_nb.h>
-#include <asm/irq_vectors.h>
-
-#include "../kernel/cpu/mcheck/mce-internal.h"
-
-/*
- * Collect all the MCi_XXX settings
- */
-static struct mce i_mce;
-static struct dentry *dfs_inj;
-
-static u8 n_banks;
-
-#define MAX_FLAG_OPT_SIZE 3
-#define NBCFG 0x44
-
-enum injection_type {
- SW_INJ = 0, /* SW injection, simply decode the error */
- HW_INJ, /* Trigger a #MC */
- DFR_INT_INJ, /* Trigger Deferred error interrupt */
- THR_INT_INJ, /* Trigger threshold interrupt */
- N_INJ_TYPES,
-};
-
-static const char * const flags_options[] = {
- [SW_INJ] = "sw",
- [HW_INJ] = "hw",
- [DFR_INT_INJ] = "df",
- [THR_INT_INJ] = "th",
- NULL
-};
-
-/* Set default injection to SW_INJ */
-static enum injection_type inj_type = SW_INJ;
-
-#define MCE_INJECT_SET(reg) \
-static int inj_##reg##_set(void *data, u64 val) \
-{ \
- struct mce *m = (struct mce *)data; \
- \
- m->reg = val; \
- return 0; \
-}
-
-MCE_INJECT_SET(status);
-MCE_INJECT_SET(misc);
-MCE_INJECT_SET(addr);
-MCE_INJECT_SET(synd);
-
-#define MCE_INJECT_GET(reg) \
-static int inj_##reg##_get(void *data, u64 *val) \
-{ \
- struct mce *m = (struct mce *)data; \
- \
- *val = m->reg; \
- return 0; \
-}
-
-MCE_INJECT_GET(status);
-MCE_INJECT_GET(misc);
-MCE_INJECT_GET(addr);
-MCE_INJECT_GET(synd);
-
-DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
-
-/*
- * Caller needs to be make sure this cpu doesn't disappear
- * from under us, i.e.: get_cpu/put_cpu.
- */
-static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
-{
- u32 l, h;
- int err;
-
- err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
- if (err) {
- pr_err("%s: error reading HWCR\n", __func__);
- return err;
- }
-
- enable ? (l |= BIT(18)) : (l &= ~BIT(18));
-
- err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
- if (err)
- pr_err("%s: error writing HWCR\n", __func__);
-
- return err;
-}
-
-static int __set_inj(const char *buf)
-{
- int i;
-
- for (i = 0; i < N_INJ_TYPES; i++) {
- if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
- inj_type = i;
- return 0;
- }
- }
- return -EINVAL;
-}
-
-static ssize_t flags_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_FLAG_OPT_SIZE];
- int n;
-
- n = sprintf(buf, "%s\n", flags_options[inj_type]);
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
-}
-
-static ssize_t flags_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_FLAG_OPT_SIZE], *__buf;
- int err;
-
- if (cnt > MAX_FLAG_OPT_SIZE)
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt - 1] = 0;
-
- /* strip whitespace */
- __buf = strstrip(buf);
-
- err = __set_inj(__buf);
- if (err) {
- pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
- return err;
- }
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static const struct file_operations flags_fops = {
- .read = flags_read,
- .write = flags_write,
- .llseek = generic_file_llseek,
-};
-
-/*
- * On which CPU to inject?
- */
-MCE_INJECT_GET(extcpu);
-
-static int inj_extcpu_set(void *data, u64 val)
-{
- struct mce *m = (struct mce *)data;
-
- if (val >= nr_cpu_ids || !cpu_online(val)) {
- pr_err("%s: Invalid CPU: %llu\n", __func__, val);
- return -EINVAL;
- }
- m->extcpu = val;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
-
-static void trigger_mce(void *info)
-{
- asm volatile("int $18");
-}
-
-static void trigger_dfr_int(void *info)
-{
- asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
-}
-
-static void trigger_thr_int(void *info)
-{
- asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
-}
-
-static u32 get_nbc_for_node(int node_id)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 cores_per_node;
-
- cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-
- return cores_per_node * node_id;
-}
-
-static void toggle_nb_mca_mst_cpu(u16 nid)
-{
- struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
- u32 val;
- int err;
-
- if (!F3)
- return;
-
- err = pci_read_config_dword(F3, NBCFG, &val);
- if (err) {
- pr_err("%s: Error reading F%dx%03x.\n",
- __func__, PCI_FUNC(F3->devfn), NBCFG);
- return;
- }
-
- if (val & BIT(27))
- return;
-
- pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
- __func__);
-
- val |= BIT(27);
- err = pci_write_config_dword(F3, NBCFG, val);
- if (err)
- pr_err("%s: Error writing F%dx%03x.\n",
- __func__, PCI_FUNC(F3->devfn), NBCFG);
-}
-
-static void prepare_msrs(void *info)
-{
- struct mce m = *(struct mce *)info;
- u8 b = m.bank;
-
- wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
-
- if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (m.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
- } else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
- }
-
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
- wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
- } else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
- }
-}
-
-static void do_inject(void)
-{
- u64 mcg_status = 0;
- unsigned int cpu = i_mce.extcpu;
- u8 b = i_mce.bank;
-
- rdtscll(i_mce.tsc);
-
- if (i_mce.misc)
- i_mce.status |= MCI_STATUS_MISCV;
-
- if (i_mce.synd)
- i_mce.status |= MCI_STATUS_SYNDV;
-
- if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
- return;
- }
-
- /* prep MCE global settings for the injection */
- mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
-
- if (!(i_mce.status & MCI_STATUS_PCC))
- mcg_status |= MCG_STATUS_RIPV;
-
- /*
- * Ensure necessary status bits for deferred errors:
- * - MCx_STATUS[Deferred]: make sure it is a deferred error
- * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
- */
- if (inj_type == DFR_INT_INJ) {
- i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
- }
-
- /*
- * For multi node CPUs, logging and reporting of bank 4 errors happens
- * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
- * Fam10h and later BKDGs.
- */
- if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
- b == 4 &&
- boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
- cpu = get_nbc_for_node(amd_get_nb_id(cpu));
- }
-
- get_online_cpus();
- if (!cpu_online(cpu))
- goto err;
-
- toggle_hw_mce_inject(cpu, true);
-
- i_mce.mcgstatus = mcg_status;
- i_mce.inject_flags = inj_type;
- smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
-
- toggle_hw_mce_inject(cpu, false);
-
- switch (inj_type) {
- case DFR_INT_INJ:
- smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
- break;
- case THR_INT_INJ:
- smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
- break;
- default:
- smp_call_function_single(cpu, trigger_mce, NULL, 0);
- }
-
-err:
- put_online_cpus();
-
-}
-
-/*
- * This denotes into which bank we're injecting and triggers
- * the injection, at the same time.
- */
-static int inj_bank_set(void *data, u64 val)
-{
- struct mce *m = (struct mce *)data;
-
- if (val >= n_banks) {
- pr_err("Non-existent MCE bank: %llu\n", val);
- return -EINVAL;
- }
-
- m->bank = val;
- do_inject();
-
- return 0;
-}
-
-MCE_INJECT_GET(bank);
-
-DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
-
-static const char readme_msg[] =
-"Description of the files and their usages:\n"
-"\n"
-"Note1: i refers to the bank number below.\n"
-"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
-"as they mirror the hardware registers.\n"
-"\n"
-"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
-"\t attributes of the error which caused the MCE.\n"
-"\n"
-"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
-"\t used for error thresholding purposes and its validity is indicated by\n"
-"\t MCi_STATUS[MiscV].\n"
-"\n"
-"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
-"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
-"\n"
-"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
-"\t associated with the error.\n"
-"\n"
-"cpu:\t The CPU to inject the error on.\n"
-"\n"
-"bank:\t Specify the bank you want to inject the error into: the number of\n"
-"\t banks in a processor varies and is family/model-specific, therefore, the\n"
-"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
-"\t injection.\n"
-"\n"
-"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
-"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
-"\t for AMD processors.\n"
-"\n"
-"\t Allowed error injection types:\n"
-"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
-"\t format only. Safe to use.\n"
-"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
-"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
-"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
-"\t before injecting.\n"
-"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
-"\t error APIC interrupt handler to handle the error if the feature is \n"
-"\t is present in hardware. \n"
-"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
-"\t APIC interrupt handler to handle the error. \n"
-"\n";
-
-static ssize_t
-inj_readme_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- return simple_read_from_buffer(ubuf, cnt, ppos,
- readme_msg, strlen(readme_msg));
-}
-
-static const struct file_operations readme_fops = {
- .read = inj_readme_read,
-};
-
-static struct dfs_node {
- char *name;
- struct dentry *d;
- const struct file_operations *fops;
- umode_t perm;
-} dfs_fls[] = {
- { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
-};
-
-static int __init init_mce_inject(void)
-{
- unsigned int i;
- u64 cap;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- n_banks = cap & MCG_BANKCNT_MASK;
-
- dfs_inj = debugfs_create_dir("mce-inject", NULL);
- if (!dfs_inj)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
- dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
- dfs_fls[i].perm,
- dfs_inj,
- &i_mce,
- dfs_fls[i].fops);
-
- if (!dfs_fls[i].d)
- goto err_dfs_add;
- }
-
- return 0;
-
-err_dfs_add:
- while (i-- > 0)
- debugfs_remove(dfs_fls[i].d);
-
- debugfs_remove(dfs_inj);
- dfs_inj = NULL;
-
- return -ENODEV;
-}
-
-static void __exit exit_mce_inject(void)
-{
-
- debugfs_remove_recursive(dfs_inj);
- dfs_inj = NULL;
-
- memset(&dfs_fls, 0, sizeof(dfs_fls));
-}
-module_init(init_mce_inject);
-module_exit(exit_mce_inject);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Borislav Petkov <bp@xxxxxxxxx>");
-MODULE_AUTHOR("AMD Inc.");
-MODULE_DESCRIPTION("MCE injection facility for RAS testing");
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d0855c09f32f..d2c8a9286fa8 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -89,14 +89,14 @@ bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

/*
- * All error sources notified with SCI shares one notifier function,
- * so they need to be linked and checked one by one. This is applied
- * to NMI too.
+ * All error sources notified with HED (Hardware Error Device) share a
+ * single notifier callback, so they need to be linked and checked one
+ * by one. This holds true for NMI too.
*
* RCU is used for these lists, so ghes_list_mutex is only used for
* list changing, not for traversing.
*/
-static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);

/*
@@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data)
return IRQ_HANDLED;
}

-static int ghes_notify_sci(struct notifier_block *this,
- unsigned long event, void *data)
+static int ghes_notify_hed(struct notifier_block *this, unsigned long event,
+ void *data)
{
struct ghes *ghes;
int ret = NOTIFY_DONE;

rcu_read_lock();
- list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+ list_for_each_entry_rcu(ghes, &ghes_hed, list) {
if (!ghes_proc(ghes))
ret = NOTIFY_OK;
}
@@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this,
return ret;
}

-static struct notifier_block ghes_notifier_sci = {
- .notifier_call = ghes_notify_sci,
+static struct notifier_block ghes_notifier_hed = {
+ .notifier_call = ghes_notify_hed,
};

#ifdef CONFIG_HAVE_ACPI_APEI_NMI
@@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_POLLED:
case ACPI_HEST_NOTIFY_EXTERNAL:
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
break;
+
case ACPI_HEST_NOTIFY_NMI:
if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev)
goto err_edac_unreg;
}
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
- if (list_empty(&ghes_sci))
- register_acpi_hed_notifier(&ghes_notifier_sci);
- list_add_rcu(&ghes->list, &ghes_sci);
+ if (list_empty(&ghes_hed))
+ register_acpi_hed_notifier(&ghes_notifier_hed);
+ list_add_rcu(&ghes->list, &ghes_hed);
mutex_unlock(&ghes_list_mutex);
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_add(ghes);
break;
@@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_EXTERNAL:
free_irq(ghes->irq, ghes);
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
list_del_rcu(&ghes->list);
- if (list_empty(&ghes_sci))
- unregister_acpi_hed_notifier(&ghes_notifier_sci);
+ if (list_empty(&ghes_hed))
+ unregister_acpi_hed_notifier(&ghes_notifier_hed);
mutex_unlock(&ghes_list_mutex);
synchronize_rcu();
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_remove(ghes);
break;
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 6aab46d91d33..d0e5d6ee882c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -481,7 +481,7 @@ static int __init create_debugfs_nodes(void)

count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
&count_threshold, &count_threshold_ops);
- if (!decay) {
+ if (!count) {
pr_warn("Error creating count_threshold debugfs node!\n");
goto err;
}
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 94f8038864b4..ed4c343d08c4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);


-int __init parse_ras_param(char *str)
+static int __init parse_ras_param(char *str)
{
#ifdef CONFIG_RAS_CEC
parse_cec_param(str);
diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c
index a493c7315e94..6cc1c15bcd84 100644
--- a/drivers/xen/mcelog.c
+++ b/drivers/xen/mcelog.c
@@ -408,6 +408,8 @@ static int __init xen_late_init_mcelog(void)
if (ret)
goto deregister;

+ pr_info("/dev/mcelog registered by Xen\n");
+
return 0;

deregister: