[PATCH 4/4] x86/mce: Deprecate /dev/mcelog

From: Borislav Petkov
Date: Thu Mar 09 2017 - 05:54:01 EST


From: Tony Luck <tony.luck@xxxxxxxxx>

Move all code relating to /dev/mcelog to a separate source file.
/dev/mcelog driver can now operate from the machine check notifier with
lowest prio.

Boris:
* Move the mce_helper and trigger functionality behind
CONFIG_X86_MCELOG.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
---
arch/x86/Kconfig | 10 +-
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mcheck/Makefile | 2 +
arch/x86/kernel/cpu/mcheck/dev-mcelog.c | 397 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mcheck/mce-internal.h | 8 +
arch/x86/kernel/cpu/mcheck/mce.c | 367 +--------------------------
6 files changed, 426 insertions(+), 359 deletions(-)
create mode 100644 arch/x86/kernel/cpu/mcheck/dev-mcelog.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a294ee..f1216d0114b0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1043,6 +1043,14 @@ config X86_MCE
The action the kernel takes depends on the severity of the problem,
ranging from warning messages to halting the machine.

+config X86_MCELOG
+ bool "Support for deprecated /dev/mcelog character device"
+ depends on X86_MCE
+ ---help---
+ Enable support for /dev/mcelog which is needed by the old mcelog
+ userspace logging daemon. Consider switching to the new generation
+ rasdaemon solution.
+
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
@@ -1072,7 +1080,7 @@ config X86_MCE_THRESHOLD
def_bool y

config X86_MCE_INJECT
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c5ae545d27d8..4fd5195deed0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -196,6 +196,7 @@ enum mce_notifier_prios {
MCE_PRIO_EXTLOG = INT_MAX - 2,
MCE_PRIO_NFIT = INT_MAX - 3,
MCE_PRIO_EDAC = INT_MAX - 4,
+ MCE_PRIO_MCELOG = 1,
MCE_PRIO_LOWEST = 0,
};

diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..950e9ff602ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o

obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
+ smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int next, entry;
+
+ wmb();
+ for (;;) {
+ entry = mce_log_get_idx_check(mcelog.next);
+ for (;;) {
+
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return NOTIFY_OK;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ next = mce_log_get_idx_check(mcelog.next);
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(m, 0, sizeof(*m));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
+ smp_rmb();
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
+ }
+ }
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int err;
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ return err;
+ }
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 903043e6a62b..f096d64485d0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
m1->addr != m2->addr ||
m1->misc != m2->misc;
}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void) { }
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index cd750cfbcb93..1be00932ac47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -54,15 +54,7 @@

#include "mce-internal.h"

-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-#define mce_log_get_idx_check(p) \
-({ \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
- !lockdep_is_held(&mce_chrdev_read_mutex), \
- "suspicious mce_log_get_idx_check() usage"); \
- smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -87,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};

-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;

/*
* MCA banks polled by the period polling timer for corrected events.
@@ -145,18 +131,6 @@ void mce_setup(struct mce *m)
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log_buffer mcelog_buf = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
void mce_log(struct mce *m)
{
if (!mce_gen_pool_add(m))
@@ -165,9 +139,9 @@ void mce_log(struct mce *m)

void mce_inject_log(struct mce *m)
{
- mutex_lock(&mce_chrdev_read_mutex);
+ mutex_lock(&mce_log_mutex);
mce_log(m);
- mutex_unlock(&mce_chrdev_read_mutex);
+ mutex_unlock(&mce_log_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);

@@ -577,7 +551,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
- unsigned int next, entry;

if (!m)
return NOTIFY_DONE;
@@ -588,38 +561,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
/* Emit the trace record: */
trace_mce_record(m);

- wmb();
- for (;;) {
- entry = mce_log_get_idx_check(mcelog_buf.next);
- for (;;) {
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog_buf.flags);
- return NOTIFY_DONE;
- }
- /* Old left over entry. Skip: */
- if (mcelog_buf.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
- wmb();
- mcelog_buf.entry[entry].finished = 1;
- wmb();
-
set_bit(0, &mce_need_notify);

mce_notify_irq();
@@ -1447,13 +1388,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu));
}

-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
/*
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
@@ -1465,11 +1399,7 @@ int mce_notify_irq(void)
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

if (test_and_clear_bit(0, &mce_need_notify)) {
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- if (mce_helper[0])
- schedule_work(&mce_trigger_work);
+ mce_work_trigger();

if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
@@ -1871,252 +1801,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)

}

-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count; /* #times opened */
-static int mce_chrdev_open_exclu; /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- if (mce_chrdev_open_exclu ||
- (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_chrdev_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- mce_chrdev_open_exclu = 1;
- mce_chrdev_open_count++;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- mce_chrdev_open_count--;
- mce_chrdev_open_exclu = 0;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- /*
- * When ERST is disabled, mce_chrdev_read() should return
- * "no record" instead of "no device."
- */
- if (rc == -ENODEV)
- return 0;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record after that has
- * been flushed to the disk or sent to network in
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
- size_t usize, loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_chrdev_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = mce_log_get_idx_check(mcelog_buf.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- struct mce *m = &mcelog_buf.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(m, 0, sizeof(*m));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, m, sizeof(*m));
- buf += sizeof(*m);
-timeout:
- ;
- }
-
- memset(mcelog_buf.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog_buf.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog_buf.entry[i];
-
- if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
- err |= copy_to_user(buf, m, sizeof(*m));
- smp_rmb();
- buf += sizeof(*m);
- memset(m, 0, sizeof(*m));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_chrdev_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog_buf.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
- unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog_buf.flags;
- } while (cmpxchg(&mcelog_buf.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
-{
- mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
- return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_chrdev_open,
- .release = mce_chrdev_release,
- .read = mce_chrdev_read,
- .write = mce_chrdev_write,
- .poll = mce_chrdev_poll,
- .unlocked_ioctl = mce_chrdev_ioctl,
- .llseek = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
@@ -2335,29 +2019,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return size;
}

-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
static ssize_t set_ignore_ce(struct device *s,
struct device_attribute *attr,
const char *buf, size_t size)
@@ -2414,7 +2075,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret;
}

-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2437,7 +2097,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG
&dev_attr_trigger,
+#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
@@ -2611,7 +2273,6 @@ static __init void mce_init_banks(void)

static __init int mcheck_init_device(void)
{
- enum cpuhp_state hp_online;
int err;

if (!mce_available(&boot_cpu_data)) {
@@ -2639,21 +2300,11 @@ static __init int mcheck_init_device(void)
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
goto err_out_online;
- hp_online = err;

register_syscore_ops(&mce_syscore_ops);

- /* register character device /dev/mcelog */
- err = misc_register(&mce_chrdev_device);
- if (err)
- goto err_register;
-
return 0;

-err_register:
- unregister_syscore_ops(&mce_syscore_ops);
- cpuhp_remove_state(hp_online);
-
err_out_online:
cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

@@ -2661,7 +2312,7 @@ static __init int mcheck_init_device(void)
free_cpumask_var(mce_device_initialized);

err_out:
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ pr_err("Unable to init MCE device (rc: %d)\n", err);

return err;
}
--
2.11.0