[PATCH 06/10] x86, mce: make mce_log buffer per-CPU

From: Hidetoshi Seto
Date: Mon Oct 05 2009 - 02:40:40 EST


On larger systems the global 32-entry buffer for mcelog easily
overflows and loses events. There is also a known livelock, now hit
by more people, under high error rates.

This patch fixes these issues by making the MCE log buffer per-CPU:

+ MCEs are added to the local per-CPU buffer of the detecting CPU,
  instead of to one big global buffer. Contention/unfairness between
  CPUs is eliminated.

The reader/writer convention is unchanged (i.e. lock-less on the
writer side):

+ An MCE log writer may run in NMI context, so the writer side must
  be lock-less. Writers to the per-CPU buffer of a given CPU may come
  from process, IRQ or NMI context, so cmpxchg_local() is used to
  allocate buffer space (see the first sketch after this list).

+ MCE records are read out of and removed from the per-CPU buffers
  by a mutex-protected global reader function, since in most cases
  there are not many readers in the system to contend. In other
  words, the reader side is protected with a mutex to guarantee that
  only one reader is active in the whole system (see the second
  sketch below).
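
For reference, the core of the writer path after this change,
condensed from mce_log() in the diff below (the mce->finished
handshake back to the caller is abbreviated):

	/*
	 * Claim the next free slot in this CPU's buffer with
	 * cmpxchg_local(), which is safe against interruption by
	 * IRQ/NMI writers on the same CPU.
	 */
	do {
		entry = mcelog_cpu->next;
		for (;;) {
			/* Buffer full: discard the new event. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry: skip it. */
			if (mcelog_cpu->entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		next = entry + 1;
	} while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);

	memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
	wmb();		/* make data visible before setting finished */
	mcelog_cpu->entry[entry].finished = 1;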

As a result, each CPU now has its own local 32-entry buffer.
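
The reader side then drains those buffers one CPU at a time. A
condensed sketch of the convention, following mce_read() and
mce_read_cpu() in the diff below (copying to user space elided):

	mutex_lock(&mce_read_mutex);	/* one reader system-wide */
	for_each_possible_cpu(cpu) {
		mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
		prev = 0;
		do {
			next = mcelog_cpu->next;
			/* ... copy entries [prev, next) to user
			   space, then clear them ... */
			prev = next;
			/*
			 * Plain cmpxchg(), not _local: the reader
			 * runs on an arbitrary CPU.  Reset ->next to
			 * 0; if a new record arrived meanwhile, loop
			 * again to pick it up.
			 */
			next = cmpxchg(&mcelog_cpu->next, prev, 0);
		} while (next != prev);
	}
	mutex_unlock(&mce_read_mutex);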

HS: Add a member header_len to struct mce_log to help debuggers find
where the array of records starts (see the sketch below).
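
With header_len, nr_record and record_len, an external tool that has
located a per-CPU buffer can compute record addresses without
hard-coding the struct mce_log_cpu layout. A hypothetical
debugger-side helper (mce_record() is not part of this patch):

	/* Record i inside one per-CPU buffer, using only the spec
	 * fields carried in struct mce_log. (Hypothetical helper.) */
	static struct mce *mce_record(void *cpu_buf,
				      const struct mce_log *log,
				      unsigned int i)
	{
		if (i >= log->nr_record)
			return NULL;
		return (struct mce *)((char *)cpu_buf +
				      log->header_len +
				      i * log->record_len);
	}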

(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")

Originally-From: Huang Ying <ying.huang@xxxxxxxxx>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@xxxxxxxxxxxxxx>
---
arch/x86/include/asm/mce.h | 37 ++++++----
arch/x86/kernel/cpu/mcheck/mce.c | 139 +++++++++++++++++++++++++++-----------
2 files changed, 120 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2f1c0ef..c5d4144 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -52,7 +52,7 @@
#define MCE_INJ_NMI_BROADCAST (1 << 2) /* do NMI broadcasting */
#define MCE_INJ_EXCEPTION (1 << 3) /* raise as exception */

-/* Fields are zero when not available */
+/* MCE log entry. Fields are zero when not available. */
struct mce {
__u64 status;
__u64 misc;
@@ -63,12 +63,12 @@ struct mce {
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
- __u16 pad;
+ __u16 pad;
__u32 cpuid; /* CPUID 1 EAX */
- __u8 cs; /* code segment */
+ __u8 cs; /* code segment */
__u8 bank; /* machine check bank */
__u8 cpu; /* cpu number; obsolete; use extcpu now */
- __u8 finished; /* entry is valid */
+ __u8 finished; /* 1 if write to entry is finished & entry is valid */
__u32 extcpu; /* linux cpu number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
@@ -76,26 +76,33 @@ struct mce {
};

/*
- * This structure contains all data related to the MCE log. Also
- * carries a signature to make it easier to find from external
- * debugging tools. Each entry is only valid when its finished flag
- * is set.
+ * This structure contains all data related to the MCE log. Also carries
+ * a signature to make it easier to find from external debugging tools.
+ * Each entry is only valid when its finished flag is set.
*/

-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+
+struct mce_log_cpu;

struct mce_log {
- char signature[12]; /* "MACHINECHECK" */
- unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
+ char signature[12]; /* "MACHINECHEC2" */
+
+ /* points the table of per-CPU buffers */
+ struct mce_log_cpu **mcelog_cpus;
+ unsigned int nr_mcelog_cpus; /* = num_possible_cpus() */
+
+ /* spec of per-CPU buffer */
+ unsigned int header_len; /* offset of array "entry" */
+ unsigned int nr_record; /* array size (= MCE_LOG_LEN) */
+ unsigned int record_len; /* length of struct mce */
+
unsigned flags;
- unsigned recordlen; /* length of struct mce */
- struct mce entry[MCE_LOG_LEN];
};

#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */

-#define MCE_LOG_SIGNATURE "MACHINECHECK"
+#define MCE_LOG_SIGNATURE "MACHINECHEC2"

#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 684b42e..ad2eb89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -122,21 +122,30 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
* separate MCEs from kernel messages to avoid bogus bug reports.
*/

+struct mce_log_cpu {
+ unsigned next;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
static struct mce_log mcelog = {
.signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
+ .header_len = offsetof(struct mce_log_cpu, entry),
+ .nr_record = MCE_LOG_LEN,
+ .record_len = sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
unsigned next, entry;

mce->finished = 0;
wmb();

do {
- entry = rcu_dereference(mcelog.next);
+ entry = mcelog_cpu->next;
for (;;) {
/*
* When the buffer fills up discard new entries.
@@ -149,7 +158,7 @@ void mce_log(struct mce *mce)
return;
}
/* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
+ if (mcelog_cpu->entry[entry].finished) {
entry++;
continue;
}
@@ -157,12 +166,12 @@ void mce_log(struct mce *mce)
}
smp_rmb();
next = entry + 1;
- } while (cmpxchg(&mcelog.next, entry, next) != entry);
+ } while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);

- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));

wmb();
- mcelog.entry[entry].finished = 1;
+ mcelog_cpu->entry[entry].finished = 1;
wmb();
mce->finished = 1;
set_bit(0, &mce_need_notify);
@@ -210,6 +219,26 @@ static void print_mce_tail(void)
"Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+ int i;
+ struct mce_log_cpu *mcelog_cpu;
+
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog_cpu->entry[i];
+ if (!m->finished)
+ continue;
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if ((m->status & mask) != res)
+ continue;
+ if (final && !memcmp(m, final, sizeof(struct mce)))
+ continue;
+ print_mce(m);
+ }
+}
+
#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;
@@ -232,7 +261,7 @@ static void wait_for_panic(void)

static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int cpu;

if (!fake_panic) {
/*
@@ -251,23 +280,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
}
print_mce_head();
/* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- print_mce(m);
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
- print_mce(m);
- }
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+ /* Print uncorrected but without the final one */
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+ /* Finally print the final mce */
if (final)
print_mce(final);
if (cpu_missing)
@@ -1234,6 +1252,22 @@ static int __cpuinit mce_cap_init(void)
return 0;
}

+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+ int cpu;
+
+ if (mcelog.mcelog_cpus)
+ return;
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
+ mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+ GFP_KERNEL);
+ for_each_possible_cpu(cpu)
+ mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -1404,6 +1438,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
mce_disabled = 1;
return;
}
+ mce_log_init();

machine_check_vector = do_machine_check;

@@ -1452,13 +1487,16 @@ static int mce_release(struct inode *inode, struct file *file)
return 0;
}

-static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
+static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
{
+ struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
char __user *ubuf = inubuf;
unsigned prev, next;
int i, err;

- next = rcu_dereference(mcelog.next);
+ next = mcelog_cpu->next;
+ if (!next)
+ return 0;

err = 0;
prev = 0;
@@ -1466,9 +1504,9 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
for (i = prev; i < next; i++) {
int timeout = WRITER_TIMEOUT_NS;

- while (!mcelog.entry[i].finished) {
+ while (!mcelog_cpu->entry[i].finished) {
if (timeout-- <= 0) {
- memset(mcelog.entry + i, 0,
+ memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
printk(KERN_WARNING "mcelog: timeout "
"waiting for writer to finish!\n");
@@ -1477,27 +1515,33 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
ndelay(1);
}
smp_rmb();
- err |= copy_to_user(ubuf, mcelog.entry + i,
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
sizeof(struct mce));
ubuf += sizeof(struct mce);
timeout:
;
}

- memset(mcelog.entry + prev, 0,
+ memset(mcelog_cpu->entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
+ next = cmpxchg(&mcelog_cpu->next, prev, 0);
} while (next != prev);

- synchronize_sched();
-
return err ? -EFAULT : ubuf - inubuf;
}

static int mce_empty(void)
{
- return !rcu_dereference(mcelog.next);
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ if (mcelog_cpu->next)
+ return 0;
+ }
+ return 1;
}

static DEFINE_MUTEX(mce_read_mutex);
@@ -1506,7 +1550,7 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
loff_t *off)
{
char __user *ubuf = inubuf;
- int err;
+ int cpu, err = 0;

/* Only supports full reads right now */
if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
@@ -1514,12 +1558,25 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,

mutex_lock(&mce_read_mutex);

- err = mce_read_buf(ubuf, usize);
- if (err > 0) {
- ubuf += err;
- err = 0;
+ while (!mce_empty()) {
+ for_each_possible_cpu(cpu) {
+ if (usize < MCE_LOG_LEN * sizeof(struct mce))
+ goto out;
+ err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ err = 0;
+ } else if (err < 0)
+ goto out;
+ }
+ if (need_resched()) {
+ mutex_unlock(&mce_read_mutex);
+ cond_resched();
+ mutex_lock(&mce_read_mutex);
+ }
}
-
+out:
mutex_unlock(&mce_read_mutex);

return err ? err : ubuf - inubuf;
--
1.6.4.3

