Re: [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer

From: Hidetoshi Seto
Date: Mon Oct 05 2009 - 03:09:06 EST


This is the diff between Huang's original change and the result of changes
by my patch set.

I'll going to explain what I changed from the original.

> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 2d5c42a..4b5ef3c 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -52,7 +52,7 @@
> #define MCE_INJ_NMI_BROADCAST (1 << 2) /* do NMI broadcasting */
> #define MCE_INJ_EXCEPTION (1 << 3) /* raise as exception */
>
> -/* Fields are zero when not available */
> +/* MCE log entry. Fields are zero when not available. */
> struct mce {
> __u64 status;
> __u64 misc;
> @@ -63,12 +63,12 @@ struct mce {
> __u64 time; /* wall time_t when error was detected */
> __u8 cpuvendor; /* cpu vendor as encoded in system.h */
> __u8 inject_flags; /* software inject flags */
> - __u16 pad;
> + __u16 pad;
> __u32 cpuid; /* CPUID 1 EAX */
> - __u8 cs; /* code segment */
> + __u8 cs; /* code segment */
> __u8 bank; /* machine check bank */
> __u8 cpu; /* cpu number; obsolete; use extcpu now */
> - __u8 finished; /* entry is valid */
> + __u8 finished; /* 1 if write to entry is finished & entry is valid */
> __u32 extcpu; /* linux cpu number that detected the error */
> __u32 socketid; /* CPU socket ID */
> __u32 apicid; /* CPU initial apic ID */
> @@ -76,10 +76,9 @@ struct mce {
> };
>
> /*
> - * This structure contains all data related to the MCE log. Also
> - * carries a signature to make it easier to find from external
> - * debugging tools. Each entry is only valid when its finished flag
> - * is set.
> + * This structure contains all data related to the MCE log. Also carries
> + * a signature to make it easier to find from external debugging tools.
> + * Each entry is only valid when its finished flag is set.
> */

Small clean up.

>
> #define MCE_LOG_LEN 32
> @@ -93,12 +92,18 @@ static inline int mce_log_index(int n)
> struct mce_log_cpu;
>
> struct mce_log {
> - char signature[12]; /* "MACHINECHEC2" */
> - unsigned len; /* = MCE_LOG_LEN */
> - unsigned flags;
> - unsigned recordlen; /* length of struct mce */
> - unsigned nr_mcelog_cpus;
> + char signature[12]; /* "MACHINECHEC2" */
> +
> + /* points the table of per-CPU buffers */
> struct mce_log_cpu **mcelog_cpus;
> + unsigned int nr_mcelog_cpus; /* = num_possible_cpus() */
> +
> + /* spec of per-CPU buffer */
> + unsigned int header_len; /* offset of array "entry" */
> + unsigned int nr_record; /* array size (= MCE_LOG_LEN) */
> + unsigned int record_len; /* length of struct mce */
> +
> + unsigned flags;
> };

Add a member header_len, and renamed "len" to "nr_record" which is easier
to understand. Please refer the description of patch 6/10.

>
> #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 5db3f5b..fad3daa 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -122,54 +122,53 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
> * separate MCEs from kernel messages to avoid bogus bug reports.
> */
>
> -static struct mce_log mcelog = {
> - .signature = MCE_LOG_SIGNATURE,
> - .len = MCE_LOG_LEN,
> - .recordlen = sizeof(struct mce),
> -};
> -
> struct mce_log_cpu {
> int head;
> int tail;
> - unsigned long flags;
> struct mce entry[MCE_LOG_LEN];
> };

Removed "flags" from per-CPU structure since mce_ioctl only cares global flags
in struct mce_log.

>
> DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
>
> +static struct mce_log mcelog = {
> + .signature = MCE_LOG_SIGNATURE,
> + .header_len = offsetof(struct mce_log_cpu, entry),
> + .nr_record = MCE_LOG_LEN,
> + .record_len = sizeof(struct mce),
> +};
> +
> void mce_log(struct mce *mce)
> {
> struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
> int head, ihead, tail, next;
>
> + /* mce->finished must be set to 0 before written to buffer */
> mce->finished = 0;
> - /*
> - * mce->finished must be set to 0 before written to ring
> - * buffer
> - */
> smp_wmb();
> +
> do {
> head = mcelog_cpu->head;
> tail = mcelog_cpu->tail;
> ihead = mce_log_index(head);
> +
> /*
> * When the buffer fills up discard new entries.
> * Assume that the earlier errors are the more
> * interesting.
> */
> if (ihead == mce_log_index(tail) && head != tail) {
> - set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
> + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);

Use global flags which is cared by mce_ioctl.

> return;
> }
> next = head == MCE_LOG_LIMIT ? 0 : head + 1;
> } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
> +
> memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
> - /*
> - * ".finished" of MCE record in ring buffer must be set after
> - * copy
> - */
> +
> + /* ".finished" of MCE record in buffer must be set after copy */
> smp_wmb();
> mcelog_cpu->entry[ihead].finished = 1;
> +
> /* bit 0 of notify_user should be set after finished be set */
> smp_wmb();
> mce->finished = 1;


Changes from here to ....

> @@ -1195,19 +1194,40 @@ int mce_notify_irq(void)
> }
> EXPORT_SYMBOL_GPL(mce_notify_irq);
>
> -static int mce_banks_init(void)
> +static __cpuinit int mce_banks_init(void)
> {
> int i;
>
> mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
> if (!mce_banks)
> return -ENOMEM;
> +
> for (i = 0; i < banks; i++) {
> struct mce_bank *b = &mce_banks[i];
>
> b->ctl = -1ULL;
> b->init = 1;
> }
> +
> + return 0;
> +}
> +
> +/*
> + * Initialize MCE per-CPU log buffer
> + */
> +static __cpuinit int mce_log_init(void)
> +{
> + int cpu;
> +
> + mcelog.nr_mcelog_cpus = num_possible_cpus();
> + mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> + GFP_KERNEL);
> + if (!mcelog.mcelog_cpus)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu)
> + mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> +
> return 0;
> }
>
> @@ -1234,6 +1254,7 @@ static int __cpuinit mce_cap_init(void)
> /* Don't support asymmetric configurations today */
> WARN_ON(banks != 0 && b != banks);
> banks = b;
> +
> if (!mce_banks) {
> int err = mce_banks_init();
>
> @@ -1241,6 +1262,13 @@ static int __cpuinit mce_cap_init(void)
> return err;
> }
>
> + if (!mcelog.mcelog_cpus) {
> + int err = mce_log_init();
> +
> + if (err)
> + return err;
> + }
> +
> /* Use accurate RIP reporting if available. */
> if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
> rip_msr = MSR_IA32_MCG_EIP;
> @@ -1251,22 +1279,6 @@ static int __cpuinit mce_cap_init(void)
> return 0;
> }
>
> -/*
> - * Initialize MCE per-CPU log buffer
> - */
> -static __cpuinit void mce_log_init(void)
> -{
> - int cpu;
> -
> - if (mcelog.mcelog_cpus)
> - return;
> - mcelog.nr_mcelog_cpus = num_possible_cpus();
> - mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> - GFP_KERNEL);
> - for_each_possible_cpu(cpu)
> - mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> -}
> -
> static void mce_init(void)
> {
> mce_banks_t all_banks;
> @@ -1437,7 +1449,6 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
> mce_disabled = 1;
> return;
> }
> - mce_log_init();
>
> machine_check_vector = do_machine_check;
>

... here are what done in patch 10/10.

> @@ -1486,15 +1497,14 @@ static int mce_release(struct inode *inode, struct file *file)
> return 0;
> }
>
> -static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
> - char __user *inubuf, size_t usize)
> +static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
> {
> + struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);

Changed 1st arg to cpu number.

> char __user *ubuf = inubuf;
> int head, tail, pos, i, err = 0;
>
> head = mcelog_cpu->head;
> tail = mcelog_cpu->tail;
> -
> if (head == tail)
> return 0;
>
> @@ -1503,13 +1513,14 @@ static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
> i = mce_log_index(pos);
> if (!mcelog_cpu->entry[i].finished) {
> int timeout = WRITER_TIMEOUT_NS;
> +
> while (!mcelog_cpu->entry[i].finished) {
> if (timeout-- <= 0) {
> memset(mcelog_cpu->entry + i, 0,
> sizeof(struct mce));
> head = mcelog_cpu->head;
> printk(KERN_WARNING "mcelog: timeout "
> - "waiting for writer to finish!\n");
> + "waiting for writer to finish!\n");
> goto timeout;
> }
> ndelay(1);
> @@ -1538,11 +1549,6 @@ timeout:
> return err ? -EFAULT : ubuf - inubuf;
> }
>
> -static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
> -{
> - return mcelog_cpu->head == mcelog_cpu->tail;
> -}
> -
> static int mce_empty(void)
> {
> int cpu;
> @@ -1550,33 +1556,34 @@ static int mce_empty(void)
>
> for_each_possible_cpu(cpu) {
> mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> - if (!mce_empty_cpu(mcelog_cpu))
> + if (mcelog_cpu->head != mcelog_cpu->tail)

Inlined. Or it would be better to have static inlines in header file.

> return 0;
> }
> return 1;
> }
>
> +static DEFINE_MUTEX(mce_read_mutex);
> +
> static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> loff_t *off)
> {
> char __user *ubuf = inubuf;
> - struct mce_log_cpu *mcelog_cpu;
> - int cpu, new_mce, err = 0;
> - static DEFINE_MUTEX(mce_read_mutex);
> + int cpu, err = 0;
> +
> + /* Only supports full reads right now */
> + if (*off != 0 || usize < sizeof(struct mce))
> + return -EINVAL;

Picked up what implicitly dropped.

>
> mutex_lock(&mce_read_mutex);
> - do {
> - new_mce = 0;
> +
> + while (!mce_empty()) {
> for_each_possible_cpu(cpu) {
> if (usize < sizeof(struct mce))
> goto out;
> - mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> - err = mce_read_cpu(mcelog_cpu, ubuf,
> - sizeof(struct mce));
> + err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
> if (err > 0) {
> ubuf += sizeof(struct mce);
> usize -= sizeof(struct mce);
> - new_mce = 1;
> err = 0;
> } else if (err < 0)
> goto out;
> @@ -1586,9 +1593,10 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> cond_resched();
> mutex_lock(&mce_read_mutex);
> }
> - } while (new_mce || !mce_empty());
> + }

I could not catch the intent of "new_mce," so replaced do-while by
simple while statement.

> out:
> mutex_unlock(&mce_read_mutex);
> +
> return err ? err : ubuf - inubuf;
> }
>

That's all.

Thanks,
H.Seto

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/