Re: [PATCH 12/36] drivers edac add nmi ecc harvesting

From: Andrew Morton
Date: Thu Jun 07 2007 - 17:58:47 EST


On Sun, 3 Jun 2007 08:11:02 -0700 (PDT)
Doug Thompson <norsk5@xxxxxxxxx> wrote:

> From: Dave Jiang <djiang@xxxxxxxxxx>
>
> Provides a way for NMI-reported errors on x86 to notify the EDAC
> subsystem of pending ECC errors by writing to a software state variable.
>
> Signed-off-by: Dave Jiang <djiang@xxxxxxxxxx>
> Signed-off-by: Douglas Thompson <dougthompson@xxxxxxxxxxxx>
>

(cc Andi: this affects x86 core code)

>
> Here's the reworked patch. I added an EDAC stub to the kernel so we can
> have variables that are in the kernel even if EDAC is a module. I also
> implemented the idea of using the chip driver to select error detection
> mode via module parameter and eliminate the kernel compile option.
> Please review/test. Thx!
>
> Also, I only made changes to some of the chipset drivers since I am
> unfamiliar with the other ones. We can add similar changes as we go.
>
>
> --- linux-2.6.22-rc1.orig/arch/i386/kernel/traps.c
> +++ linux-2.6.22-rc1/arch/i386/kernel/traps.c
> @@ -41,6 +41,10 @@
> #include <linux/mca.h>
> #endif
>
> +#if defined(CONFIG_EDAC)
> +#include <linux/edac.h>
> +#endif

The ifdef here shouldn't be needed? (If it is, please fix it in edac.h so
that the header can be included unconditionally.)

> #include <asm/processor.h>
> #include <asm/system.h>
> #include <asm/io.h>
> @@ -635,6 +639,14 @@ mem_parity_error(unsigned char reason, s
> printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
> "CPU %d.\n", reason, smp_processor_id());
> printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
> +
> +#if defined(CONFIG_EDAC)
> + if(edac_handler_set()) {
> + edac_atomic_assert_error();
> + return;
> + }
> +#endif

if-is-not-a-function: please put a space after the keyword, i.e.
`if (edac_handler_set()) {'.

> +
> if (panic_on_unrecovered_nmi)
> panic("NMI: Not continuing");
>
> Index: linux-2.6.22-rc1/arch/x86_64/kernel/traps.c
> ===================================================================
> --- linux-2.6.22-rc1.orig/arch/x86_64/kernel/traps.c
> +++ linux-2.6.22-rc1/arch/x86_64/kernel/traps.c
> @@ -34,6 +34,10 @@
> #include <linux/bug.h>
> #include <linux/kdebug.h>
>
> +#if defined(CONFIG_EDAC)
> +#include <linux/edac.h>
> +#endif
> +
> #include <asm/system.h>
> #include <asm/io.h>
> #include <asm/atomic.h>
> @@ -716,6 +720,13 @@ mem_parity_error(unsigned char reason, s
> reason);
> printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
>
> +#if defined(CONFIG_EDAC)
> + if(edac_handler_set()) {
> + edac_atomic_assert_error();
> + return;
> + }
> +#endif

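Ditto for both comments above: the ifdef around the include shouldn't be
needed, and `if' wants a space after it.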

> /*
> + * handler for EDAC to check if NMI type handler has asserted interrupt
> + */
> +static int edac_assert_error_check_and_clear(void)
> +{
> + int vreg;
> +
> + if(edac_op_state == EDAC_OPSTATE_POLL)
> + return 1;
> +
> + vreg = atomic_read(&edac_err_assert);
> + if(vreg) {
> + atomic_set(&edac_err_assert, 0);
> + return 1;
> + }
> +
> + return 0;
> +}

if-is-still-not-a-function ;)

Please check all patches for this.

> --- /dev/null
> +++ linux-2.6.22-rc1/drivers/edac/edac_stub.c
> @@ -0,0 +1,42 @@
> +/*
> + * common EDAC components that must be in kernel
> + *
> + * Author: Dave Jiang <djiang@xxxxxxxxxx>
> + *
> + * 2007 (c) MontaVista Software, Inc. This file is licensed under
> + * the terms of the GNU General Public License version 2. This program
> + * is licensed "as is" without any warranty of any kind, whether express
> + * or implied.
> + *
> + */
> +#include <linux/module.h>
> +#include <linux/edac.h>
> +#include <asm/atomic.h>
> +#include <asm/edac.h>
> +
> +int edac_op_state = EDAC_OPSTATE_INVAL;
> +EXPORT_SYMBOL(edac_op_state);
> +
> +atomic_t edac_handlers = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(edac_handlers);
> +
> +atomic_t edac_err_assert = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(edac_err_assert);
> +
> +inline int edac_handler_set(void)
> +{
> + if (edac_op_state == EDAC_OPSTATE_POLL)
> + return 0;
> +
> + return atomic_read(&edac_handlers);
> +}
> +EXPORT_SYMBOL(edac_handler_set);
> +
> +/*
> + * handler for NMI type of interrupts to assert error
> + */
> +inline void edac_atomic_assert_error(void)
> +{
> + atomic_set(&edac_err_assert, 1);
> +}
> +EXPORT_SYMBOL(edac_atomic_assert_error);

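Remove the `inline' keywords here: these functions are exported with
EXPORT_SYMBOL and called from other files, so `inline' makes no sense on
them.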

> Index: linux-2.6.22-rc1/drivers/edac/e752x_edac.c
> ===================================================================
> --- linux-2.6.22-rc1.orig/drivers/edac/e752x_edac.c
> +++ linux-2.6.22-rc1/drivers/edac/e752x_edac.c
> @@ -22,6 +22,7 @@
> #include <linux/pci.h>
> #include <linux/pci_ids.h>
> #include <linux/slab.h>
> +#include <linux/edac.h>
> #include "edac_mc.h"
>
> #define E752X_REVISION " Ver: 2.0.1 " __DATE__
> @@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *
> debugf0("%s(): mci\n", __func__);
> debugf0("Starting Probe1\n");
>
> + /* make sure error reporting method is sane */
> + switch(edac_op_state) {

switch-is-not-a-function-either! (Please write `switch (edac_op_state) {'.)

> + case EDAC_OPSTATE_POLL:
> + case EDAC_OPSTATE_NMI:
> + break;
> + default:
> + edac_op_state = EDAC_OPSTATE_POLL;
> + break;
> + }
> +

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/