[PATCH-2] NMI trap revised (was Re: NMI errors in 2.0.30??)

Riccardo Facchetti (fizban@mbox.vol.it)
Fri, 9 May 1997 00:39:01 +0200 (MET DST)


I have implemented the memory check.
Someone suggested that I should read instead of write. Read twice ...
ah ... heh ... I remember ... processor caches ... hmmm, in the next patch
I will correct this thing :)
nghe ... I'm just curious whether it works or not (heh ... I have no way to
test it). I suspect the memory test should be done inside a cli()/sti() pair,
because we do not want to be disturbed by NMIs not generated intentionally by
us.

Hmmm, I have seen an enable_NMI or something like that in the pre-2.1.37-5
patch. It uses bit 3 of port 0x61, but my manual states that bit 3
enables/disables the I/O Channel Check while bit 2 enables/disables the
Memory Parity Check. Who is wrong? Me or the pre-kernel patch?

Mah ... here is a new patch against 2.1.36.
Is someone able to test it? :)
Heh ... another thing ... I hope I have understood the
phys_to_virt/virt_to_phys mechanism correctly :))))))

Ciao,
Riccardo.
-----------------------------
--- linux-2.1.36/arch/i386/kernel/traps.c Mon May 5 12:05:19 1997
+++ linux/arch/i386/kernel/traps.c Fri May 9 00:26:12 1997
@@ -2,6 +2,9 @@
* linux/arch/i386/traps.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-05-07 Modified do_nmi() by Riccardo Facchetti to try to display some
+ * useful information instead of the old generic message.
*/

/*
@@ -20,6 +23,7 @@
#include <linux/config.h>
#include <linux/timer.h>
#include <linux/mm.h>
+#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
@@ -235,18 +239,167 @@
unlock_kernel();
}

+static int badmem_search_done = 0; /* set to 1 the first time the memory scan below runs, so it happens at most once */
asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
{
- printk("NMI\n"); show_registers(regs);
+#ifndef CONFIG_IGNORE_NMI
+ unsigned int nmi_info = 0; /* working copy of the byte read from port 0x61 */
+ unsigned char * local_page = 0; /* scratch buffer each page is copied into during the scan */
+ /*
+ * Start of memory is phys 0, end of memory
+ * is phys (num_physpages << PAGE_SHIFT)
+ */
+ unsigned char * page_ptr = phys_to_virt(0);
+ unsigned char * memory_end = phys_to_virt(num_physpages << PAGE_SHIFT);
+
+/*
+ * Before doing anything else, get the byte from port 0x61 (System Board
+ * I/O Port). It should contain information about what caused the NMI (at
+ * least this is what the old reference manual I have in front of me
+ * states).
+ */
+ nmi_info = inb_p(0x61);
+#endif
+
#ifdef CONFIG_SMP_NMI_INVAL
smp_flush_tlb_rcv();
#else
#ifndef CONFIG_IGNORE_NMI
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips or a\n");
- printk("power saving mode enabled.\n");
+
+ printk("NMI received.\n");
+
+/*
+ * Test bits 7 and 6 for a Memory Parity or I/O Channel error.
+ */
+ if (nmi_info & 0xC0) {
+/*
+ * This is a real error condition; sort out which error occurred.
+ */
+ if (nmi_info & 0x80) {
+ printk("RAM Parity Check: memory parity error.\n");
+
+ /*
+ * Try to sort out which memory chip is failing.
+ * With parity memory we can attempt this.
+ */
+
+ /*
+ * Disable NMI interrupts by writing 1 to bit 7 of
+ * port 0x70
+ */
+ outb_p(0x80, 0x70);
+
+ /*
+ * Reset the NMI memory parity error flag (bit 7) by
+ * toggling bit 2 of port 0x61 to 1 and then back to 0
+ */
+ nmi_info |= 4;
+ outb_p(nmi_info, 0x61);
+ nmi_info &= ~4;
+ outb_p(nmi_info, 0x61);
+
+ local_page = kmalloc(PAGE_SIZE, GFP_KERNEL); /* NOTE(review): allocated before the badmem_search_done check and no kfree() is visible in this function; GFP_KERNEL presumably may sleep -- confirm this is acceptable in an NMI handler */
+
+ /*
+ * while not all physical memory is tested:
+ * read a page from memory;
+ * test whether the parity flag (bit 7 of port
+ * 0x61) is set: if yes, the last page read is
+ * bogus, printk its phys address;
+ * ++page;
+ */
+ if (local_page && !badmem_search_done) {
+ /*
+ * Do it only once.
+ */
+ badmem_search_done = 1;
+
+ printk("Testing memory:\n");
+
+ /*
+ * Should we cli()/sti() here ?
+ * We don't want to be disturbed while the
+ * memory test is in progress. Allowing the
+ * system to work as usual can produce unexpected
+ * NMI errors.
+ */
+
+ while (page_ptr < memory_end) {
+ memcpy(local_page, page_ptr, PAGE_SIZE);
+ nmi_info = inb_p(0x61);
+ if (nmi_info & 0x80) {
+ printk("\tMemory Parity Error at physical page 0x%p\n", (unsigned char *)virt_to_phys(page_ptr));
+ /*
+ * Reset the NMI flag and keep searching for
+ * more parity errors.
+ */
+ nmi_info |= 4;
+ outb_p(nmi_info, 0x61);
+ nmi_info &= ~4;
+ outb_p(nmi_info, 0x61);
+ }
+ page_ptr += PAGE_SIZE;
+ }
+ }
+
+ /*
+ * Should we panic() out ?
+ * We can map the bad pages out of the used memory,
+ * but a bad page may well mean that the whole chip
+ * is bad.
+ * Anyway, for now let things go.
+ */
+
+ /*
+ * Reset the NMI memory parity error flag (bit 7) by
+ * toggling bit 2 of port 0x61 to 1 and then back to 0
+ */
+ nmi_info |= 4;
+ outb_p(nmi_info, 0x61);
+ nmi_info &= ~4;
+ outb_p(nmi_info, 0x61);
+
+ /*
+ * Re-enable NMI interrupts by writing 0 to bit 7 of
+ * port 0x70
+ */
+ outb_p(0x00, 0x70);
+ }
+
+ if (nmi_info & 0x40) {
+ printk("I/O Channel Check: I/O channel adapter error.\n");
+ /*
+ * I don't have a clue how to get more
+ * information about what is failing here.
+ * Let things go.
+ */
+
+ /*
+ * Reset the NMI I/O channel adapter error flag (bit 6) by
+ * toggling bit 3 of port 0x61 to 1 and then back to 0
+ */
+ nmi_info |= 8;
+ outb_p(nmi_info, 0x61);
+ nmi_info &= ~8;
+ outb_p(nmi_info, 0x61);
+ }
+/*
+ * Might it be safer to panic here ? We have an error on a memory chip or on
+ * the I/O channel adapter, so if we care about our data, we should stop everything now.
+ */
+ } else {
+/*
+ * Huh ?? unexpected NMI !!
+ */
+
+ printk("Uhhuh. Dazed and confused, but trying to continue.\n");
+ printk("You probably have a power saving mode enabled.\n");
+ }
+
#endif
#endif
+
+ show_registers(regs);
}

asmlinkage void do_debug(struct pt_regs * regs, long error_code)