PATCH: memory test at boot for 2.0.34

Frank Gockel (gockel@sent13.uni-duisburg.de)
Mon, 13 Jul 1998 09:51:17 +0200 (MET DST)


Hello,

On Friday, I once again ran into a problem with a computer with bad RAM. It's
always the same. Windoze crashes (ehm...), Linux crashes (uh!), but old
Dos himem.sys finally finds the reason and says there's bad memory.

Why the hell does Linux not test the memory at an early boot stage?

Perhaps because nobody has written the code yet? This is no longer true :)
The diff below is against 2.0.34 and implements my first attempt at a memory
test for the i386 architecture.

It works in two passes. The first pass checks unused memory below the
magic 4MB boundary before uncompressing the kernel. If it finds a problem
then it just halts the computer with a message and lists the bad addresses.
(Well, there's not much to do in that case. The lower 4MB are used for
decompressing the kernel. If they are faulty then good night.)

The second pass does even more. It runs just after the page tables have been
set up and tests the memory above 4MB. If it detects faulty memory, the whole
page is marked reserved, which should keep the rest of the kernel away from
using it. So the computer can even run with slightly faulty RAM.

No religious wars please. I know using bad memory is not good at all, and I
know this diff is a dirty hack in some places. But Linux tests for lots of
i386 bugs (fdiv, f00f, hlt, etc.) and also for other hardware bugs (e.g.
CMD640) and even has workarounds for them. Why not do something similar
for bad memory? And yes, I know memory testing is not trivial. My test may
not be good enough for some kind of hardware. It may also get fooled by the
cache. Maybe simply the order of tests has to be changed to catch a larger
amount of possible errors. I'm not a hardware hacker. :)

And, well, at least now I _know_ when my RAM is going to die. :) It detects
the problems in our bad 32M RAM bank that made me originally write the
patch.

For those who do not like the memory test I've implemented a command line
option 'mem=notest' which can be used to switch the test off.

The new bootup messages look like this (the dots are printed as the
test proceeds, so you know the computer is still running). To be honest, the
first memory test message is slightly misleading - the test always skips the
area from 0x90000 to 0x100000, which is usually occupied by the Linux boot
code, the video adapter, the BIOS, and possibly some other pieces of hardware:

LILO Loading Linux......
Testing memory from 0x0005E8F4 to 0x00400000............................ok.
Uncompressing Linux...done.
Now booting the kernel
Testing memory from 0x00400000 to 0x02000000............................ok.
Console: 16 point font, 400 scans

Okay. Get that old computer with the faulty RAM out of the cupboard, yes
that one you never wanted to throw away :) And boot Linux. Let me know
whether it now detects RAM problems as early as possible. Also see whether
it even runs with (slightly) bad RAM, now that it avoids the pages that
failed the test. :))

Frank

--- linux/arch/i386/kernel/setup.c.old Sun Jul 12 19:17:28 1998
+++ linux/arch/i386/kernel/setup.c Sun Jul 12 19:20:16 1998
@@ -113,6 +113,7 @@
char c = ' ', *to = command_line, *from = COMMAND_LINE;
int len = 0;
static unsigned char smptrap=0;
+ extern int x86_do_memtest;

if(smptrap==1)
{
@@ -154,6 +155,7 @@
/*
* "mem=nopentium" disables the 4MB page tables.
* "mem=XXX[kKmM]" overrides the BIOS-reported
* memory size
+ * "mem=notest" disables memory test
*/
if (c == ' ' && *(const unsigned long *)from == *(const unsigned long *)"mem=") {
@@ -161,6 +163,9 @@
if (!memcmp(from+4, "nopentium", 9)) {
from += 9+4;
x86_capability &= ~8;
+ } else if (!memcmp(from+4, "notest", 6)) {
+ from += 6+4;
+ x86_do_memtest=0;
} else {
memory_end = simple_strtoul(from+4, &from, 0);
if ( *from == 'K' || *from == 'k' ) {
--- linux/arch/i386/mm/init.c.old Sun Jul 12 11:04:15 1998
+++ linux/arch/i386/mm/init.c Sun Jul 12 20:54:08 1998
@@ -105,6 +105,147 @@
#endif
}

+/* ARGHH. printk does not work here because init_console has not yet been
+ called. So we use outsk, a dirty hack. I really _want_ to see output. */
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+/*#define SCREEN_INFO (*(struct screen_info *)0x90000)*/
+/* setup_arch has already been called and copied this, so we need that one */
+#define SCREEN_INFO screen_info
+/* Scroll a text screen of `lines` rows by `cols` columns at vidmem up by one row, then blank the last row's characters. */
+static void scrollk(char*vidmem,int cols,int lines)
+{
+ int i;
+
+ memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); /* a row is cols*2 bytes; NOTE(review): source and destination overlap -- confirm memcpy copies forward here */
+ for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
+ vidmem[i] = ' '; /* only the even byte offsets are overwritten; presumably the odd bytes are attributes -- TODO confirm */
+}
+/* Write the NUL-terminated string s directly into text-mode video memory, starting at SCREEN_INFO.orig_x/orig_y. */
+void outsk(const char *s)
+{
+ int x,y,pos;
+ char c;
+ char *vidmem;
+ int vidport;
+
+ if (SCREEN_INFO.orig_video_mode == 7) { /* presumably the monochrome adapter -- TODO confirm */
+ vidmem = (char *) 0xb0000;
+ vidport = 0x3b4;
+ } else {
+ vidmem = (char *) 0xb8000;
+ vidport = 0x3d4;
+ }
+
+ x = SCREEN_INFO.orig_x;
+ y = SCREEN_INFO.orig_y;
+
+ while ( ( c = *s++ ) != '\0' ) {
+ if ( c == '\n' ) { /* newline: column 0 of the next row, scrolling if that row is off-screen */
+ x = 0;
+ if ( ++y >= SCREEN_INFO.orig_video_lines ) {
+ scrollk(vidmem,SCREEN_INFO.orig_video_cols,
+ SCREEN_INFO.orig_video_lines);
+ y--;
+ }
+ } else {
+ vidmem [ ( x + SCREEN_INFO.orig_video_cols * y ) * 2 ] = c; /* two bytes per cell; only the first is written */
+ if ( ++x >= SCREEN_INFO.orig_video_cols ) { /* wrap at the right edge, scrolling if needed */
+ x = 0;
+ if ( ++y >= SCREEN_INFO.orig_video_lines ) {
+ scrollk(vidmem,SCREEN_INFO.orig_video_cols,
+ SCREEN_INFO.orig_video_lines);
+ y--;
+ }
+ }
+ }
+ }
+/* Store the position back so that the next call continues where this one stopped. */
+ SCREEN_INFO.orig_x = x;
+ SCREEN_INFO.orig_y = y;
+
+ pos = (x + SCREEN_INFO.orig_video_cols * y) * 2; /* Update cursor position */
+ outb_p(14, vidport); /* index 14 receives the high byte of the cell index pos/2 */
+ outb_p(0xff & (pos >> 9), vidport+1);
+ outb_p(15, vidport); /* index 15 receives the low byte of the cell index pos/2 */
+ outb_p(0xff & (pos >> 1), vidport+1);
+}
+/* Print n through outsk() as exactly eight upper-case hex digits (the low 32 bits of n), without a "0x" prefix. */
+void outhexk(unsigned long n)
+{ char str[10];
+ int p=7;
+ int f;
+
+ while(p>=0) /* fill str[7] down to str[0], least significant nibble first */
+ { f=n&0xf;
+ if(f<10)str[p]=f+'0';else str[p]=f+'A'-10;
+ n>>=4;
+ --p;
+ }
+ str[8]='\0';
+ outsk(str);
+}
+
+int x86_do_memtest=1;
+/* Pattern-test every byte in [start,end) at or above 0x400000 and flag each failing page PG_reserved. Returns the number of pages newly flagged (0 if none or if the range is empty). */
+int test_memory(unsigned long start, unsigned long end)
+{
+ unsigned char* address;
+ unsigned long i,j;
+ unsigned char pattern[7]={ 0xff, 0x55, 0xaa, 0xec, 0x12, 0xb3, 0x00 };
+ int p;
+ int poffset;
+ int ok=1;
+ int bad_cnt=0;
+ unsigned long pj;
+
+ if(start<0x400000)start=0x400000; /* never touch anything below 0x400000 */
+ if(end<=start)return 0;
+
+ outsk("Testing memory from 0x");
+ outhexk(start);
+ outsk(" to 0x");
+ outhexk(end);
+/* pj is half the range in bytes, so each fill loop and each read-back loop prints about two progress dots. */
+ pj=(end-start)/2;
+ j=0;
+/* Seven rounds: round k starts the 7-byte pattern at offset k, so every byte is tested with every pattern value. */
+ for(poffset=0;poffset<7;++poffset)
+ {
+ /* fill memory */
+ p=poffset;
+ for(i=start;i<end;++i)
+ { address=(unsigned char*)i;
+ *address=pattern[p++];
+ if(p==7)p=0;
+ if(++j==pj){outsk(".");j=0;}
+ }
+ /* read back */
+ p=poffset;
+ for(i=start;i<end;++i)
+ { address=(unsigned char*)i;
+ if(*address!=pattern[p++])
+ { if(ok)
+ { outsk("failed at 0x");
+ }
+ ok=0;
+ if(test_bit(PG_reserved, &mem_map[MAP_NR(i)].flags)==0) /* report and count each page only once */
+ { outsk(" ");outhexk(i);
+ set_bit(PG_reserved, &mem_map[MAP_NR(i)].flags); /* NOTE(review): the caller runs this just before free_area_init() -- confirm mem_map is valid here and that the flag survives */
+ ++bad_cnt;
+ }
+ }
+ if(p==7)p=0;
+ if(++j==pj){outsk(".");j=0;}
+ }
+ }
+ if(ok){outsk("ok.\n");return 0;}
+
+ outsk("\ntrying not to use the bad pages (keep your fingers crossed)\n");
+ return bad_cnt;
+}
+
extern unsigned long free_area_init(unsigned long, unsigned long);

/*
@@ -292,6 +433,7 @@
}
}
local_flush_tlb();
+ if(x86_do_memtest)test_memory(start_mem, end_mem);
return free_area_init(start_mem, end_mem);
}

--- linux/arch/i386/boot/compressed/misc.c.old Sun Jul 12 14:55:49 1998
+++ linux/arch/i386/boot/compressed/misc.c Sun Jul 12 20:54:33 1998
@@ -102,7 +102,6 @@
#define RAMDISK_SIZE (*(unsigned short *)0x901F8)
#define ORIG_ROOT_DEV (*(unsigned short *)0x901FC)
#define AUX_DEVICE_INFO (*(unsigned char *)0x901FF)
-
extern char input_data[];
extern int input_len;

@@ -347,6 +346,100 @@
else mv->hcount = 0;
}

+static void puthex(unsigned long n)
+{ char str[10];
+ int p=7;
+ int f;
+/* Emit n through puts() as exactly eight upper-case hex digits (the low 32 bits of n), without a "0x" prefix. */
+ while(p>=0) /* fill str[7] down to str[0], least significant nibble first */
+ { f=n&0xf;
+ if(f<10)str[p]=f+'0';else str[p]=f+'A'-10;
+ n>>=4;
+ --p;
+ }
+ str[8]='\0';
+ puts(str);
+}
+/* Pattern-test every byte in [start,end), skipping 0x90000-0xFFFFF; calls error() if any byte fails. Returns 0 if the test passes or the range is empty. */
+int test_memory_first(unsigned long start, unsigned long end)
+{
+ unsigned char* address;
+ unsigned long i,j;
+ unsigned char pattern[7]={ 0xff, 0x55, 0xaa, 0xec, 0x12, 0xb3, 0x00 };
+ int p;
+ int poffset;
+ int ok=1;
+ unsigned long pj;
+
+ if(start<0x100000&&start>=0x90000)start=0x100000; /* a start inside the skipped window moves up to 0x100000 */
+ if(end<=start)return 0;
+
+ puts("Testing memory from 0x");
+ puthex(start);
+ puts(" to 0x");
+ puthex(end);
+/* pj is half the number of bytes tested; the skipped window is subtracted when start lies below it. */
+ pj=(end-start-(start<0x90000?0x100000-0x90000:0))/2;
+ j=0;
+/* Seven rounds: round k starts the 7-byte pattern at offset k, so every byte is tested with every pattern value. */
+ for(poffset=0;poffset<7;++poffset)
+ {
+ /* fill memory */
+ p=poffset;
+ for(i=start;i<end;++i)
+ { if(i==0x90000)i=0x100000; /* jump over the skipped window */
+ address=(unsigned char*)i;
+ *address=pattern[p++];
+ if(p==7)p=0;
+ if(++j==pj){puts(".");j=0;}
+ }
+ /* read back */
+ p=poffset;
+ for(i=start;i<end;++i)
+ { if(i==0x90000)i=0x100000; /* jump over the skipped window */
+ address=(unsigned char*)i;
+ if(*address!=pattern[p++])
+ { if(ok)puts("failed at 0x");
+ ok=0;
+ puts(" ");puthex(i); /* every failing byte address is listed */
+ }
+ if(p==7)p=0;
+ if(++j==pj){puts(".");j=0;}
+ }
+ }
+ if(ok){puts("ok.\n");return 0;}
+ error("bad memory below 4MB detected\n");
+ return 1; /* NOTE(review): reached only if error() returns -- error() is not shown here */
+}
+/* Compare the first n bytes of t1 and t2: returns 0 if they are all equal, -1 at the first difference. */
+static int scmp(char*t1,char*t2,int n)
+{ while(n)
+ { if(*t1!=*t2)return -1; /* a NUL byte ends the comparison only if it differs from the other string */
+ --n;
+ ++t1;
+ ++t2;
+ }
+ return 0;
+}
+
+/* Peek at the boot command line: return 0 if it contains "mem=notest", else -1 (run the test). */
+static int check_cmdline(void)
+{ unsigned char*c;
+
+ /* The boot loader flags a command line with the magic 0xA33F at 0x90020
+ and stores its offset from 0x90000 at 0x90022. Without the magic there
+ is no command line, so keep the test enabled instead of scanning
+ whatever happens to live at a guessed address. */
+ if(*(unsigned short*)0x90020!=0xA33F)return -1;
+
+ c=(unsigned char*)0x90000+*(unsigned short*)0x90022;
+
+ while(*c)
+ { if(scmp((char*)c,"mem=notest",10)==0)return 0;
+ ++c;
+ }
+ return -1;
+}

int decompress_kernel(struct moveparams *mv)
{
@@ -360,6 +453,10 @@

lines = SCREEN_INFO.orig_video_lines;
cols = SCREEN_INFO.orig_video_cols;
+
+ /* dare to test the memory which has not been occupied yet */
+ if(check_cmdline())test_memory_first( free_mem_ptr,
+ EXT_MEM_K < 3*1024 ? (EXT_MEM_K + 1024)*1024 : 0x400000 );

if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
else setup_output_buffer_if_we_run_high(mv);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.altern.org/andrebalsa/doc/lkml-faq.html