[Patch] NMI-watchdog 2.1.88-A

MOLNAR Ingo (mingo@chiara.csoma.elte.hu)
Sat, 21 Feb 1998 18:59:30 +0100 (CET)


[ FAQ: the NMI watchdog generates oopses if an SMP system locks up on any
CPU, even when irqs are disabled. The ultimate hacker tool, now that
spinlocks have invaded SMP Linux ;)]

Changes:

- ported to 2.1.88

- new feature: exact call trace printout, oops messages now only
include 'true' function entry points. (this is only activated
if the NMI watchdog is turned on). Call frames are added to IRQ,
NMI and trap/exception handlers too. Exact call traces are
easier to analyze and, most importantly, easier to write down from
the screen ;) (cough).

- bugfix: rare lockup on pentium SMP systems fixed (the
double-write APIC errata was not NMI-safe)

- bugfix: deadlock threshold is increased to 5 seconds, this way we
won't generate false alarms when a driver disables IRQs for a
longer time during initialization.

- NOTE: if a deadlock is found on a CPU, then an oops is printed
on every CPU, and all CPUs are halted. Make sure to disable
system loggers before debugging, or remove the for (;;) __cli();
line from traps.c.

the patch has been tested with 2.1.88, compiles/boots with both enabled
and disabled watchdog.

-- mingo

--- linux/drivers/char/Config.in.orig Wed Feb 18 10:17:03 1998
+++ linux/drivers/char/Config.in Fri Feb 27 05:39:29 1998
@@ -103,6 +103,12 @@
tristate ' Software Watchdog' CONFIG_SOFT_WATCHDOG
tristate ' Berkshire Products PC Watchdog' CONFIG_PCWATCHDOG
tristate ' Acquire SBC Watchdog Timer' CONFIG_ACQUIRE_WDT
+ if [ "$SMP" = "1" ]; then
+ bool ' SMP-IOAPIC NMI Software Watchdog' CONFIG_NMI_WATCHDOG
+ if [ "$CONFIG_NMI_WATCHDOG" = "y" ]; then
+ int ' watchdog source IRQ' CONFIG_NMI_WATCHDOG_IRQ 0
+ fi
+ fi
fi
bool 'Enhanced Real Time Clock Support' CONFIG_RTC
if [ "$CONFIG_RTC" = "y" ]; then
--- linux/include/asm-i386/ptrace.h.orig Sat Jan 3 09:43:51 1998
+++ linux/include/asm-i386/ptrace.h Fri Feb 27 06:06:38 1998
@@ -1,6 +1,8 @@
#ifndef _I386_PTRACE_H
#define _I386_PTRACE_H

+#include <linux/config.h>
+
#define EBX 0
#define ECX 1
#define EDX 2
@@ -13,11 +15,20 @@
#define FS 9
#define GS 10
#define ORIG_EAX 11
+
+#if CONFIG_NMI_WATCHDOG
+#define EIP 13
+#define CS 14
+#define EFL 15
+#define UESP 16
+#define SS 17
+#else
#define EIP 12
#define CS 13
#define EFL 14
#define UESP 15
#define SS 16
+#endif


/* this struct defines the way the registers are stored on the
@@ -34,6 +45,9 @@
int xds;
int xes;
long orig_eax;
+#if CONFIG_NMI_WATCHDOG
+ long frame;
+#endif
long eip;
int xcs;
long eflags;
--- linux/arch/i386/kernel/entry.S.orig Wed Feb 18 10:17:01 1998
+++ linux/arch/i386/kernel/entry.S Fri Feb 27 05:39:29 1998
@@ -42,6 +42,7 @@

#include <linux/sys.h>
#include <linux/linkage.h>
+#include <linux/config.h>
#include <asm/segment.h>
#define ASSEMBLY
#include <asm/smp.h>
@@ -56,11 +57,21 @@
DS = 0x1C
ES = 0x20
ORIG_EAX = 0x24
-EIP = 0x28
-CS = 0x2C
-EFLAGS = 0x30
-OLDESP = 0x34
-OLDSS = 0x38
+
+
+#if CONFIG_NMI_WATCHDOG
+ EIP = 0x2C
+ CS = 0x30
+ EFLAGS = 0x34
+ OLDESP = 0x38
+ OLDSS = 0x3C
+#else
+ EIP = 0x28
+ CS = 0x2C
+ EFLAGS = 0x30
+ OLDESP = 0x34
+ OLDSS = 0x38
+#endif

CF_MASK = 0x00000001
IF_MASK = 0x00000200
@@ -78,6 +89,37 @@

ENOSYS = 38

+#undef IRQ_ENTRY
+
+#if CONFIG_NMI_WATCHDOG
+#define IRQ_ENTRY(name) \
+ .globl SYMBOL_NAME(name); \
+ ALIGN; \
+ SYMBOL_NAME_LABEL(name) \
+ pushl %ebp; \
+ movl %esp, %ebp;
+#else
+#define IRQ_ENTRY(name) \
+ .globl SYMBOL_NAME(name); \
+ ALIGN; \
+ SYMBOL_NAME_LABEL(name)
+#endif
+
+
+#if CONFIG_NMI_WATCHDOG
+#define IRQ_ENTRY_ERRORCODE(name) \
+ .globl SYMBOL_NAME(name); \
+ ALIGN; \
+ SYMBOL_NAME_LABEL(name) \
+ pushl %ebp; \
+ movl 4(%esp), %ebp; \
+ xchgl %ebp, (%esp); \
+ xchgl %ebp, 4(%esp); \
+ movl %esp, %ebp; \
+ leal 4(%ebp), %ebp;
+#else
+#define IRQ_ENTRY_ERRORCODE(name) IRQ_ENTRY(name)
+#endif

#define SAVE_ALL \
cld; \
@@ -94,6 +136,21 @@
mov %dx,%ds; \
mov %dx,%es;

+#if CONFIG_NMI_WATCHDOG
+#define RESTORE_ALL \
+ popl %ebx; \
+ popl %ecx; \
+ popl %edx; \
+ popl %esi; \
+ popl %edi; \
+ popl %ebp; \
+ popl %eax; \
+ pop %ds; \
+ pop %es; \
+ addl $4,%esp; \
+ popl %ebp; \
+ iret
+#else
#define RESTORE_ALL \
popl %ebx; \
popl %ecx; \
@@ -106,13 +163,21 @@
pop %es; \
addl $4,%esp; \
iret
+#endif

#define GET_CURRENT(reg) \
movl %esp, reg; \
andl $-8192, reg;

ENTRY(lcall7)
+grr1:
+ cli
+ jmp grr1
pushfl # We get a different stack layout with call gates,
+#if CONFIG_NMI_WATCHDOG
+ pushl %ebp;
+ movl %esp, %ebp;
+#endif
pushl %eax # which has to be cleaned up later..
SAVE_ALL
movl EIP(%esp),%eax # due to call gates, this is eflags, not eip..
@@ -147,7 +212,7 @@
* less clear than it otherwise should be.
*/

-ENTRY(system_call)
+IRQ_ENTRY(system_call)
pushl %eax # save orig_eax
SAVE_ALL
GET_CURRENT(%ebx)
@@ -226,7 +294,7 @@
jmp SYMBOL_NAME(schedule) # test


-ENTRY(divide_error)
+IRQ_ENTRY(divide_error)
pushl $0 # no error code
pushl $ SYMBOL_NAME(do_divide_error)
ALIGN
@@ -257,12 +325,12 @@
addl $8,%esp
jmp ret_from_exception

-ENTRY(coprocessor_error)
+IRQ_ENTRY(coprocessor_error)
pushl $0
pushl $ SYMBOL_NAME(do_coprocessor_error)
jmp error_code

-ENTRY(device_not_available)
+IRQ_ENTRY(device_not_available)
pushl $-1 # mark this as an int
SAVE_ALL
GET_CURRENT(%ebx)
@@ -275,75 +343,75 @@
addl $4,%esp
ret

-ENTRY(debug)
+IRQ_ENTRY(debug)
pushl $0
pushl $ SYMBOL_NAME(do_debug)
jmp error_code

-ENTRY(nmi)
+IRQ_ENTRY(nmi)
pushl $0
pushl $ SYMBOL_NAME(do_nmi)
jmp error_code

-ENTRY(int3)
+IRQ_ENTRY(int3)
pushl $0
pushl $ SYMBOL_NAME(do_int3)
jmp error_code

-ENTRY(overflow)
+IRQ_ENTRY(overflow)
pushl $0
pushl $ SYMBOL_NAME(do_overflow)
jmp error_code

-ENTRY(bounds)
+IRQ_ENTRY(bounds)
pushl $0
pushl $ SYMBOL_NAME(do_bounds)
jmp error_code

-ENTRY(invalid_op)
+IRQ_ENTRY(invalid_op)
pushl $0
pushl $ SYMBOL_NAME(do_invalid_op)
jmp error_code

-ENTRY(coprocessor_segment_overrun)
+IRQ_ENTRY(coprocessor_segment_overrun)
pushl $0
pushl $ SYMBOL_NAME(do_coprocessor_segment_overrun)
jmp error_code

-ENTRY(reserved)
+IRQ_ENTRY(reserved)
pushl $0
pushl $ SYMBOL_NAME(do_reserved)
jmp error_code

-ENTRY(double_fault)
+IRQ_ENTRY_ERRORCODE(double_fault)
pushl $ SYMBOL_NAME(do_double_fault)
jmp error_code

-ENTRY(invalid_TSS)
+IRQ_ENTRY_ERRORCODE(invalid_TSS)
pushl $ SYMBOL_NAME(do_invalid_TSS)
jmp error_code

-ENTRY(segment_not_present)
+IRQ_ENTRY_ERRORCODE(segment_not_present)
pushl $ SYMBOL_NAME(do_segment_not_present)
jmp error_code

-ENTRY(stack_segment)
+IRQ_ENTRY_ERRORCODE(stack_segment)
pushl $ SYMBOL_NAME(do_stack_segment)
jmp error_code

-ENTRY(general_protection)
+IRQ_ENTRY_ERRORCODE(general_protection)
pushl $ SYMBOL_NAME(do_general_protection)
jmp error_code

-ENTRY(alignment_check)
+IRQ_ENTRY_ERRORCODE(alignment_check)
pushl $ SYMBOL_NAME(do_alignment_check)
jmp error_code

-ENTRY(page_fault)
+IRQ_ENTRY_ERRORCODE(page_fault)
pushl $ SYMBOL_NAME(do_page_fault)
jmp error_code

-ENTRY(spurious_interrupt_bug)
+IRQ_ENTRY(spurious_interrupt_bug)
pushl $0
pushl $ SYMBOL_NAME(do_spurious_interrupt_bug)
jmp error_code
--- linux/arch/i386/kernel/traps.c.orig Fri Feb 27 05:15:37 1998
+++ linux/arch/i386/kernel/traps.c Fri Feb 27 05:39:29 1998
@@ -2,6 +2,8 @@
* linux/arch/i386/traps.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1998, Ingo Molnar, added NMI-Watchdog driver
*/

/*
@@ -21,6 +23,7 @@
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/delay.h>
+#include <linux/kernel_stat.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -115,6 +118,7 @@
#define VMALLOC_OFFSET (8*1024*1024)
#define MODULE_RANGE (8*1024*1024)

+#ifndef CONFIG_NMI_WATCHDOG
static void show_registers(struct pt_regs *regs)
{
int i;
@@ -179,7 +183,65 @@
printk("\n");
}

-spinlock_t die_lock;
+#else
+
+/*
+ * This version of show_registers() prints the exact Call Trace by walking
+ * the chain of saved %ebp frame pointers, so no guessing is done. This
+ * relies on all files being compiled with -fno-omit-frame-pointer.
+ */
+static void show_registers (struct pt_regs * regs)
+{
+ int i=1;
+ unsigned long *this_stack, *prev_stack, prev_addr, *prev_bp, framesize;
+
+ printk("Call Trace: ");
+
+ /*
+ * the stack layout: /----- *this_stack
+ * V
+ * [this_frame][prev_bp][prev_addr][prev_frame][...]
+ */
+
+ /*
+ * we are relying on linear mapping on i386
+ */
+
+ __asm__ volatile ("movl %%ebp, %0":"=g"(this_stack));
+ framesize=0;
+
+ while (((long) this_stack & 8191) != 0) {
+ prev_addr = *(this_stack+1);
+
+ if (i && ((i % 8) == 0))
+ printk("\n ");
+ printk("[<%08lx>(%lu)] ", prev_addr, framesize);
+ i++;
+
+ prev_bp = (unsigned long *)(*this_stack);
+ prev_stack = this_stack;
+ this_stack = prev_bp;
+
+ if (
+ ((unsigned long)this_stack < PAGE_OFFSET)
+ || (i>100) ) {

+ if ((unsigned long)this_stack < PAGE_OFFSET)
+ break;
+
+ printk("WARNING: something fishy with the stack frame?\n");
+ printk("this_stack: [<%08lx>]\n",
+ (unsigned long)this_stack);
+ printk("i: %d.\n", i);
+ break;
+ }
+ framesize = (unsigned long)this_stack-(unsigned long)prev_stack;
+ }
+ printk("\n<E>\n");
+}
+#endif
+
+static spinlock_t die_lock = SPIN_LOCK_UNLOCKED;

void die_if_kernel(const char * str, struct pt_regs * regs, long err)
{
@@ -189,7 +251,11 @@
spin_lock_irq(&die_lock);
printk("%s: %04lx\n", str, err & 0xffff);
show_registers(regs);
- spin_unlock_irq(&die_lock);
+/* HACK ON */
+ spin_unlock(&die_lock);
+ unlock_kernel();
+ for (;;) __cli();
+/* HACK OFF */
do_exit(SIGSEGV);
}

@@ -237,12 +303,15 @@
unlock_kernel();
}

+#ifndef CONFIG_NMI_WATCHDOG
static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
printk("You probably have a hardware problem with your RAM chips\n");
-}
+}
+#endif

+#ifndef CONFIG_NMI_WATCHDOG
static void io_check_error(unsigned char reason, struct pt_regs * regs)
{
unsigned long i;
@@ -258,14 +327,18 @@
reason &= ~8;
outb(reason, 0x61);
}
+#endif

+#ifndef CONFIG_NMI_WATCHDOG
static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
printk("Dazed and confused, but trying to continue\n");
printk("Do you have a strange power saving mode enabled?\n");
}
+#endif

+#ifndef CONFIG_NMI_WATCHDOG
asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
{
unsigned char reason = inb(0x61);
@@ -279,6 +352,76 @@
if (!(reason & 0xc0))
unknown_nmi_error(reason, regs);
}
+#else
+
+/*
+ * FIXME: we assume here that the NMI came from the IO-APIC. It's a quite safe
+ * assumption in most cases, but if anyone knows a way to distinguish between
+ * NMI reasons, please speak up ... [i doubt that the IO-APIC does IO port 0x61
+ * correctly]
+ */
+
+extern atomic_t apic_timer_irqs [NR_CPUS];
+extern spinlock_t console_lock;
+static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+ /*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check its local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are broadcast to every CPU, here
+ * we only have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up console_lock first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+ static atomic_t lockup_detected = ATOMIC_INIT(0);
+ static atomic_t last_irq_sums [NR_CPUS] = { ATOMIC_INIT(0), };
+ static atomic_t alert_counter [NR_CPUS] = { ATOMIC_INIT(0), };
+
+ /*
+ * Since current-> is always on the stack, and we always switch
+ * the stack NMI-atomically, it's safe to use smp_processor_id().
+ */
+ int sum, cpu = smp_processor_id();
+
+ sum = atomic_read(apic_timer_irqs+cpu);
+
+ if (atomic_read(&lockup_detected)) {
+ spin_lock(&nmi_print_lock);
+ printk("followup-LOCKUP on CPU%d, forcing oops\n", cpu);
+ show_registers(0);
+ spin_unlock(&nmi_print_lock);
+ for (;;) __cli();
+ }
+
+ if (atomic_read(last_irq_sums+cpu) == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ atomic_inc(alert_counter+cpu);
+ if (atomic_read(alert_counter+cpu) == 5*HZ) {
+ spin_lock(&nmi_print_lock);
+ atomic_set(&lockup_detected,1);
+ printk("NMI Watchdog detected LOCKUP on CPU%d, forcing oops\n", cpu);
+ show_registers(0);
+ spin_unlock(&nmi_print_lock);
+ for (;;) __cli();
+ }
+ } else {
+ atomic_set(last_irq_sums+cpu,sum);
+ atomic_set(alert_counter+cpu,0);
+ }
+}
+#endif

asmlinkage void do_debug(struct pt_regs * regs, long error_code)
{
--- linux/arch/i386/kernel/smp.c.orig Fri Feb 27 05:15:37 1998
+++ linux/arch/i386/kernel/smp.c Fri Feb 27 05:39:30 1998
@@ -1420,8 +1420,18 @@
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
+#ifdef CONFIG_NMI_WATCHDOG
+atomic_t apic_timer_irqs [NR_CPUS] = { ATOMIC_INIT(0), };
+#endif
void smp_apic_timer_interrupt(struct pt_regs * regs)
{
+#ifdef CONFIG_NMI_WATCHDOG
+ /*
+ * the only thing that can lock an NMI is an unACK-ed APIC ...
+ */
+ atomic_inc(apic_timer_irqs+smp_processor_id());
+#endif
+
/*
* NOTE! We'd better ACK the irq immediately,
* because timer handling can be slow, and we
@@ -1694,7 +1704,9 @@
/*
* We ACK the APIC, just in case there is something pending.
*/
+
ack_APIC_irq ();
+

restore_flags(flags);
}
--- linux/arch/i386/kernel/irq.h.orig Wed Feb 18 10:17:02 1998
+++ linux/arch/i386/kernel/irq.h Fri Feb 27 06:06:55 1998
@@ -1,6 +1,8 @@
#ifndef __irq_h
#define __irq_h

+#include <linux/config.h>
+
/*
* Various low-level irq details needed by irq.c and smp.c
*
@@ -87,6 +89,14 @@
"mov %dx,%ds\n\t" \
"mov %dx,%es\n\t"

+#if CONFIG_NMI_WATCHDOG
+#define INSERT_FRAME \
+ "pushl %ebp\n\t" \
+ "movl %esp, %ebp\n\t"
+#else
+#define INSERT_FRAME
+#endif
+
#define IRQ_NAME2(nr) nr##_interrupt(void)
#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)

@@ -105,6 +115,7 @@
__asm__( \
"\n"__ALIGN_STR"\n" \
SYMBOL_NAME_STR(x) ":\n\t" \
+ INSERT_FRAME \
"pushl $-1\n\t" \
SAVE_ALL \
"call "SYMBOL_NAME_STR(smp_##x)"\n\t" \
@@ -115,6 +126,7 @@
__asm__( \
"\n"__ALIGN_STR"\n" \
SYMBOL_NAME_STR(x) ":\n\t" \
+ INSERT_FRAME \
"pushl $-1\n\t" \
SAVE_ALL \
"movl %esp,%eax\n\t" \
@@ -138,6 +150,7 @@
__asm__( \
"\n"__ALIGN_STR"\n" \
SYMBOL_NAME_STR(IRQ) #nr "_interrupt:\n\t" \
+ INSERT_FRAME \
"pushl $"#nr"-256\n\t" \
"jmp common_interrupt");

--- linux/arch/i386/kernel/io_apic.c.orig Wed Feb 18 10:17:02 1998
+++ linux/arch/i386/kernel/io_apic.c Fri Feb 27 05:39:30 1998
@@ -37,6 +37,17 @@
volatile unsigned int * io_apic_reg = NULL;

/*
+ * We want to avoid #ifdef CONFIG_'s in the main code whenever possible:
+ */
+#ifdef CONFIG_NMI_WATCHDOG
+ int nmi_pin = -1;
+ const int nmi_irq = CONFIG_NMI_WATCHDOG_IRQ;
+#else
+ int nmi_pin = 0;
+ const int nmi_irq = -1;
+#endif
+
+/*
* The structure of the IO-APIC:
*/
struct IO_APIC_reg_00 {
@@ -62,6 +73,7 @@
__u32 vector : 8,
delivery_mode : 3, /* 000: FIXED
* 001: lowest prio
+ * 100: NMI
* 111: ExtInt
*/
dest_mode : 1, /* 0: physical, 1: logical */
@@ -273,6 +285,19 @@

entry.vector = IO_APIC_GATE_OFFSET + (irq<<3);

+ if (mp_irqs[i].mpc_irqtype)
+ continue;
+
+ if (irq == nmi_irq) {
+ entry.delivery_mode = 4; /* broadcast NMI */
+ make_8259A_irq(irq);
+ /*
+ * Remember which register has the NMI IRQ entry,
+ * so we can turn it off in case there is some
+ * incompatibility
+ */
+ nmi_pin = i;
+ }
/*
* Determine IRQ line polarity (high active or low active):
*/
@@ -629,16 +654,23 @@
init_IO_APIC_traps();
setup_IO_APIC_irqs ();

+ if (nmi_pin == -1)
+ printk(".. NMI watchdog has invalid source IRQ.\n");
+ else if (nmi_irq != -1)
+ printk("NMI Watchdog activated on source IRQ %d\n", nmi_irq);
+
if (!timer_irq_works ()) {
make_8259A_irq(0);
if (!timer_irq_works ())
panic("IO-APIC + timer doesnt work!");
printk("..MP-BIOS bug: i8254 timer not connected to IO-APIC\n");
printk("..falling back to 8259A-based timer interrupt\n");
+ if ((nmi_pin != -1) && (nmi_irq == 0))
+ printk(".. NMI Watchdog disabled as source IRQ is timer!\n");
}
-
- printk("nr of MP irq sources: %d.\n", mp_irq_entries);
- printk("nr of IOAPIC registers: %d.\n", nr_ioapic_registers);
+
+ printk("nr of MP irq sources: %d.\n", mp_irq_entries);
+ printk("nr of IOAPIC registers: %d.\n", nr_ioapic_registers);
print_IO_APIC();
}

--- linux/Makefile.orig Fri Feb 27 05:15:36 1998
+++ linux/Makefile Fri Feb 27 05:39:29 1998
@@ -24,7 +24,12 @@
FINDHPATH = $(HPATH)/asm $(HPATH)/linux $(HPATH)/scsi $(HPATH)/net

HOSTCC =gcc
-HOSTCFLAGS =-O2 -fomit-frame-pointer
+
+ifeq ($(CONFIG_NMI_WATCHDOG),y)
+ HOSTCFLAGS =-O2 -fno-omit-frame-pointer
+else
+ HOSTCFLAGS =-O2 -fomit-frame-pointer
+endif

CROSS_COMPILE =

@@ -85,7 +90,11 @@
# standard CFLAGS
#

-CFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer
+ifeq ($(CONFIG_NMI_WATCHDOG),y)
+ CFLAGS = -Wall -Wstrict-prototypes -O2 -fno-omit-frame-pointer
+else
+ CFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer
+endif

ifdef CONFIG_CPP
CFLAGS := $(CFLAGS) -x c++

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu