[BUG] 2.5.27-bk deadlocks every other boot

From: William Lee Irwin III (wli@holomorphy.com)
Date: Mon Jul 22 2002 - 23:56:08 EST


It looks like a fresh bug was recently introduced. Every other boot,
my NUMA-Q testboxen deadlock just after a serial console printk() about
a speed change, or something of that sort. The deadlock occurs
nondeterministically after userspace has run for a short while, during
the init scripts.

My current set of NUMA-Q workarounds follows as a combined diff (yes,
Martin, I know they're not what you'd like to merge, but they should be
equivalent in functionality as far as fixing bootup problems goes).

rmk, I've cc:'d you in the hopes that some unmerged serial fix will
magically materialize.

Cheers,
Bill

===== arch/i386/config.in 1.42 vs edited =====
--- 1.42/arch/i386/config.in Fri Jul 19 16:00:55 2002
+++ edited/arch/i386/config.in Mon Jul 22 22:44:41 2002
@@ -165,7 +165,9 @@
       define_bool CONFIG_X86_IO_APIC y
    fi
 else
- bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ if [ "$CONFIG_PREEMPT" != "y" ]; then
+ bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ fi
 fi
 
 bool 'Machine Check Exception' CONFIG_X86_MCE
===== arch/i386/vmlinux.lds 1.9 vs edited =====
--- 1.9/arch/i386/vmlinux.lds Sun May 19 12:03:14 2002
+++ edited/arch/i386/vmlinux.lds Mon Jul 22 22:49:39 2002
@@ -84,11 +84,13 @@
   _end = . ;
 
   /* Sections to be discarded */
+ /*
   /DISCARD/ : {
         *(.text.exit)
         *(.data.exit)
         *(.exitcall.exit)
         }
+ */
 
   /* Stabs debugging sections. */
   .stab 0 : { *(.stab) }
===== arch/i386/kernel/io_apic.c 1.23 vs edited =====
--- 1.23/arch/i386/kernel/io_apic.c Sun Jul 21 09:09:17 2002
+++ edited/arch/i386/kernel/io_apic.c Mon Jul 22 22:44:41 2002
@@ -219,7 +219,7 @@
 #define IRQ_ALLOWED(cpu,allowed_mask) \
                 ((1 << cpu) & (allowed_mask))
 
-#if CONFIG_SMP
+#if CONFIG_SMP && !CONFIG_MULTIQUAD
 static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
 {
         int search_idle = 1;
===== arch/i386/kernel/smp.c 1.18 vs edited =====
--- 1.18/arch/i386/kernel/smp.c Mon Jul 15 10:03:02 2002
+++ edited/arch/i386/kernel/smp.c Mon Jul 22 22:44:41 2002
@@ -569,7 +569,7 @@
         struct call_data_struct data;
         int cpus = num_online_cpus()-1;
 
- if (!cpus)
+ if (cpus <= 0)
                 return 0;
 
         data.func = func;
===== arch/i386/kernel/cpu/common.c 1.2 vs edited =====
--- 1.2/arch/i386/kernel/cpu/common.c Mon Jul 15 10:03:21 2002
+++ edited/arch/i386/kernel/cpu/common.c Mon Jul 22 22:44:41 2002
@@ -444,6 +444,8 @@
         __asm__ __volatile__("lgdt %0": "=m" (gdt_descr));
         __asm__ __volatile__("lidt %0": "=m" (idt_descr));
 
+ printk(KERN_INFO "Loading GDT/IDT for CPU#%d\n", nr);
+
         /*
          * Delete NT
          */
@@ -464,6 +466,8 @@
         load_TR(nr);
         load_LDT(&init_mm.context);
 
+ printk(KERN_INFO "Loaded per-cpu LDT/TSS for CPU#%d\n", nr);
+
         /* Clear %fs and %gs. */
         asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
 
@@ -481,4 +485,6 @@
         clear_thread_flag(TIF_USEDFPU);
         current->used_math = 0;
         stts();
+
+ printk(KERN_INFO "Cleaned up FPU and debug regs for CPU#%d\n", nr);
 }
===== drivers/net/tulip/de2104x.c 1.5 vs edited =====
--- 1.5/drivers/net/tulip/de2104x.c Thu Mar 7 01:42:39 2002
+++ edited/drivers/net/tulip/de2104x.c Mon Jul 22 22:47:20 2002
@@ -1455,7 +1455,7 @@
         /* Update the error counts. */
         __de_get_stats(de);
 
- synchronize_irq();
+ synchronize_irq(dev->irq);
         de_clean_rings(de);
 
         de_init_hw(de);
===== drivers/scsi/qlogicisp.c 1.11 vs edited =====
--- 1.11/drivers/scsi/qlogicisp.c Sun Jul 21 01:55:49 2002
+++ edited/drivers/scsi/qlogicisp.c Mon Jul 22 22:49:12 2002
@@ -84,14 +84,13 @@
 { \
         unsigned long flags; \
                                                                 \
- save_flags(flags); \
- cli(); \
+ local_irq_save(flags); \
         trace.buf[trace.next].name = (w); \
         trace.buf[trace.next].time = jiffies; \
         trace.buf[trace.next].index = (i); \
         trace.buf[trace.next].addr = (long) (a); \
         trace.next = (trace.next + 1) & (TRACE_BUF_LEN - 1); \
- restore_flags(flags); \
+ local_irq_restore(flags); \
 }
 
 #else
@@ -1704,8 +1703,7 @@
 
         ENTER("isp1020_load_parameters");
 
- save_flags(flags);
- cli();
+ local_irq_save(flags);
 
         hwrev = isp_inw(host, ISP_CFG0) & ISP_CFG0_HWMSK;
         isp_cfg1 = ISP_CFG1_F64 | ISP_CFG1_BENAB;
@@ -1724,7 +1722,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set initiator id failure\n");
                 return 1;
         }
@@ -1736,7 +1734,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set retry count failure\n");
                 return 1;
         }
@@ -1747,7 +1745,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : async data setup time failure\n");
                 return 1;
         }
@@ -1759,7 +1757,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set active negation state failure\n");
                 return 1;
         }
@@ -1771,7 +1769,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set pci control parameter failure\n");
                 return 1;
         }
@@ -1782,7 +1780,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set tag age limit failure\n");
                 return 1;
         }
@@ -1793,7 +1791,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set selection timeout failure\n");
                 return 1;
         }
@@ -1812,7 +1810,7 @@
                 isp1020_mbox_command(host, param);
 
                 if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                         printk("qlogicisp : set target parameter failure\n");
                         return 1;
                 }
@@ -1827,7 +1825,7 @@
                         isp1020_mbox_command(host, param);
 
                         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                                 printk("qlogicisp : set device queue "
                                        "parameter failure\n");
                                 return 1;
@@ -1854,7 +1852,7 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set response queue failure\n");
                 return 1;
         }
@@ -1879,12 +1877,12 @@
         isp1020_mbox_command(host, param);
 
         if (param[0] != MBOX_COMMAND_COMPLETE) {
- restore_flags(flags);
+ local_irq_restore(flags);
                 printk("qlogicisp : set request queue failure\n");
                 return 1;
         }
 
- restore_flags(flags);
+ local_irq_restore(flags);
 
         LEAVE("isp1020_load_parameters");
 
===== fs/mpage.c 1.11 vs edited =====
--- 1.11/fs/mpage.c Tue Jul 16 14:47:15 2002
+++ edited/fs/mpage.c Mon Jul 22 22:44:41 2002
@@ -24,7 +24,7 @@
  * The largest-sized BIO which this code will assemble, in bytes. Set this
  * to PAGE_CACHE_SIZE if your drivers are broken.
  */
-#define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
+#define MPAGE_BIO_MAX_SIZE PAGE_CACHE_SIZE
 
 /*
  * I/O completion handler for multipage BIOs.
===== include/asm-i386/apicdef.h 1.3 vs edited =====
--- 1.3/include/asm-i386/apicdef.h Wed Mar 27 16:05:30 2002
+++ edited/include/asm-i386/apicdef.h Mon Jul 22 22:44:41 2002
@@ -108,7 +108,11 @@
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 
+#ifndef CONFIG_MULTIQUAD
 #define MAX_IO_APICS 8
+#else
+#define MAX_IO_APICS 1024
+#endif /* CONFIG_MULTIQUAD */
 
 /*
  * the local APIC register structure, memory mapped. Not terribly well
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Tue Jul 23 2002 - 22:00:41 EST