Re: [BUG] 2.6.25-rc2-git4 - Regression Kernel oops while running kernbench and tbench on powerpc

From: Paul Mackerras
Date: Wed Apr 23 2008 - 04:17:08 EST


Kamalesh Babulal writes:

> After applying the patch above and the patch posted on
> http://lkml.org/lkml/2008/4/8/42
> the bug had the following information,

Thanks. The patch below, against Linus' current git tree, fixes one
bug that might be the cause of the problem, and also attempts to
detect the erroneous situation earlier and fix it up, and also print
some debug information. Please try to reproduce the problem with this
patch applied, and if there are any console log messages starting with
SLB: or FWNMI:, please send me the console log.

Paul.

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index c0db5b7..f7f0962 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -439,6 +439,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
mr r1,r8 /* start using new stack pointer */
std r7,PACAKSAVE(r13)

+ /* check that SLB entry 2 contains the right thing */
+ clrrdi r6,r1,28
+ clrldi. r0,r6,2
+ beq 3f
+ li r0,2
+ slbmfee r7,r0
+ oris r6,r6,SLB_ESID_V@h
+ cmpd r6,r7
+ beq 3f
+ bl bad_slb_switch
+ ld r3,PACACURRENT(r13)
+ addi r3,r3,THREAD
+3:
ld r6,_CCR(r1)
mtcrf 0xFF,r6

@@ -540,6 +553,19 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
ld r4,_XER(r1)
mtspr SPRN_XER,r4

+ /* check that SLB entry 2 contains the right thing */
+ clrrdi r6,r1,28 /* stack ESID */
+ clrldi. r0,r6,2
+ beq 57f
+ li r0,2
+ slbmfee r7,r0
+ oris r6,r6,SLB_ESID_V@h
+ cmpd r6,r7
+ beq 57f
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl bad_slb_exc
+ ld r3,_MSR(r1)
+57:
REST_8GPRS(5, r1)

andi. r0,r3,MSR_RI
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index be35ffa..c938134 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -45,6 +45,7 @@
#include <asm/system.h>
#include <asm/mpic.h>
#include <asm/vdso_datapage.h>
+#include <asm/mmu.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
#endif
@@ -580,6 +581,10 @@ int __devinit start_secondary(void *unused)
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;

+ /* Bolt in the entry for the kernel stack now */
+ if (cpu_has_feature(CPU_FTR_SLB))
+ slb_flush_and_rebolt();
+
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
preempt_disable();
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 906daed..bb7765b 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -309,3 +309,49 @@ void slb_initialize(void)
* one. */
asm volatile("isync":::"memory");
}
+
+/*
+ * Dump every SLB entry (0-63) whose ESID word is non-zero to the
+ * console at KERN_EMERG, one "<entry>: <esid> <vsid>" line per entry.
+ */
+static void dump_slb(void)
+{
+ long entry;
+ unsigned long esid, vsid;
+
+ printk(KERN_EMERG "SLB contents now:\n");
+ for (entry = 0; entry < 64; ++entry) {
+ asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (entry));
+ if (esid == 0)
+ /* valid bit is clear along with everything else */
+ continue;
+ asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (entry));
+ /* entry is a long, so it must be printed with %ld, not %d */
+ printk(KERN_EMERG "%ld: %.16lx %.16lx\n", entry, esid, vsid);
+ }
+}
+
+/*
+ * Called from the exception-return path in entry_64.S when SLB entry 2
+ * does not hold the ESID of the kernel stack.  Logs the condition, dumps
+ * the SLB, calls slb_flush_and_rebolt(), then shows the registers in regs.
+ */
+void bad_slb_exc(struct pt_regs *regs)
+{
+ printk(KERN_EMERG "SLB: stack not bolted on exception return\n");
+ dump_slb();
+ slb_flush_and_rebolt();
+ show_regs(regs);
+}
+
+/*
+ * Called from the context-switch path in entry_64.S when SLB entry 2
+ * does not hold the ESID of the new kernel stack.  Logs the condition,
+ * dumps the SLB, then calls slb_flush_and_rebolt().
+ */
+void bad_slb_switch(void)
+{
+ printk(KERN_EMERG "SLB: stack not bolted on context switch\n");
+ dump_slb();
+ slb_flush_and_rebolt();
+}
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index a1ab25c..ed68083 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -325,6 +325,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)

if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
/* Platform corrected itself */
+ printk(KERN_ALERT "FWNMI: platform corrected error %.16lx\n",
+ *(unsigned long *)err);
nonfatal = 1;
} else if ((regs->msr & MSR_RI) &&
user_mode(regs) &&
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/