Re: [boot crash] Re: [tip:x86/mce3] x86, mce: use 64bit machine check code on 32bit

From: Andi Kleen
Date: Mon Aug 17 2009 - 07:26:09 EST


Ingo Molnar <mingo@xxxxxxx> writes:

Weird, the original mail didn't make it through; I only saw the replies.

>> all quirks.
>
> This commit causes a new regression, it broke the bootup on one of
> my -tip testsystems, an older, Pentium-M based HP laptop (HP
> OmniBook 6000 EA).
>
> The symptom is that the bootup hard-hangs after MCE init:
>
> [ 0.022996] Mount-cache hash table entries: 512
> [ 0.024996] Initializing cgroup subsys debug
> [ 0.025996] Initializing cgroup subsys cpuacct
> [ 0.026995] Initializing cgroup subsys devices
> [ 0.027995] Initializing cgroup subsys freezer
> [ 0.028995] mce: CPU supports 5 MCE banks

Thanks for testing.

I assume the system boots with CONFIG_X86_NEW_MCE disabled and machine checks
enabled, correct? That is, you never booted with mce=off or a similar option
on older kernels.

First, please test with the patch I posted at

http://article.gmane.org/gmane.linux.kernel/875563

I don't see that one in tip.

If that doesn't help, please boot with the appended debug patch and post the
console log again; then we will hopefully see where it hangs.

-Andi

commit 09f099eafbff70ecf55f7f111d2fb497ddb9a915
Author: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Date: Mon Aug 17 13:15:50 2009 +0200

Debug patch: trace mce init

Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623..bfaed40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -45,6 +45,8 @@

#include "mce-internal.h"

+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
@@ -1196,6 +1198,8 @@ static int mce_cap_init(void)
if (cap & MCG_SER_P)
mce_ser = 1;

+ D;
+
return 0;
}

@@ -1209,20 +1213,30 @@ static void mce_init(void)
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
+ D;
machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

+ D;
+
set_in_cr4(X86_CR4_MCE);

+ D;
+
rdmsrl(MSR_IA32_MCG_CAP, cap);
+ D;
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ D;

for (i = 0; i < banks; i++) {
if (skip_bank_init(i))
continue;
+ printk("init bank %d\n", i);
wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
+
+ D;
}

/* Add per CPU specific workarounds here */
@@ -1319,9 +1333,12 @@ static void mce_init_timer(void)
*n = check_interval * HZ;
if (!*n)
return;
+
+ D;
setup_timer(t, mcheck_timer, smp_processor_id());
t->expires = round_jiffies(jiffies + *n);
add_timer_on(t, smp_processor_id());
+ D;
}

/*
@@ -1340,15 +1357,21 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)

if (mce_cap_init() < 0) {
mce_disabled = 1;
+ D;
return;
}
+ D;
mce_cpu_quirks(c);
+ D;

machine_check_vector = do_machine_check;

mce_init();
+ D;
mce_cpu_features(c);
+ D;
mce_init_timer();
+ D;
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0..0d6aeab 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -13,6 +13,8 @@
#include <asm/msr.h>
#include <asm/mce.h>

+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -207,6 +209,8 @@ static void intel_init_cmci(void)
if (!cmci_supported(&banks))
return;

+ D;
+
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks, 1);
/*
@@ -217,10 +221,15 @@ static void intel_init_cmci(void)
*/
apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
cmci_recheck();
+
+ D;
}

void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
+ D;
intel_init_thermal(c);
+ D;
intel_init_cmci();
+ D;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd1..b4c6ca0 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -31,6 +31,8 @@
#include <asm/mce.h>
#include <asm/msr.h>

+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)

@@ -236,10 +238,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
int tm2 = 0;
u32 l, h;

+ D;
+
/* Thermal monitoring depends on ACPI and clock modulation*/
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
return;

+ D;
+
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
@@ -253,6 +259,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}

+ D;
+
+
if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
tm2 = 1;

@@ -264,6 +273,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}

+ D;
+
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
@@ -286,4 +297,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)

/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
+
+ D;
}


--
ak@xxxxxxxxxxxxxxx -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/