[PATCH 02/16] x86, mce: call-in should be after updating global_nwo

From: Hidetoshi Seto
Date: Mon Jun 15 2009 - 04:20:01 EST


At the beginning of Monarch synchronization, processors wait until
all of them have entered the exception handler and then check the
global_nwo to determine if any of them saw a fatal event.

However, since the current code does the call-in (incrementing
mce_callin) before updating global_nwo, global_nwo might not yet
reflect the local no_way_out state of some processors at the time it
is checked. This might break the printing of corrected errors that
have not been handled yet on panic.

Reported-by: Jin Dongming <jin.dongming@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@xxxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/mcheck/mce.c | 65 ++++++++++++++++++--------------------
1 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6f9db11..84b2630 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -691,18 +691,18 @@ static atomic_t global_nwo;
* in the entry order.
* TBD double check parallel CPU hotunplug
*/
-static int mce_start(int no_way_out, int *order)
+static int mce_start(int *no_way_out)
{
- int nwo;
+ int order;
int cpus = num_online_cpus();
u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

- if (!timeout) {
- *order = -1;
- return no_way_out;
- }
+ if (!timeout)
+ return -1;

- atomic_add(no_way_out, &global_nwo);
+ atomic_add(*no_way_out, &global_nwo);
+
+ order = atomic_add_return(1, &mce_callin);

/*
* Wait for everyone.
@@ -710,40 +710,38 @@ static int mce_start(int no_way_out, int *order)
while (atomic_read(&mce_callin) != cpus) {
if (mce_timed_out(&timeout)) {
atomic_set(&global_nwo, 0);
- *order = -1;
- return no_way_out;
+ return -1;
}
ndelay(SPINUNIT);
}

- /*
- * Cache the global no_way_out state.
- */
- nwo = atomic_read(&global_nwo);
-
- /*
- * Monarch starts executing now, the others wait.
- */
- if (*order == 1) {
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
atomic_set(&mce_executing, 1);
- return nwo;
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
}

/*
- * Now start the scanning loop one by one
- * in the original callin order.
- * This way when there are any shared banks it will
- * be only seen by one CPU before cleared, avoiding duplicates.
+ * Cache the global no_way_out state.
*/
- while (atomic_read(&mce_executing) < *order) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- *order = -1;
- return no_way_out;
- }
- ndelay(SPINUNIT);
- }
- return nwo;
+ *no_way_out = atomic_read(&global_nwo);
+
+ return order;
}

/*
@@ -887,7 +885,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (!banks)
goto out;

- order = atomic_add_return(1, &mce_callin);
mce_setup(&m);

m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -909,7 +906,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* This way we don't report duplicated events on shared banks
* because the first one to see it will clear it.
*/
- no_way_out = mce_start(no_way_out, &order);
+ order = mce_start(&no_way_out);
for (i = 0; i < banks; i++) {
__clear_bit(i, toclear);
if (!bank[i])
--
1.6.3


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/