[Patch V2 2/3] x86, mce: Add infrastructure required to support LMCE

From: Ashok Raj
Date: Tue Jun 02 2015 - 13:50:39 EST


Initialization and handling for LMCE
- boot time option to disable LMCE for that boot instance
- Check for capability via IA32_MCG_CAP

Incorporated feedback from Boris from V1

See http://www.intel.com/sdm Volume 3 System Programming Guide, Chapter 15
for more information on MSR's and documentation on Local MCE.

Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
---
Documentation/x86/x86_64/boot-options.txt | 3 ++
arch/x86/include/asm/mce.h | 5 +++
arch/x86/kernel/cpu/mcheck/mce.c | 3 ++
arch/x86/kernel/cpu/mcheck/mce_intel.c | 59 +++++++++++++++++++++++++++++++
4 files changed, 70 insertions(+)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 5223479..79edee0 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -31,6 +31,9 @@ Machine check
(e.g. BIOS or hardware monitoring applications), conflicting
with OS's error handling, and you cannot deactivate the agent,
then this option will be a help.
+ mce=no_lmce
+ Do not opt-in to Local MCE delivery. Use legacy method
+ to broadcast MCE's.
mce=bootlog
Enable logging of machine checks left over from booting.
Disabled by default on AMD because some BIOS leave bogus ones.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 677a408..8ba4d7a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -109,6 +109,7 @@ struct mce_log {
struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
+ bool lmce_disabled;
bool ignore_ce;
bool disabled;
bool ser;
@@ -173,12 +174,16 @@ void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
+void lmce_clear(void);
+void lmce_enable(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
+static inline void lmce_clear(void) {}
+static inline void lmce_enable(void) {}
#endif

#ifdef CONFIG_X86_MCE_AMD
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e535533..d10aada 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1976,6 +1976,7 @@ void mce_disable_bank(int bank)
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
@@ -1999,6 +2000,8 @@ static int __init mcheck_enable(char *str)
cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b4a41cf..7d500b6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -91,6 +91,37 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}

+static bool lmce_supported(void)
+{
+ u64 cap, feature_control;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ /*
+ * LMCE depends on recovery support in the processor.
+ * Hence both MCG_SER_P and MCG_LMCE_P should be present in
+ * MCG_CAP
+ */
+ if (!((cap & (MCG_SER_P | MCG_LMCE_P)) == (MCG_SER_P | MCG_LMCE_P)))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting
+ * bit20 in IA32_FEATURE_CONTROL. without which touching
+ * MCG_EXT_CTL will generate #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, feature_control);
+ if (((feature_control & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_LMCE)) == (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_LMCE)))
+ return true;
+ else
+ return false;
+
+}
+
bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -405,6 +436,34 @@ static void intel_init_cmci(void)
cmci_recheck();
}

+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ val |= MCG_EXT_CTL_LMCE_EN;
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+/*
+ * Disable LMCE on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void lmce_clear(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/