[PATCH v3 2/3] x86/MCE/AMD: Don't report L1 BTB MCA errors on some Family 17h models

From: Ghannam, Yazen
Date: Fri Mar 22 2019 - 16:30:45 EST


From: Yazen Ghannam <yazen.ghannam@xxxxxxx>

AMD Family 17h Models 10h-2Fh may report a high number of L1 BTB MCA
errors under certain conditions. The errors are benign and can safely be
ignored. However, the high error rate may cause the MCA threshold
counter to overflow causing a high rate of thresholding interrupts. In
addition, users may see the errors reported through the AMD MCE decoder
module, even with the interrupt disabled, due to MCA polling.

This error is reported through the Instruction Fetch bank.

Clear the "Counter Present" bit in the Instruction Fetch bank's
MCA_MISC0 register. This will prevent enabling MCA thresholding on this
bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE
event pool.

Cc: <stable@xxxxxxxxxxxxxxx> # 4.14.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <stable@xxxxxxxxxxxxxxx> # 4.14.x: 30aa3d26edb0: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <stable@xxxxxxxxxxxxxxx> # 4.14.x
Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
Link:
https://lkml.kernel.org/r/20190321202505.5553-2-Yazen.Ghannam@xxxxxxx

v2->v3:
* Define a simple AMD-specific filter function rather than a
model-specific one.

v1->v2:
* Filter out the error earlier in MCE code rather than later in EDAC.

arch/x86/include/asm/mce.h | 2 ++
arch/x86/kernel/cpu/mce/amd.c | 54 ++++++++++++++++++++++++++--------
arch/x86/kernel/cpu/mce/core.c | 3 ++
3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ec5bf1cad217..50f76a7956cc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -344,6 +344,7 @@ extern struct smca_bank smca_banks[MAX_NR_BANKS];

extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
+extern bool filter_mce_amd(struct mce *m);

extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
@@ -353,6 +354,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
+static inline bool filter_mce_amd(struct mce *m) { return false; };

#endif

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e64de5149e50..819d5dd41925 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -563,22 +563,52 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
return offset;
}

+bool filter_mce_amd(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u8 xec = (m->status >> 16) & 0x3F;
+
+ /*
+ * Spurious errors of this type may be reported.
+ * See Family 17h Models 10h-2Fh Erratum #1114.
+ */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && xec == 10)
+ return true;
+
+ return false;
+}
+
/*
- * Turn off MC4_MISC thresholding banks on all family 0x15 models since
- * they're not supported there.
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not support on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c)
+void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
- int i;
+ int i, num_msrs;
u64 hwcr;
bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
+
+ if (smca_get_bank_type(bank) != SMCA_IF)
+ return;

- if (c->x86 != 0x15)
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
return;
+ }

rdmsrl(MSR_K7_HWCR, hwcr);

@@ -589,7 +619,7 @@ void disable_err_thresholding(struct cpuinfo_x86 *c)
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

/* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ for (i = 0; i < num_msrs; i++)
msr_clear_bit(msrs[i], 62);

/* restore old settings */
@@ -604,12 +634,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;

- disable_err_thresholding(c);
-
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);

+ disable_err_thresholding(c, bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
if (!address)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 12d61b8f8154..ebc619a20c87 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1773,6 +1773,9 @@ static void __mcheck_cpu_init_timer(void)

bool filter_mce(struct mce *m)
{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return filter_mce_amd(m);
+
return false;
}

--
2.17.1