[tip:ras/core] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors

From: tip-bot for Aravind Gopalakrishnan
Date: Tue Mar 08 2016 - 08:14:35 EST


Commit-ID: be0aec23bf4624fd55650629fe8df20483487049
Gitweb: http://git.kernel.org/tip/be0aec23bf4624fd55650629fe8df20483487049
Author: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@xxxxxxx>
AuthorDate: Mon, 7 Mar 2016 14:02:18 +0100
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Tue, 8 Mar 2016 11:48:14 +0100

x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors

For Scalable MCA enabled processors, errors are listed per IP block. And
since it is not required for an IP to map to a particular bank, we need
to use HWID and McaType values from the MCx_IPID register to figure out
which IP a given bank represents.

We also have a new bit (TCC) in the MCx_STATUS register to indicate Task
context is corrupt.

Add logic here to decode errors from all known IP blocks for Fam17h
Model 00-0fh and to print TCC errors.

[ Minor fixups. ]
Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@xxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Cc: linux-edac <linux-edac@xxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@xxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
arch/x86/include/asm/mce.h | 59 ++++++
arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 +++
drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++++++++++-
3 files changed, 420 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 80ba0a8..9c467fe 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,18 @@
/* AMD-specific bits */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ * But should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF

/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -93,7 +105,9 @@

/* AMD Scalable MCA */
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))

/*
* This structure contains all data related to the MCE log. Also
@@ -291,4 +305,49 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);

+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+ SMCA_F17H_CORE = 0, /* Core errors */
+ SMCA_DF, /* Data Fabric */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+ const char *name;
+ unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 cache */
+ SMCA_DE, /* Decoder unit */
+ RES, /* Reserved */
+ SMCA_EX, /* Execution unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 cache */
+ N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+ SMCA_CS = 0, /* Coherent Slave */
+ SMCA_PIE, /* Power management, Interrupts, etc */
+ N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27b..ee487a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -71,6 +71,35 @@ static const char * const th_names[] = {
"execution_unit",
};

+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+ [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
+ [SMCA_DF] = { "data_fabric", 0x2E },
+ [SMCA_UMC] = { "umc", 0x96 },
+ [SMCA_PB] = { "param_block", 0x5 },
+ [SMCA_PSP] = { "psp", 0xFF },
+ [SMCA_SMU] = { "smu", 0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+ [SMCA_LS] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [RES] = "",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+ [SMCA_CS] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e3a945c..49768c0 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
"Status Register File",
};

+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "", /* reserved */
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison comsumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+ unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ switch (mca_type) {
+ case SMCA_LS:
+ error_desc_array = f17h_ls_mce_desc;
+ len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+ if (xec == 0x4) {
+ pr_cont("Unrecognized LS MCA error code.\n");
+ return;
+ }
+ break;
+
+ case SMCA_IF:
+ error_desc_array = f17h_if_mce_desc;
+ len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+ break;
+
+ case SMCA_L2_CACHE:
+ error_desc_array = f17h_l2_mce_desc;
+ len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+ break;
+
+ case SMCA_DE:
+ error_desc_array = f17h_de_mce_desc;
+ len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+ break;
+
+ case SMCA_EX:
+ error_desc_array = f17h_ex_mce_desc;
+ len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+ break;
+
+ case SMCA_FP:
+ error_desc_array = f17h_fp_mce_desc;
+ len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+ break;
+
+ case SMCA_L3_CACHE:
+ error_desc_array = f17h_l3_mce_desc;
+ len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA core error info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_core_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "Data Fabric Error: ");
+
+ switch (mca_type) {
+ case SMCA_CS:
+ error_desc_array = f17h_cs_mce_desc;
+ len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+ break;
+
+ case SMCA_PIE:
+ error_desc_array = f17h_pie_mce_desc;
+ len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA Data Fabric info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_df_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+ u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+ unsigned int hwid, mca_type, i;
+ u8 xec = XEC(m->status, xec_mask);
+ const char * const *error_desc_array;
+ const char *ip_name;
+ u32 low, high;
+ size_t len;
+
+ if (rdmsr_safe(addr, &low, &high)) {
+ pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+ pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+ /*
+ * Based on hwid and mca_type values, decode errors from respective IPs.
+ * Note: mca_type values make sense only in the context of an hwid.
+ */
+ for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+ if (amd_hwids[i].hwid == hwid)
+ break;
+
+ switch (i) {
+ case SMCA_F17H_CORE:
+ ip_name = (mca_type == SMCA_L3_CACHE) ?
+ "L3 Cache" : "F17h Core";
+ return decode_f17h_core_errors(ip_name, xec, mca_type);
+ break;
+
+ case SMCA_DF:
+ return decode_df_errors(xec, mca_type);
+ break;
+
+ case SMCA_UMC:
+ error_desc_array = f17h_umc_mce_desc;
+ len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+ break;
+
+ case SMCA_PB:
+ error_desc_array = f17h_pb_mce_desc;
+ len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+ break;
+
+ case SMCA_PSP:
+ error_desc_array = f17h_psp_mce_desc;
+ len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+ break;
+
+ case SMCA_SMU:
+ error_desc_array = f17h_smu_mce_desc;
+ len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+ break;
+
+ default:
+ pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+ return;
+ }
+
+ ip_name = amd_hwids[i].name;
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
static inline void amd_decode_err_code(u16 ec)
{
if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
struct mce *m = (struct mce *)data;
struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
int ecc;
+ u32 ebx = cpuid_ebx(0x80000007);

if (amd_filter_mce(m))
return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));

- if (c->x86 == 0x15 || c->x86 == 0x16)
+ if (c->x86 >= 0x15)
pr_cont("|%s|%s",
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));

+ if (!!(ebx & BIT(3))) {
+ u32 low, high;
+ u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+ if (!rdmsr_safe(addr, &low, &high) &&
+ (low & MCI_CONFIG_MCAX))
+ pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+ }
+
/* do the two bits[14:13] together */
ecc = (m->status >> 45) & 0x3;
if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);

+ if (!!(ebx & BIT(3))) {
+ decode_smca_errors(m);
+ goto err_code;
+ }
+
if (!fam_ops)
goto err_code;

@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
static int __init mce_amd_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 ebx;

if (c->x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
fam_ops->mc2_mce = f16h_mc2_mce;
break;

+ case 0x17:
+ ebx = cpuid_ebx(0x80000007);
+ xec_mask = 0x3f;
+ if (!(ebx & BIT(3))) {
+ printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+ goto err_out;
+ }
+ break;
+
default:
printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
- kfree(fam_ops);
- fam_ops = NULL;
+ goto err_out;
}

pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
mce_register_decode_chain(&amd_mce_dec_nb);

return 0;
+
+err_out:
+ kfree(fam_ops);
+ fam_ops = NULL;
+ return -EINVAL;
}
early_initcall(mce_amd_init);