On Tue, Feb 16, 2016 at 03:45:08PM -0600, Aravind Gopalakrishnan wrote:
/* AMD-specific bits */\n
#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+/*^^^^^^^^^
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank
+ * - BlkPtr field indicates prescence of extended MISC registers.
Btw, that's MCi_MISC[BlkPtr] ?
Also, please integrate a spell checker into your patch creation
workflow.
+ * But should not be used to determine MSR numbersAll sentences end with a "."
+ * - TCC bit is present in MCx_STATUS
+/*Please stop using gerund, i.e., -ing, in comments and commit messages.
+ * Enumerating new IP types and HWID values
"Enumerate new IP ..." is just fine.
+ * in ScalableMCA enabled AMD processorsAMD-specific so "amd_ip_types"
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum ip_types {
+ F17H_CORE = 0, /* Core errors */What's FUSE subsystem?
+ DF, /* Data Fabric */
+ UMC, /* Unified Memory Controller */
+ FUSE, /* FUSE subsystem */
In any case, this could use a different name in order not to confuse
with Linux's filesystem in userspace.
+amd_hwid and so on. All below should have the "amd_" prefix so that it
+struct hwid {
is obvious.
+ const char *ipname;So all those are defined here but we have a header for exactly that
+ unsigned int hwid_value;
+};
+
+extern struct hwid hwid_mappings[N_IP_TYPES];
+
+enum core_mcatypes {
+ LS = 0, /* Load Store */
+ IF, /* Instruction Fetch */
+ L2_CACHE, /* L2 cache */
+ DE, /* Decoder unit */
+ RES, /* Reserved */
+ EX, /* Execution unit */
+ FP, /* Floating Point */
+ L3_CACHE /* L3 cache */
+};
+
+enum df_mcatypes {
+ CS = 0, /* Coherent Slave */
+ PIE /* Power management, Interrupts, etc */
+};
+#endif
drivers/edac/mce_amd.h. And then you define and export hwid_mappings in
arch/x86/kernel/cpu/mcheck/mce_amd.c to not use it there.
Why isn't all this in drivers/edac/mce_amd.[ch] ?
And if it is there, then you obviously don't need the "amd_" prefix.
+Are those MSRs used in multiple files? If not, -> mce.h.
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5523465..93bccbc 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -266,7 +266,9 @@
/* 'SMCA': AMD64 Scalable MCA */
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+/* Defining HWID to IP type mappings for Scalable MCA */" Define ..."
+ case L3_CACHE:That's a lot of repeated code. You can assign the desc array to a temp
+ if (xec > (ARRAY_SIZE(f17h_l3_mce_desc) - 1))
+ goto wrong_f17hcore_error;
+
+ pr_cont("%s.\n", f17h_l3_mce_desc[xec]);
+ break;
+
+ default:
+ goto wrong_f17hcore_error;
variable depending on mca_type and do the if and pr_cont only once using
that temp variable.
+Ditto.
+ case PIE:
+ if (xec > (ARRAY_SIZE(f17h_pie_mce_desc) - 1))
+ goto wrong_df_error;
+
+ pr_cont("%s.\n", f17h_pie_mce_desc[xec]);
+ break;
+That statement is not very useful in its current form.
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+ u32 low, high;
+ u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+ unsigned int hwid, mca_type, i;
+ u8 xec = XEC(m->status, xec_mask);
+
+ if (rdmsr_safe(addr, &low, &high)) {
+ pr_emerg("Unable to decode errors from banks\n");
+ for (i = 0; i < ARRAY_SIZE(hwid_mappings); i++)So you use those hwid_mappings here. Why aren't they defined here too?
+ case SMU:Also a lot of repeated code which could be simplified.
+ pr_emerg(HW_ERR "SMU Error: ");
+ if (xec > (ARRAY_SIZE(f17h_smu_mce_desc) - 1)) {
+ pr_cont("Unrecognized error code from SMU MCA bank\n");
+ return;
+ }
+ pr_cont("%s.\n", f17h_smu_mce_desc[xec]);
+ break;
+ if (mce_flags.smca) {ERROR: "mce_flags" [drivers/edac/edac_mce_amd.ko] undefined!
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2
make: *** Waiting for unfinished jobs....
Just read CPUID again here instead of exporting mce_flags.
+ u32 smca_low, smca_high;s/smca_//
+ u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
Also, all those MSRs should be defined in drivers/edac/mce_amd.h if not
used outside.
+No need for that break here.
+ if (!rdmsr_safe(smca_addr, &smca_low, &smca_high) &&
+ (smca_low & MCI_CONFIG_MCAX))
+ pr_cont("|%s",
If for some reason the CPUID bit is not set, then we should not assume the processor supports the features right?
+ case 0x17:What is that supposed to do? I thought all new ones will have SMCA...
+ xec_mask = 0x3f;
+ if (!mce_flags.smca) {
+ printk(KERN_WARNING "Decoding supported only on Scalable MCA enabled processors\n");