[PATCH 19/21] EDAC, ghes: Identify dimm by node, card, module and handle

From: Robert Richter
Date: Wed May 29 2019 - 04:48:42 EST


According to SMBIOS Spec. 2.7 (N.2.5 Memory Error Section), a failing
DIMM (module or rank number) can be identified by its error location,
which consists of node, card and module. A module handle is used to
map it to the DIMMs listed in the DMI table. Collect all of this data
from the error record and select the DIMM accordingly. Inconsistent
error records are reported, which is the case if the same DIMM handle
reports errors with a different node, card or module.

This change makes it possible to enable per-layer reporting based on
node, card and module in the next patch.

Signed-off-by: Robert Richter <rrichter@xxxxxxxxxxx>
---
drivers/edac/ghes_edac.c | 74 +++++++++++++++++++++++++++++++++-------
1 file changed, 62 insertions(+), 12 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 4bac643d3404..07c847ed7315 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -83,8 +83,11 @@ struct memarr_dmi_entry {

struct ghes_dimm_info {
struct dimm_info dimm_info;
+ struct dimm_info *dimm;
int idx;
int numa_node;
+ int card;
+ int module;
phys_addr_t start;
phys_addr_t end;
u16 phys_handle;
@@ -119,6 +122,8 @@ static void ghes_dimm_info_init(void)
for_each_dimm(dimm) {
dimm->idx = idx;
dimm->numa_node = NUMA_NO_NODE;
+ dimm->card = -1;
+ dimm->module = -1;
idx++;
}
}
@@ -401,6 +406,13 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)

if (*dmi_dimm->label)
strcpy(mci_dimm->label, dmi_dimm->label);
+
+ /*
+ * From here on, no longer use &dimm->dimm_info.
+ * Instead, switch to the mci's dimm info, which might
+ * contain updated data, such as the label.
+ */
+ dimm->dimm = mci_dimm;
}

if (index != mci->tot_dimms)
@@ -408,24 +420,46 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
index, mci->tot_dimms);
}

-static struct mem_ctl_info *get_mc_by_node(int nid)
+/* Requires ghes_lock to be held. */
+static struct ghes_dimm_info *
+get_and_prepare_dimm_info(int nid, int card, int module, int handle)
{
- struct mem_ctl_info *mci = edac_mc_find(nid);
+ static struct ghes_dimm_info *dimm;
+ struct dimm_info *di;

- if (mci)
- return mci;
+ /*
+ * We require smbios_handle being set in the error report for
+ * per layer reporting (SMBIOS handle for the Type 17 Memory
+ * Device Structure that represents the Memory Module)
+ */
+ for_each_dimm(dimm) {
+ di = dimm->dimm;
+ if (di->smbios_handle == handle)
+ goto found;
+ }

- if (num_possible_nodes() > 1) {
- edac_mc_printk(fallback, KERN_WARNING,
- "Invalid or no node information, falling back to first node: %s",
- fallback->dev_name);
+ return NULL;
+found:
+ if (dimm->card < 0 && card >= 0)
+ dimm->card = card;
+ if (dimm->module < 0 && module >= 0)
+ dimm->module = module;
+
+ if ((num_possible_nodes() > 1 && di->mci->mc_idx != nid) ||
+ (card >= 0 && card != dimm->card) ||
+ (module >= 0 && module != dimm->module)) {
+ edac_mc_printk(di->mci, KERN_WARNING,
+ "Inconsistent error report (nid/card/module): %d/%d/%d (dimm%d: %d/%d/%d)",
+ nid, card, module, di->idx,
+ di->mci->mc_idx, dimm->card, dimm->module);
}

- return fallback;
+ return dimm;
}

void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
+ struct ghes_dimm_info *dimm;
struct dimm_info *dimm_info;
enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
@@ -434,6 +468,9 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
unsigned long flags;
char *p;
int nid = NUMA_NO_NODE;
+ int card = -1;
+ int module = -1;
+ int handle = -1;

/* We need at least one mc */
if (WARN_ON_ONCE(!fallback))
@@ -449,10 +486,23 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)

spin_lock_irqsave(&ghes_lock, flags);

- /* select the node's mc device */
if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
nid = mem_err->node;
- mci = get_mc_by_node(nid);
+ if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
+ card = mem_err->card;
+ if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
+ module = mem_err->module;
+ if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)
+ handle = mem_err->mem_dev_handle;
+
+ dimm = get_and_prepare_dimm_info(nid, card, module, handle);
+ if (dimm)
+ mci = dimm->dimm->mci;
+ else
+ mci = edac_mc_find(nid);
+ if (!mci)
+ mci = fallback;
+
pvt = mci->pvt_info;
e = &mci->error_desc;

@@ -670,7 +720,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';

- dimm_info = edac_get_dimm_by_index(mci, e->top_layer);
+ dimm_info = dimm ? dimm->dimm : NULL;

edac_raw_mc_handle_error(type, mci, dimm_info, e, -1, -1);

--
2.20.1