[PATCH v2 10/24] EDAC, mc: Remove per layer counters

From: Robert Richter
Date: Mon Jun 24 2019 - 11:10:17 EST


Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
turns out that only the counters of the lowest layer (the leaves of
the memory hierarchy) are consumed (in sysfs), but not those of the
intermediate layers, e.g.:

count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];

So let's get rid of the unused counters that just add complexity.

Error counter values are directly stored in struct dimm_info now.

Signed-off-by: Robert Richter <rrichter@xxxxxxxxxxx>
---
drivers/edac/edac_mc.c | 98 ++++++++++++------------------------
drivers/edac/edac_mc_sysfs.c | 20 +++-----
drivers/edac/ghes_edac.c | 5 +-
include/linux/edac.h | 7 ++-
4 files changed, 44 insertions(+), 86 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index f2acdab34eb7..bce39b2e10c9 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -313,10 +313,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
struct csrow_info *csr;
struct rank_info *chan;
struct dimm_info *dimm;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
unsigned pos[EDAC_MAX_LAYERS];
- unsigned size, tot_dimms = 1, count = 1;
- unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+ unsigned size, tot_dimms = 1;
+ unsigned tot_csrows = 1, tot_channels = 1;
void *pvt, *p, *ptr = NULL;
int idx, i, j, row, chn, n, len;
bool per_rank = false;
@@ -342,19 +341,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
* stringent as what the compiler would provide if we could simply
* hardcode everything into a single struct.
*/
- mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
- layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
- for (i = 0; i < n_layers; i++) {
- count *= layers[i].size;
- edac_dbg(4, "errcount layer %d size %d\n", i, count);
- ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
- tot_errcount += 2 * count;
- }
-
- edac_dbg(4, "allocating %d error counters\n", tot_errcount);
- pvt = edac_align_ptr(&ptr, sz_pvt, 1);
- size = ((unsigned long)pvt) + sz_pvt;
+ mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+ layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+ pvt = edac_align_ptr(&ptr, sz_pvt, 1);
+ size = ((unsigned long)pvt) + sz_pvt;

edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
size,
@@ -370,10 +360,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
* rather than an imaginary chunk of memory located at address 0.
*/
layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
- for (i = 0; i < n_layers; i++) {
- mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
- mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
- }
pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;

/* setup index and various internal pointers */
@@ -903,53 +889,31 @@ const char *edac_layer_name[] = {
EXPORT_SYMBOL_GPL(edac_layer_name);

static void edac_inc_ce_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);

mci->ce_mc += count;

- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ce_count += count;
+ else
mci->ce_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ce_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}

static void edac_inc_ue_error(struct mem_ctl_info *mci,
- bool enable_per_layer_report,
const int pos[EDAC_MAX_LAYERS],
const u16 count)
{
- int i, index = 0;
+ struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);

mci->ue_mc += count;

- if (!enable_per_layer_report) {
+ if (dimm)
+ dimm->ue_count += count;
+ else
mci->ue_noinfo_count += count;
- return;
- }
-
- for (i = 0; i < mci->n_layers; i++) {
- if (pos[i] < 0)
- break;
- index += pos[i];
- mci->ue_per_layer[i][index] += count;
-
- if (i < mci->n_layers - 1)
- index *= mci->layers[i + 1].size;
- }
}

static void edac_ce_error(struct mem_ctl_info *mci,
@@ -960,7 +924,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
const char *label,
const char *detail,
const char *other_detail,
- const bool enable_per_layer_report,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
long grain)
@@ -983,7 +946,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
error_count, msg, msg_aux, label,
location, detail);
}
- edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ce_error(mci, pos, error_count);

if (mci->scrub_mode == SCRUB_SW_SRC) {
/*
@@ -1013,8 +976,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
const char *location,
const char *label,
const char *detail,
- const char *other_detail,
- const bool enable_per_layer_report)
+ const char *other_detail)
{
char *msg_aux = "";

@@ -1043,7 +1005,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
msg, msg_aux, label, location, detail);
}

- edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
+ edac_inc_ue_error(mci, pos, error_count);
}

void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
@@ -1076,16 +1038,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
e->page_frame_number, e->offset_in_page,
e->grain, e->syndrome);
- edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report,
+ edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail,
e->page_frame_number, e->offset_in_page, e->grain);
} else {
snprintf(detail, sizeof(detail),
"page:0x%lx offset:0x%lx grain:%ld",
e->page_frame_number, e->offset_in_page, e->grain);

- edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
- detail, e->other_detail, e->enable_per_layer_report);
+ edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
+ e->label, detail, e->other_detail);
}


@@ -1110,6 +1072,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
int i, n_labels = 0;
struct edac_raw_error_desc *e = &mci->error_desc;
+ bool per_layer_report = false;

edac_dbg(3, "MC%d\n", mci->mc_idx);

@@ -1127,9 +1090,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,

/*
* Check if the event report is consistent and if the memory
- * location is known. If it is known, enable_per_layer_report will be
- * true, the DIMM(s) label info will be filled and the per-layer
- * error counters will be incremented.
+ * location is known. If it is known, the DIMM(s) label info
+ * will be filled and the per-DIMM error counters will be
+ * incremented.
*/
for (i = 0; i < mci->n_layers; i++) {
if (pos[i] >= (int)mci->layers[i].size) {
@@ -1147,7 +1110,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
pos[i] = -1;
}
if (pos[i] >= 0)
- e->enable_per_layer_report = true;
+ per_layer_report = true;
}

/*
@@ -1176,15 +1139,18 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
if (dimm->grain > e->grain)
e->grain = dimm->grain;

+ if (!per_layer_report)
+ continue;
+
/*
* If the error is memory-controller wide, there's no need to
* seek for the affected DIMMs because the whole
* channel/memory controller/... may be affected.
* Also, don't show errors for empty DIMM slots.
*/
- if (e->enable_per_layer_report && dimm->nr_pages) {
+ if (dimm->nr_pages) {
if (n_labels >= EDAC_MAX_LABELS) {
- e->enable_per_layer_report = false;
+ per_layer_report = false;
break;
}
n_labels++;
@@ -1215,7 +1181,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
}
}

- if (!e->enable_per_layer_report) {
+ if (!per_layer_report) {
strcpy(e->label, "any memory");
} else {
edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 4d15c88a52cd..a4c1b8501ff3 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -558,10 +558,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;

- count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ce_count);
}

static ssize_t dimmdev_ue_count_show(struct device *dev,
@@ -569,10 +567,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
- u32 count;

- count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
- return sprintf(data, "%u\n", count);
+ return sprintf(data, "%u\n", dimm->ue_count);
}

/* dimm/rank attribute files */
@@ -660,7 +656,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
const char *data, size_t count)
{
struct mem_ctl_info *mci = to_mci(dev);
- int cnt, row, chan, i;
+ struct dimm_info *dimm;
+ int row, chan;
+
mci->ue_mc = 0;
mci->ce_mc = 0;
mci->ue_noinfo_count = 0;
@@ -676,11 +674,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
ri->channels[chan]->ce_count = 0;
}

- cnt = 1;
- for (i = 0; i < mci->n_layers; i++) {
- cnt *= mci->layers[i].size;
- memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
- memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
+ mci_for_each_dimm(mci, dimm) {
+ dimm->ue_count = 0;
+ dimm->ce_count = 0;
}

mci->start_time = jiffies;
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 72e75ea5526c..757a02f2ce49 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -348,11 +348,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
mem_err->mem_dev_handle);

index = get_dimm_smbios_index(mem_err->mem_dev_handle);
- if (index >= 0) {
+ if (index >= 0)
e->top_layer = index;
- e->enable_per_layer_report = true;
- }
-
}
if (p > e->location)
*(p - 1) = '\0';
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 20a04f48616c..4dcf075e9dff 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -383,6 +383,9 @@ struct dimm_info {
unsigned csrow, cschannel; /* Points to the old API data */

u16 smbios_handle; /* Handle for SMBIOS type 17 */
+
+ u32 ce_count;
+ u32 ue_count;
};

/**
@@ -453,8 +456,6 @@ struct errcount_attribute_data {
* @location: location of the error
* @label: label of the affected DIMM(s)
* @other_detail: other driver-specific detail about the error
- * @enable_per_layer_report: if false, the error affects all layers
- * (typically, a memory controller error)
*/
struct edac_raw_error_desc {
/*
@@ -475,7 +476,6 @@ struct edac_raw_error_desc {
unsigned long syndrome;
const char *msg;
const char *other_detail;
- bool enable_per_layer_report;
};

/* MEMORY controller information structure
@@ -565,7 +565,6 @@ struct mem_ctl_info {
*/
u32 ce_noinfo_count, ue_noinfo_count;
u32 ue_mc, ce_mc;
- u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];

struct completion complete;

--
2.20.1