[PATCH 2/2] edac: Add support to handle DE (Deferred Errors)

From: Kamati Srinivas
Date: Fri Mar 03 2023 - 07:55:00 EST


The EDAC device subsystem has no support for DEs (Deferred Errors),
so every DE currently has to be reported as either a CE (Corrected
Error) or a UE (Uncorrected Error).

Add a DE counter, a DE logging control and the associated sysfs
entries so that an EDAC device driver can be configured to handle
DEs separately.

Signed-off-by: Kamati Srinivas <quic_kamasrin@xxxxxxxxxxx>
---
drivers/edac/edac_device.c | 53 +++++++++++++++++++++++++++++++-
drivers/edac/edac_device.h | 30 ++++++++++++++++++
drivers/edac/edac_device_sysfs.c | 38 +++++++++++++++++++++++
3 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index ddfa094d0f3a..28c355d07304 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -131,9 +131,10 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info(
dev_ctl->instances = dev_inst;
dev_ctl->pvt_info = pvt;

- /* Default logging of CEs and UEs */
+ /* Default logging of CEs, UEs and DEs */
dev_ctl->log_ce = 1;
dev_ctl->log_ue = 1;
+ dev_ctl->log_de = 1;

/* Name of this edac device */
snprintf(dev_ctl->name,sizeof(dev_ctl->name),"%s",edac_device_name);
@@ -544,6 +545,11 @@ static inline bool edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
return edac_dev->log_ce;
}

+static inline bool edac_device_get_log_de(struct edac_device_ctl_info *edac_dev)
+{
+ return edac_dev->log_de; /* true: DE reports are printed */
+}
+
static inline bool edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
{
return edac_dev->log_ue;
@@ -601,6 +607,51 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
}
EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);

+void edac_device_handle_de_count(struct edac_device_ctl_info *edac_dev,
+ unsigned int count, int inst_nr, int block_nr,
+ const char *msg)
+{
+ struct edac_device_instance *instance;
+ struct edac_device_block *block = NULL;
+
+ if (!count)
+ return;
+ /* Validate inst_nr before it is used to index edac_dev->instances */
+ if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
+ edac_device_printk(edac_dev, KERN_ERR,
+ "INTERNAL ERROR: 'instance' out of range (%d >= %d)\n",
+ inst_nr,
+ edac_dev->nr_instances);
+ return;
+ }
+
+ instance = edac_dev->instances + inst_nr;
+ /* Validate block_nr against this instance's block array */
+ if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
+ edac_device_printk(edac_dev, KERN_ERR,
+ "INTERNAL ERROR: instance %d 'block' out of range (%d >= %d)\n",
+ inst_nr, block_nr,
+ instance->nr_blocks);
+ return;
+ }
+ /* Always true: the check above ensures 0 <= block_nr < nr_blocks */
+ if (instance->nr_blocks > 0) {
+ block = instance->blocks + block_nr;
+ block->counters.de_count += count;
+ }
+
+ /* Propagate the count up the 'totals' tree */
+ instance->counters.de_count += count;
+ edac_dev->counters.de_count += count;
+
+ if (edac_device_get_log_de(edac_dev))
+ edac_device_printk(edac_dev, KERN_WARNING,
+ "DE: %s instance: %s block: %s count: %u '%s'\n",
+ edac_dev->ctl_name, instance->name,
+ block ? block->name : "N/A", count, msg);
+}
+EXPORT_SYMBOL_GPL(edac_device_handle_de_count);
+
void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
unsigned int count, int inst_nr, int block_nr,
const char *msg)
diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index e1645332eaa2..883557bc0182 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -74,6 +74,7 @@
struct edac_device_counter {
u32 ue_count;
u32 ce_count;
+ u32 de_count;
};

/* forward reference */
@@ -159,6 +160,7 @@ struct edac_device_ctl_info {
/* Per instance controls for this edac_device */
bool log_ue; /* boolean for logging UEs */
bool log_ce; /* boolean for logging CEs */
+ bool log_de; /* boolean for logging DEs */
int panic_on_ue; /* boolean for panic'ing on an UE */
unsigned poll_msec; /* number of milliseconds to poll interval */
unsigned long delay; /* number of jiffies for poll_msec */
@@ -298,6 +300,19 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
unsigned int count, int inst_nr, int block_nr,
const char *msg);

+/**
+ * edac_device_handle_de_count(): Log deferred errors.
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @count: Number of errors to log.
+ * @inst_nr: number of the instance where the deferred error happened
+ * @block_nr: number of the block where the deferred error happened
+ * @msg: message to be printed
+ */
+void edac_device_handle_de_count(struct edac_device_ctl_info *edac_dev,
+ unsigned int count, int inst_nr, int block_nr,
+ const char *msg);
+
/**
* Log uncorrectable errors.
*
@@ -341,6 +356,21 @@ edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, int inst_nr,
edac_device_handle_ue_count(edac_dev, 1, inst_nr, block_nr, msg);
}

+/**
+ * edac_device_handle_de(): Log a single deferred error
+ *
+ * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @inst_nr: number of the instance where the deferred error happened
+ * @block_nr: number of the block where the deferred error happened
+ * @msg: message to be printed
+ */
+static inline void
+edac_device_handle_de(struct edac_device_ctl_info *edac_dev, int inst_nr,
+ int block_nr, const char *msg)
+{
+ edac_device_handle_de_count(edac_dev, 1, inst_nr, block_nr, msg);
+}
+
/**
* edac_device_alloc_index: Allocate a unique device index number
*
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 51a3a90d7404..76fc50ff8503 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -62,6 +62,23 @@ static ssize_t edac_device_ctl_log_ce_store(struct edac_device_ctl_info
return ret ? ret : count;
}

+/* 'log_de': control whether deferred errors (DEs) are logged */
+static ssize_t edac_device_ctl_log_de_show(struct edac_device_ctl_info
+ *ctl_info, char *data)
+{
+ return sprintf(data, "%u\n", ctl_info->log_de);
+}
+
+static ssize_t edac_device_ctl_log_de_store(struct edac_device_ctl_info
+ *ctl_info, const char *data,
+ size_t count)
+{
+ int err = kstrtobool(data, &ctl_info->log_de);
+
+ /* A parse failure is returned as-is; otherwise claim the whole write */
+ return err ? err : count;
+}
+
/* 'panic_on_ue' */
static ssize_t edac_device_ctl_panic_on_ue_show(struct edac_device_ctl_info
*ctl_info, char *data)
@@ -156,6 +173,8 @@ CTL_INFO_ATTR(log_ue, S_IRUGO | S_IWUSR,
edac_device_ctl_log_ue_show, edac_device_ctl_log_ue_store);
CTL_INFO_ATTR(log_ce, S_IRUGO | S_IWUSR,
edac_device_ctl_log_ce_show, edac_device_ctl_log_ce_store);
+CTL_INFO_ATTR(log_de, S_IRUGO | S_IWUSR,
+ edac_device_ctl_log_de_show, edac_device_ctl_log_de_store);
CTL_INFO_ATTR(panic_on_ue, S_IRUGO | S_IWUSR,
edac_device_ctl_panic_on_ue_show,
edac_device_ctl_panic_on_ue_store);
@@ -167,6 +186,7 @@ static struct ctl_info_attribute *device_ctrl_attr[] = {
&attr_ctl_info_panic_on_ue,
&attr_ctl_info_log_ue,
&attr_ctl_info_log_ce,
+ &attr_ctl_info_log_de,
&attr_ctl_info_poll_msec,
NULL,
};
@@ -318,6 +338,12 @@ static ssize_t instance_ce_count_show(struct edac_device_instance *instance,
return sprintf(data, "%u\n", instance->counters.ce_count);
}

+static ssize_t
+instance_de_count_show(struct edac_device_instance *instance, char *data)
+{
+ return sprintf(data, "%u\n", instance->counters.de_count);
+}
+
#define to_instance(k) container_of(k, struct edac_device_instance, kobj)
#define to_instance_attr(a) container_of(a,struct instance_attribute,attr)

@@ -387,11 +413,13 @@ static struct instance_attribute attr_instance_##_name = { \
*/
INSTANCE_ATTR(ce_count, S_IRUGO, instance_ce_count_show, NULL);
INSTANCE_ATTR(ue_count, S_IRUGO, instance_ue_count_show, NULL);
+INSTANCE_ATTR(de_count, S_IRUGO, instance_de_count_show, NULL);

/* list of edac_dev 'instance' attributes */
static struct instance_attribute *device_instance_attr[] = {
&attr_instance_ce_count,
&attr_instance_ue_count,
+ &attr_instance_de_count,
NULL,
};

@@ -427,6 +455,14 @@ static ssize_t block_ce_count_show(struct kobject *kobj,
return sprintf(data, "%u\n", block->counters.ce_count);
}

+static ssize_t block_de_count_show(struct kobject *kobj,
+ struct attribute *attr, char *data)
+{
+ u32 nr_deferred = to_block(kobj)->counters.de_count;
+
+ return sprintf(data, "%u\n", nr_deferred);
+}
+
/* DEVICE block kobject release() function */
static void edac_device_ctrl_block_release(struct kobject *kobj)
{
@@ -485,11 +521,13 @@ static struct edac_dev_sysfs_block_attribute attr_block_##_name = { \

BLOCK_ATTR(ce_count, S_IRUGO, block_ce_count_show, NULL);
BLOCK_ATTR(ue_count, S_IRUGO, block_ue_count_show, NULL);
+BLOCK_ATTR(de_count, S_IRUGO, block_de_count_show, NULL);

/* list of edac_dev 'block' attributes */
static struct edac_dev_sysfs_block_attribute *device_block_attr[] = {
&attr_block_ce_count,
&attr_block_ue_count,
+ &attr_block_de_count,
NULL,
};

--
2.17.1