[RFC PATCH 3/4] EDAC/ghes: Add EDAC device for the CPU caches

From: Shiju Jose
Date: Thu Nov 05 2020 - 13:01:27 EST


Find CPU caches in the ACPI PPTT and add CPU EDAC device
and EDAC device blocks for the caches found.

For firmware-first error handling, add an interface in ghes_edac
to report the corrected error count of a CPU core to user space
through the CPU EDAC device.

Suggested-by: James Morse <james.morse@xxxxxxx>
Signed-off-by: Jonathan Cameron <jonathan.cameron@xxxxxxxxxx>
Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx>
---
drivers/edac/Kconfig | 10 +++
drivers/edac/ghes_edac.c | 135 +++++++++++++++++++++++++++++++++++++++
include/acpi/ghes.h | 27 ++++++++
3 files changed, 172 insertions(+)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 7a47680d6f07..3a0d8d134dcc 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -74,6 +74,16 @@ config EDAC_GHES

In doubt, say 'Y'.

+config EDAC_GHES_CPU_ERROR
+ bool "EDAC device for reporting firmware-first BIOS detected CPU error count"
+ depends on EDAC_GHES && ACPI_PPTT
+ help
+ EDAC device for the CPU error counts detected by firmware-first BIOS
+ and reported via ACPI APEI/GHES. Enabling this option creates an EDAC
+ device for the CPU hierarchy and EDAC device blocks for the cache
+ hierarchy. The CPU error count is exposed to userspace via the CPU
+ EDAC device's sysfs interface.
+
config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64)"
depends on AMD_NB && EDAC_DECODE_MCE
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index a918ca93e4f7..96619483e5f3 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -12,6 +12,9 @@
#include <acpi/ghes.h>
#include <linux/edac.h>
#include <linux/dmi.h>
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#include <linux/cacheinfo.h>
+#endif
#include "edac_module.h"
#include <ras/ras_event.h>

@@ -497,6 +500,130 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
spin_unlock_irqrestore(&ghes_lock, flags);
}

+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#define MAX_NUM_CACHES 20 /* capacity of the cacheinfo scratch array handed to acpi_find_cache_info() */
+static struct ghes_edac_cpu_block { /* one cache of one CPU, reported as one EDAC block */
+ int cpu; /* CPU id this entry was filled in for */
+ u8 level; /* cache level copied from the ACPI cache info */
+ u8 type; /* cache type copied from the ACPI cache info */
+ int block_nr; /* block number handed to the EDAC core when reporting */
+} *cpu_edac_block_list; /* rows indexed by CPU id, max_number_of_caches entries per row */
+
+static struct edac_device_ctl_info *cpu_edac_dev; /* the "cpu" EDAC device */
+static int max_number_of_caches; /* entries per CPU row; 0 makes the report path bail out */
+
+/**
+ * ghes_edac_report_cpu_error - account firmware-reported CPU cache CEs
+ * @einfo: CPU error details for one record; may be NULL
+ *
+ * Finds the cache entry of @einfo->cpu whose level and type match the
+ * record and adds @einfo->ce_count to the corresponding block of the CPU
+ * EDAC device. Records with a zero corrected-error count, an out-of-range
+ * CPU, or no matching cache are dropped silently, as are records that
+ * arrive while the CPU EDAC device is not set up.
+ */
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo)
+{
+	struct ghes_edac_cpu_block *block;
+	int i;
+
+	if (!einfo || !einfo->ce_count)
+		return;
+
+	/*
+	 * A non-zero cache count alone does not prove the table and the
+	 * device exist (setup may have bailed out part-way), so check all
+	 * three before touching anything.
+	 */
+	if (!max_number_of_caches || !cpu_edac_block_list || !cpu_edac_dev)
+		return;
+
+	/*
+	 * The CPU number comes from a firmware record and is used as an
+	 * index. The EDAC device has num_possible_cpus() instances and the
+	 * table has at least that many rows, so reject anything beyond.
+	 */
+	if (einfo->cpu < 0 || einfo->cpu >= num_possible_cpus())
+		return;
+
+	block = cpu_edac_block_list + einfo->cpu * max_number_of_caches;
+	for (i = 0; i < max_number_of_caches; i++, block++) {
+		if (block->level != einfo->cache_level ||
+		    block->type != einfo->cache_type)
+			continue;
+
+		edac_device_handle_ce_count(cpu_edac_dev, einfo->ce_count,
+					    einfo->cpu, block->block_nr, "");
+		break;
+	}
+}
+
+/**
+ * ghes_edac_add_cpu_device - allocate and register the CPU EDAC device
+ * @dev: device recorded as the owner of the EDAC control structure
+ *
+ * Allocates a control structure with one "cpu" instance per possible CPU
+ * and max_number_of_caches "cache" blocks per instance, then registers
+ * it. max_number_of_caches must therefore be set before calling this.
+ *
+ * Return: 0 on success, -ENOMEM if the control structure cannot be
+ * allocated, or the error from edac_device_add_device(). On any failure
+ * cpu_edac_dev is left NULL.
+ */
+static int ghes_edac_add_cpu_device(struct device *dev)
+{
+	int rc;
+
+	cpu_edac_dev = edac_device_alloc_ctl_info(0, "cpu", num_possible_cpus(),
+						  "cache", max_number_of_caches, 0, NULL,
+						  0, edac_device_alloc_index());
+	if (!cpu_edac_dev) {
+		pr_warn("edac_device_alloc_ctl_info for cpu_edac_dev failed\n");
+		return -ENOMEM;
+	}
+
+	cpu_edac_dev->dev = dev;
+	cpu_edac_dev->ctl_name = "cpu_edac_dev";
+	cpu_edac_dev->dev_name = "ghes";
+	cpu_edac_dev->mod_name = "ghes_edac.c";
+
+	rc = edac_device_add_device(cpu_edac_dev);
+	if (rc) {
+		pr_warn("edac_device_add_device failed\n");
+		edac_device_free_ctl_info(cpu_edac_dev);
+		/*
+		 * Drop the stale pointer: the teardown helper keys off
+		 * cpu_edac_dev being non-NULL and would otherwise delete
+		 * and free this control structure a second time.
+		 */
+		cpu_edac_dev = NULL;
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * ghes_edac_delete_cpu_device - tear down the CPU EDAC device and its table
+ *
+ * Unregisters and frees the CPU EDAC device if one exists, frees the
+ * per-CPU cache table, and resets the file-scope state so that calling
+ * this again, or setting the device up again later, starts from a clean
+ * slate.
+ */
+static void ghes_edac_delete_cpu_device(void)
+{
+	/* Zero the count first: the report path returns early when it is 0. */
+	max_number_of_caches = 0;
+
+	if (cpu_edac_dev) {
+		edac_device_del_device(cpu_edac_dev->dev);
+		edac_device_free_ctl_info(cpu_edac_dev);
+		/* Never leave a dangling pointer behind for a later call. */
+		cpu_edac_dev = NULL;
+	}
+
+	vfree(cpu_edac_block_list);
+	cpu_edac_block_list = NULL;
+}
+
+/**
+ * ghes_edac_create_cpu_device - build the CPU EDAC device from ACPI cache info
+ * @dev: device recorded as the owner of the EDAC control structure
+ *
+ * Queries the cache information of every possible CPU, builds a table
+ * with one row per CPU id and one entry per cache, and registers a CPU
+ * EDAC device with one block per cache. This is best effort: on any
+ * failure nothing is registered and the file-scope state is left as
+ * "not set up" (max_number_of_caches == 0, cpu_edac_block_list == NULL),
+ * so ghes_edac_report_cpu_error() simply drops records.
+ */
+static void ghes_edac_create_cpu_device(struct device *dev)
+{
+	struct acpi_cacheinfo cacheinfo[MAX_NUM_CACHES];
+	struct ghes_edac_cpu_block *block;
+	int nr_caches = 0;
+	int found, cpu, i;
+
+	/*
+	 * Find the maximum number of caches present in the cpu hierarchy
+	 * among the CPUs. Keep it in a local until the table exists so an
+	 * early return never leaves a non-zero global count without a table.
+	 */
+	for_each_possible_cpu(cpu) {
+		found = acpi_find_cache_info(cpu, &cacheinfo[0], MAX_NUM_CACHES);
+		if (found <= 0)
+			return;
+
+		if (nr_caches < found)
+			nr_caches = found;
+	}
+	if (!nr_caches)
+		return;
+
+	/*
+	 * EDAC device interface supports creating the CPU hierarchy for all
+	 * the CPUs together, so every row gets nr_caches entries even though
+	 * the number of caches per CPU might vary.
+	 *
+	 * Rows are indexed by CPU id, and ids can exceed num_possible_cpus()
+	 * when the possible mask is sparse, hence nr_cpu_ids rows.
+	 */
+	cpu_edac_block_list = vzalloc(nr_cpu_ids * nr_caches *
+				      sizeof(*cpu_edac_block_list));
+	if (!cpu_edac_block_list)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		memset(cacheinfo, 0, sizeof(cacheinfo));
+		found = acpi_find_cache_info(cpu, &cacheinfo[0], MAX_NUM_CACHES);
+		if (found <= 0)
+			goto error;
+		/* Never write past this CPU's row, whatever the lookup says. */
+		if (found > nr_caches)
+			found = nr_caches;
+
+		/*
+		 * The edac cpu cache device blocks entries in the sysfs should
+		 * match with the cpu cache structure in the sysfs so that the
+		 * affected cpus for a shared cache can be easily extracted in
+		 * the userspace.
+		 */
+		block = cpu_edac_block_list + cpu * nr_caches;
+		for (i = 0; i < found; i++, block++) {
+			block->cpu = cpu;
+			block->level = cacheinfo[i].level;
+			block->type = cacheinfo[i].type;
+			block->block_nr = i;
+		}
+	}
+
+	/*
+	 * Publish the count only now that the table is complete;
+	 * ghes_edac_add_cpu_device() sizes the device from this global.
+	 */
+	max_number_of_caches = nr_caches;
+
+	if (ghes_edac_add_cpu_device(dev)) {
+		/*
+		 * The add helper has already freed the control structure on
+		 * failure; forget the pointer so nothing frees it again.
+		 */
+		cpu_edac_dev = NULL;
+		goto error;
+	}
+
+	return;
+
+error:
+	/* No device is registered on this path; only the table needs undoing. */
+	max_number_of_caches = 0;
+	vfree(cpu_edac_block_list);
+	cpu_edac_block_list = NULL;
+}
+#endif
+
/*
* Known systems that are safe to enable this module.
*/
@@ -624,6 +751,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
ghes_pvt = pvt;
spin_unlock_irqrestore(&ghes_lock, flags);

+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+ ghes_edac_create_cpu_device(dev);
+#endif
+
/* only set on success */
refcount_set(&ghes_refcount, 1);

@@ -654,6 +785,10 @@ void ghes_edac_unregister(struct ghes *ghes)
if (!refcount_dec_and_test(&ghes_refcount))
goto unlock;

+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+ ghes_edac_delete_cpu_device();
+#endif
+
/*
* Wait for the irq handler being finished.
*/
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 34fb3431a8f3..a9098daf53d4 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -73,6 +73,24 @@ void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);

int ghes_estatus_pool_init(int num_ghes);

+/**
+ * struct ghes_einfo_cpu - structure to pass cpu error info to the edac
+ * @cpu: CPU index; ghes_edac uses it to select the per-CPU cache entries.
+ * @error_type: error type, cache/TLB/bus/ etc.
+ * @cache_level: cache level; matched against the level recorded per cache.
+ * @cache_type: ACPI cache type; matched together with @cache_level.
+ * @ue_count: CPU uncorrectable error count.
+ * @ce_count: CPU correctable error count; ghes_edac_report_cpu_error()
+ *            ignores records where this is zero.
+ */
+struct ghes_einfo_cpu {
+ int cpu;
+ u8 error_type;
+ u8 cache_level;
+ u8 cache_type;
+ u16 ue_count;
+ u16 ce_count;
+};
+
/* From drivers/edac/ghes_edac.c */

#ifdef CONFIG_EDAC_GHES
@@ -98,6 +116,15 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
}
#endif

+#ifdef CONFIG_EDAC_GHES_CPU_ERROR
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu);
+
+#else /* !CONFIG_EDAC_GHES_CPU_ERROR: empty stub so callers need no #ifdef */
+static inline void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu)
+{
+}
+#endif /* CONFIG_EDAC_GHES_CPU_ERROR */
+
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
{
return gdata->revision >> 8;
--
2.17.1