Re: [PATCH 2/2] drivers: edac: Add EDAC support for Kryo CPU caches
From: Evan Green
Date: Wed Dec 11 2019 - 14:38:59 EST
Hi Sai,
On Thu, Dec 5, 2019 at 1:53 AM Sai Prakash Ranjan
<saiprakash.ranjan@xxxxxxxxxxxxxx> wrote:
>
> Kryo{3,4}XX CPU cores implement RAS extensions to support
> Error Correcting Code(ECC). Currently all Kryo{3,4}XX CPU
> cores (gold/silver a.k.a big/LITTLE) support ECC via RAS.
> This adds an interrupt based driver for those CPUs and
> provides an optional polling of error recording system
> registers.
>
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx>
> ---
> MAINTAINERS | 7 +
> drivers/edac/Kconfig | 20 +
> drivers/edac/Makefile | 1 +
> drivers/edac/qcom_kryo_edac.c | 679 ++++++++++++++++++++++++++++++++++
> 4 files changed, 707 insertions(+)
> create mode 100644 drivers/edac/qcom_kryo_edac.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index c2d80079dccc..f58c93f963f6 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6049,6 +6049,13 @@ L: linux-edac@xxxxxxxxxxxxxxx
> S: Maintained
> F: drivers/edac/qcom_edac.c
>
> +EDAC-KRYO-QCOM
> +M: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx>
> +L: linux-arm-msm@xxxxxxxxxxxxxxx
> +L: linux-edac@xxxxxxxxxxxxxxx
> +S: Maintained
> +F: drivers/edac/qcom_kryo_edac.c
> +
> EDIROL UA-101/UA-1000 DRIVER
> M: Clemens Ladisch <clemens@xxxxxxxxxx>
> L: alsa-devel@xxxxxxxxxxxxxxxx (moderated for non-subscribers)
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 417dad635526..cd78ac2917c9 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -508,6 +508,26 @@ config EDAC_QCOM
> For debugging issues having to do with stability and overall system
> health, you should probably say 'Y' here.
>
> +config EDAC_QCOM_KRYO
> + tristate "QCOM Kryo EDAC for CPU L1/L2/L3-SCU caches"
> + depends on ARCH_QCOM && ARM64_RAS_EXTN
> + help
> + Support for Error detection and correction on Kryo Gold and Silver CPU
> + cores with RAS extensions. Currently it detects and reports all Single
> + Bit Errors (SBEs) and Double Bit Errors (DBEs).
> +
> + For debugging issues having to do with stability and overall system
> + health, you should probably say 'Y' here.
> +
> +config EDAC_QCOM_KRYO_POLL
> + depends on EDAC_QCOM_KRYO
> + bool "Poll on Kryo ECC registers"
> + help
> + This option chooses whether or not you want to poll on the Kryo ECC
> + registers. When this is enabled, the polling rate can be set as a
> + module parameter. By default, it will call the polling function every
> + second.
> +
> config EDAC_ASPEED
> tristate "Aspeed AST 2500 SoC"
> depends on MACH_ASPEED_G5
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index d77200c9680b..29edcfa6ec0e 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -85,5 +85,6 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
> obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
> obj-$(CONFIG_EDAC_TI) += ti_edac.o
> obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
> +obj-$(CONFIG_EDAC_QCOM_KRYO) += qcom_kryo_edac.o
> obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o
> obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o
> diff --git a/drivers/edac/qcom_kryo_edac.c b/drivers/edac/qcom_kryo_edac.c
> new file mode 100644
> index 000000000000..05b60ad3cb0e
> --- /dev/null
> +++ b/drivers/edac/qcom_kryo_edac.c
> @@ -0,0 +1,679 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2019, The Linux Foundation. All rights reserved.
> + */
> +
> +#include <linux/bitfield.h>
> +#include <linux/cpu_pm.h>
> +#include <linux/of_irq.h>
> +#include <linux/smp.h>
> +
> +#include <asm/cputype.h>
> +#include <asm/sysreg.h>
> +
> +#include "edac_device.h"
> +#include "edac_mc.h"
> +
> +#define DRV_NAME "qcom_kryo_edac"
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.3
> + * ARM DSU TRM Chapter B2.3
> + * CFI = Corrected Fault Handling interrupt, FI = Fault handling interrupt
> + * UI = Uncorrected error recovery interrupt, ED = Error Detection
> + */
> +#define KRYO_ERRXCTLR_ED BIT(0)
> +#define KRYO_ERRXCTLR_UI BIT(2)
> +#define KRYO_ERRXCTLR_FI BIT(3)
> +#define KRYO_ERRXCTLR_CFI BIT(8)
> +#define KRYO_ERRXCTLR_ENABLE (KRYO_ERRXCTLR_CFI | KRYO_ERRXCTLR_FI | \
> + KRYO_ERRXCTLR_UI | KRYO_ERRXCTLR_ED)
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.4
> + * ARM DSU TRM Chapter B2.4
> + */
> +#define KRYO_ERRXFR_ED GENMASK(1, 0)
> +#define KRYO_ERRXFR_DE GENMASK(3, 2)
> +#define KRYO_ERRXFR_UI GENMASK(5, 4)
> +#define KRYO_ERRXFR_FI GENMASK(7, 6)
> +#define KRYO_ERRXFR_UE GENMASK(9, 8)
> +#define KRYO_ERRXFR_CFI GENMASK(11, 10)
> +#define KRYO_ERRXFR_CEC GENMASK(14, 12)
> +#define KRYO_ERRXFR_RP BIT(15)
> +#define KRYO_ERRXFR_SUPPORTED (KRYO_ERRXFR_ED | KRYO_ERRXFR_DE | \
> + KRYO_ERRXFR_UI | KRYO_ERRXFR_FI | \
> + KRYO_ERRXFR_UE | KRYO_ERRXFR_CFI | \
> + KRYO_ERRXFR_CEC | KRYO_ERRXFR_RP)
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.5
> + * ARM DSU TRM Chapter B2.5
> + */
> +#define KRYO_ERRXMISC0_CECR GENMASK_ULL(38, 32)
> +#define KRYO_ERRXMISC0_CECO GENMASK_ULL(46, 40)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.5 */
> +#define KRYO_ERRXMISC0_UNIT GENMASK(3, 0)
> +#define KRYO_ERRXMISC0_LVL GENMASK(3, 1)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.10 has SERR bitfields 4:0
> + * but Cortex-A55, Cortex-A75 and DSU TRM has SERR bitfields 7:0.
> + * Since max error record is 21, we can use bitfields 4:0 for
> + * Kryo{3,4}XX CPUs.
> + */
> +#define KRYO_ERRXSTATUS_SERR GENMASK(4, 0)
> +#define KRYO_ERRXSTATUS_DE BIT(23)
> +#define KRYO_ERRXSTATUS_CE GENMASK(25, 24)
> +#define KRYO_ERRXSTATUS_MV BIT(26)
> +#define KRYO_ERRXSTATUS_UE BIT(29)
> +#define KRYO_ERRXSTATUS_VALID BIT(30)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.5
> + * IC = Instruction Cache, DC = Data Cache
> + */
> +#define KRYO_L1_UNIT_IC 0x1
> +#define KRYO_L2_UNIT_TLB 0x2
> +#define KRYO_L1_UNIT_DC 0x4
> +#define KRYO_L2_UNIT 0x8
> +
> +/*
> + * ARM Cortex-A55 TRM Chapter B2.36
> + * ARM Cortex-A75, Cortex-A76 TRM Chapter B2.37
> + */
> +#define KRYO_ERR_RECORD_L1_L2 0x0
> +#define KRYO_ERR_RECORD_L3 0x1
> +
> +/* ARM DSU TRM Chapter B2.10 */
> +#define BUS_ERROR 0x12
> +
> +/* QCOM Kryo CPU part numbers */
> +#define KRYO3XX_GOLD 0x802
> +#define KRYO4XX_GOLD 0x804
> +#define KRYO4XX_SILVER_V1 0x803
> +#define KRYO4XX_SILVER_V2 0x805
> +
> +#define KRYO_EDAC_MSG_MAX 256
> +
> +static int poll_msec = 1000;
> +module_param(poll_msec, int, 0444);
> +
> +enum {
> + KRYO_L1 = 0,
> + KRYO_L2,
> + KRYO_L3,
> +};
> +
> +/* CE = Corrected Error, UE = Uncorrected Error, DE = Deferred Error */
> +enum {
No name?
> + KRYO_L1_CE = 0,
> + KRYO_L1_UE,
> + KRYO_L1_DE,
> + KRYO_L2_CE,
> + KRYO_L2_UE,
> + KRYO_L2_DE,
> + KRYO_L3_CE,
> + KRYO_L3_UE,
> + KRYO_L3_DE,
> +};
> +
> +struct error_record {
> + u32 error_code;
> + const char *error_msg;
> +};
> +
> +struct error_type {
> + void (*fn)(struct edac_device_ctl_info *edev_ctl,
> + int inst_nr, int block_nr, const char *msg);
> + const char *msg;
> +};
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.10
> + * ARM DSU TRM Chapter B2.10
> + */
> +static const struct error_record serror_record[] = {
> + { 0x1, "Errors due to fault injection" },
> + { 0x2, "ECC error from internal data buffer" },
> + { 0x6, "ECC error on cache data RAM" },
> + { 0x7, "ECC error on cache tag or dirty RAM" },
> + { 0x8, "Parity error on TLB data RAM" },
> + { 0x9, "Parity error on TLB tag RAM" },
> + { 0x12, "Error response for a cache copyback" },
> + { 0x15, "Deferred error not supported" },
> +};
> +
> +static const struct error_type err_type[] = {
> + { edac_device_handle_ce, "Kryo L1 Corrected Error" },
> + { edac_device_handle_ue, "Kryo L1 Uncorrected Error" },
> + { edac_device_handle_ue, "Kryo L1 Deferred Error" },
> + { edac_device_handle_ce, "Kryo L2 Corrected Error" },
> + { edac_device_handle_ue, "Kryo L2 Uncorrected Error" },
> + { edac_device_handle_ue, "Kryo L2 Deferred Error" },
> + { edac_device_handle_ce, "L3 Corrected Error" },
> + { edac_device_handle_ue, "L3 Uncorrected Error" },
> + { edac_device_handle_ue, "L3 Deferred Error" },
> +};
A comment is warranted to indicate that err_type is indexed by the
enum, as this would be easy to mess up in later changes.
> +
> +static struct edac_device_ctl_info __percpu *edac_dev;
> +static struct edac_device_ctl_info *drv_edev_ctl;
> +
> +static const char *get_error_msg(u64 errxstatus)
> +{
> + const struct error_record *rec;
> + u32 errxstatus_serr;
> +
> + errxstatus_serr = FIELD_GET(KRYO_ERRXSTATUS_SERR, errxstatus);
> +
> + for (rec = serror_record; rec->error_code; rec++) {
It looks like you expect the table to be zero terminated, but it's
not. Add the missing zero entry.
> + if (errxstatus_serr == rec->error_code)
> + return rec->error_msg;
> + }
> +
> + return NULL;
> +}
> +
> +static void dump_syndrome_reg(int error_type, int level,
> + u64 errxstatus, u64 errxmisc,
> + struct edac_device_ctl_info *edev_ctl)
> +{
> + char msg[KRYO_EDAC_MSG_MAX];
> + const char *error_msg;
> + int cpu;
> +
> + cpu = raw_smp_processor_id();
> +
> + error_msg = get_error_msg(errxstatus);
> + if (!error_msg)
> + return;
> +
> + snprintf(msg, KRYO_EDAC_MSG_MAX,
> + "CPU%d: %s, ERRXSTATUS_EL1:%#llx ERRXMISC0_EL1:%#llx, %s",
> + cpu, err_type[error_type].msg, errxstatus, errxmisc,
> + error_msg);
> +
> + err_type[error_type].fn(edev_ctl, 0, level, msg);
> +}
> +
> +static void kryo_check_err_type(u64 errxstatus, u64 errxmisc,
> + struct edac_device_ctl_info *edev_ctl,
> + int level)
> +{
> + u32 errxstatus_ue, errxstatus_ce, errxstatus_de;
> +
> + errxstatus_ce = FIELD_GET(KRYO_ERRXSTATUS_CE, errxstatus);
> + errxstatus_ue = FIELD_GET(KRYO_ERRXSTATUS_UE, errxstatus);
> + errxstatus_de = FIELD_GET(KRYO_ERRXSTATUS_DE, errxstatus);
> +
> + switch (level) {
> + case KRYO_L1:
> + if (errxstatus_ce)
> + dump_syndrome_reg(KRYO_L1_CE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_ue)
> + dump_syndrome_reg(KRYO_L1_UE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_de)
> + dump_syndrome_reg(KRYO_L1_DE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else
> + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> + break;
> + case KRYO_L2:
> + if (errxstatus_ce)
> + dump_syndrome_reg(KRYO_L2_CE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_ue)
> + dump_syndrome_reg(KRYO_L2_UE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_de)
> + dump_syndrome_reg(KRYO_L2_DE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else
> + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> + break;
> + case KRYO_L3:
> + if (errxstatus_ce)
> + dump_syndrome_reg(KRYO_L3_CE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_ue)
> + dump_syndrome_reg(KRYO_L3_UE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else if (errxstatus_de)
> + dump_syndrome_reg(KRYO_L3_DE, level, errxstatus,
> + errxmisc, edev_ctl);
> + else
> + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> + break;
> + default:
> + edac_printk(KERN_ERR, DRV_NAME, "Unknown level\n");
> + }
> +}
> +
> +static inline void kryo_clear_error(u64 errxstatus)
> +{
> + write_sysreg_s(errxstatus, SYS_ERXSTATUS_EL1);
> + isb();
Is the isb() necessary? If so, why not a dsb as well?
> +}
> +
> +static void kryo_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc,
> + struct edac_device_ctl_info *edev_ctl,
> + int cpu)
> +{
> + u32 part_num = read_cpuid_part_number();
> +
> + switch (part_num) {
> + /* Kryo3XX gold CPU cores do not have a UNIT bitfield */
> + case KRYO3XX_GOLD:
> + case KRYO4XX_SILVER_V1:
> + case KRYO4XX_SILVER_V2:
> + switch (FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc)) {
> + case KRYO_L1:
> + kryo_check_err_type(errxstatus, errxmisc,
> + edev_ctl, KRYO_L1);
> + break;
> + case KRYO_L2:
> + kryo_check_err_type(errxstatus, errxmisc,
> + edev_ctl, KRYO_L2);
> + break;
> + default:
> + edac_printk(KERN_ERR, DRV_NAME,
> + "silver cpu:%d unknown error: %lu\n", cpu,
> + FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc));
> + }
> + break;
> + /* Kryo4XX gold CPU cores have a UNIT bitfield to identify levels */
> + case KRYO4XX_GOLD:
> + switch (FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc)) {
> + case KRYO_L1_UNIT_DC:
> + case KRYO_L1_UNIT_IC:
> + kryo_check_err_type(errxstatus, errxmisc,
> + edev_ctl, KRYO_L1);
> + break;
> + case KRYO_L2_UNIT:
> + case KRYO_L2_UNIT_TLB:
> + kryo_check_err_type(errxstatus, errxmisc,
> + edev_ctl, KRYO_L2);
> + break;
> + default:
> + edac_printk(KERN_ERR, DRV_NAME,
> + "gold cpu:%d unknown error: %lu\n", cpu,
> + FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc));
> + }
> + break;
> + default:
> + edac_printk(KERN_ERR, DRV_NAME,
> + "Error in matching cpu%d with part num:%u\n",
> + cpu, part_num);
> + }
> +}
> +
> +static inline bool kryo_check_regs_valid(u64 errxstatus)
> +{
> + /* Check if status and misc regs are valid */
> + if (!(FIELD_GET(KRYO_ERRXSTATUS_VALID, errxstatus)) ||
> + !(FIELD_GET(KRYO_ERRXSTATUS_MV, errxstatus)))
> + return false;
> +
> + return true;
> +}
> +
> +static void kryo_check_l1_l2_ecc(void *info)
> +{
> + struct edac_device_ctl_info *edev_ctl = info;
> + u64 errxstatus;
> + u64 errxmisc;
> + int cpu;
> +
> + cpu = smp_processor_id();
> + /* We know record 0 is L1 and L2 */
> + write_sysreg_s(0, SYS_ERRSELR_EL1);
> + isb();
Another isb I'm not sure about. Is this meant to provide a barrier
between ERRSELR and ERXSTATUS? Wouldn't that be dsb, not isb?
> +
> + errxstatus = read_sysreg_s(SYS_ERXSTATUS_EL1);
> + if (!kryo_check_regs_valid(errxstatus))
> + return;
> +
> + errxmisc = read_sysreg_s(SYS_ERXMISC0_EL1);
> + /* Check if L1/L2 error */
> + if (!(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L1) &&
> + !(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L2))
> + return;
> +
> + kryo_parse_l1_l2_cache_error(errxstatus, errxmisc, edev_ctl, cpu);
> + kryo_clear_error(errxstatus);
> +}
> +