On Thu, Dec 05, 2019 at 09:53:18AM +0000, Sai Prakash Ranjan wrote:
Kryo{3,4}XX CPU cores implement RAS extensions to support
Error Correcting Code(ECC). Currently all Kryo{3,4}XX CPU
cores (gold/silver a.k.a big/LITTLE) support ECC via RAS.
via RAS what? ARM64_RAS_EXTN?
In any case, this needs James to look at it, especially if there's some
ARM-generic functionality in there which should be shared, of course.
This adds an interrupt based driver for those CPUs and
s/This adds/Add/
+
+config EDAC_QCOM_KRYO_POLL
+ depends on EDAC_QCOM_KRYO
+ bool "Poll on Kryo ECC registers"
+ help
+ This option chooses whether or not you want to poll on the Kryo ECC
+ registers. When this is enabled, the polling rate can be set as a
+ module parameter. By default, it will call the polling function every
+ second.
Why is this a separate option and why should people use that?
Can the polling/irq be switched automatically?
+
config EDAC_ASPEED
tristate "Aspeed AST 2500 SoC"
depends on MACH_ASPEED_G5
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index d77200c9680b..29edcfa6ec0e 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -85,5 +85,6 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
obj-$(CONFIG_EDAC_TI) += ti_edac.o
obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
+obj-$(CONFIG_EDAC_QCOM_KRYO) += qcom_kryo_edac.o
What is the difference between this new driver and the qcom_edac one? Can
functionality be shared?
Should this new one be called simply kryo_edac instead?
+
+#define DRV_NAME "qcom_kryo_edac"
+
+/*
+ * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.3
Chapter? Where? URL?
+
+static const struct error_type err_type[] = {
+ { edac_device_handle_ce, "Kryo L1 Corrected Error" },
+ { edac_device_handle_ue, "Kryo L1 Uncorrected Error" },
+ { edac_device_handle_ue, "Kryo L1 Deferred Error" },
+ { edac_device_handle_ce, "Kryo L2 Corrected Error" },
+ { edac_device_handle_ue, "Kryo L2 Uncorrected Error" },
+ { edac_device_handle_ue, "Kryo L2 Deferred Error" },
+ { edac_device_handle_ce, "L3 Corrected Error" },
+ { edac_device_handle_ue, "L3 Uncorrected Error" },
+ { edac_device_handle_ue, "L3 Deferred Error" },
+};
+
All that is not really needed - you can put the whole error type
detection and dumping in kryo_check_err_type() in a nicely readable
switch-case statement. No need for the function pointers and special
structs.
+static struct edac_device_ctl_info __percpu *edac_dev;
+static struct edac_device_ctl_info *drv_edev_ctl;
+
+static const char *get_error_msg(u64 errxstatus)
+{
+ const struct error_record *rec;
+ u32 errxstatus_serr;
+
+ errxstatus_serr = FIELD_GET(KRYO_ERRXSTATUS_SERR, errxstatus);
+
+ for (rec = serror_record; rec->error_code; rec++) {
+ if (errxstatus_serr == rec->error_code)
+ return rec->error_msg;
+ }
+
+ return NULL;
+}
+
+static void dump_syndrome_reg(int error_type, int level,
+ u64 errxstatus, u64 errxmisc,
+ struct edac_device_ctl_info *edev_ctl)
+{
+ char msg[KRYO_EDAC_MSG_MAX];
+ const char *error_msg;
+ int cpu;
+
+ cpu = raw_smp_processor_id();
Why raw_?
+static int kryo_l1_l2_setup_irq(struct platform_device *pdev,
+ struct edac_device_ctl_info *edev_ctl)
+{
+ int cpu, errirq, faultirq, ret;
+
+ edac_dev = devm_alloc_percpu(&pdev->dev, *edac_dev);
+ if (!edac_dev)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ per_cpu(edac_dev, cpu) = edev_ctl;
+ preempt_enable();
+ }
That silliness doesn't belong here, if at all.
...
+static void kryo_poll_cache_error(struct edac_device_ctl_info *edev_ctl)
+{
+ if (!edev_ctl)
+ edev_ctl = drv_edev_ctl;
That's silly.
+
+ on_each_cpu(kryo_check_l1_l2_ecc, edev_ctl, 1);
+ kryo_check_l3_scu_ecc(edev_ctl);
+}
...
+static int qcom_kryo_edac_probe(struct platform_device *pdev)
+{
+ struct edac_device_ctl_info *edev_ctl;
+ struct device *dev = &pdev->dev;
+ int ret;
+
+ qcom_kryo_edac_setup();
This function needs to have a return value saying whether it did set up
the hw properly or not, and the probe function needs to return here if
not.