Re: [PATCH v4 2/2] soc: amazon: al-pos-edac: Introduce Amazon's Annapurna Labs POS EDAC driver

From: Marc Zyngier
Date: Mon Oct 07 2019 - 07:26:37 EST


On Thu, 03 Oct 2019 12:32:41 +0100,
Talel Shenhar <talel@xxxxxxxxxx> wrote:
>
> Amazon's Annapurna Labs SoCs include a Point Of Serialization (POS) error
> logging unit that reports an error in case of a write error (e.g. an attempt
> to write to a read-only register).
> This error shall be reported to the EDAC subsystem as an uncorrectable error.
>
> Signed-off-by: Talel Shenhar <talel@xxxxxxxxxx>
> ---
> MAINTAINERS | 7 ++
> drivers/edac/Kconfig | 6 ++
> drivers/edac/Makefile | 1 +
> drivers/edac/al_pos_edac.c | 173 +++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 187 insertions(+)
> create mode 100644 drivers/edac/al_pos_edac.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e7a47b5..f5ce446 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -751,6 +751,13 @@ F: drivers/tty/serial/altera_jtaguart.c
> F: include/linux/altera_uart.h
> F: include/linux/altera_jtaguart.h
>
> +AMAZON ANNAPURNA LABS POS EDAC DRIVER
> +M: Talel Shenhar <talel@xxxxxxxxxx>
> +M: Talel Shenhar <talelshenhar@xxxxxxxxx>
> +S: Maintained
> +F: Documentation/devicetree/bindings/edac/amazon,al-pos-edac.yaml
> F: drivers/edac/al_pos_edac.c
> +
> AMAZON ANNAPURNA LABS THERMAL MMIO DRIVER
> M: Talel Shenhar <talel@xxxxxxxxxx>
> S: Maintained
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 200c04c..bb5805f 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -100,6 +100,12 @@ config EDAC_AMD64_ERROR_INJECTION
> In addition, there are two control files, inject_read and inject_write,
> which trigger the DRAM ECC Read and Write respectively.
>
> +config EDAC_AL_POS
> + tristate "Amazon's Annapurna Labs POS EDAC driver"
> + depends on (ARCH_ALPINE || COMPILE_TEST)
> + help
> + Include support for the SoC POS EDAC error capability.
> +
> config EDAC_AMD76X
> tristate "AMD 76x (760, 762, 768)"
> depends on PCI && X86_32
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index 165ca65e..3571936 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -22,6 +22,7 @@ obj-$(CONFIG_EDAC_GHES) += ghes_edac.o
> edac_mce_amd-y := mce_amd.o
> obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
>
> +obj-$(CONFIG_EDAC_AL_POS) += al_pos_edac.o
> obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
> obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o
> obj-$(CONFIG_EDAC_I5000) += i5000_edac.o
> diff --git a/drivers/edac/al_pos_edac.c b/drivers/edac/al_pos_edac.c
> new file mode 100644
> index 00000000..bd6cd87
> --- /dev/null
> +++ b/drivers/edac/al_pos_edac.c
> @@ -0,0 +1,173 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> + */
> +#include <linux/bitfield.h>
> +#include <linux/edac.h>
> +#include <linux/of_irq.h>
> +#include "edac_module.h"
> +
> +#define DRV_NAME "al_pos_edac"
> +#define AL_POS_EDAC_MSG_MAX 256
> +
> +/* Registers Offset */
> +#define AL_POS_ERROR_LOG_1 0x0
> +#define AL_POS_ERROR_LOG_0 0x4
> +
> +/* Registers Fields */
> +#define AL_POS_ERROR_LOG_1_VALID BIT(31)
> +#define AL_POS_ERROR_LOG_1_BRESP GENMASK(18, 17)
> +#define AL_POS_ERROR_LOG_1_REQUEST_ID GENMASK(16, 8)
> +#define AL_POS_ERROR_LOG_1_ADDR_HIGH GENMASK(7, 0)
> +
> +#define AL_POS_ERROR_LOG_0_ADDR_LOW GENMASK(31, 0)
> +
> +struct al_pos_edac {
> + struct edac_device_ctl_info *edac_dev;
> + void __iomem *mmio_base;
> + int irq;
> +};
> +
> +static int al_pos_handle(struct al_pos_edac *al_pos)
> +{
> + u32 log0, log1;
> + u64 addr;
> + u16 request_id;
> + u8 bresp;
> + char msg[AL_POS_EDAC_MSG_MAX];
> +
> + log1 = readl(al_pos->mmio_base + AL_POS_ERROR_LOG_1);

I already commented on the misuse of strict accesses. Unless you can
explain and document *why* you need the extra ordering, please use
relaxed accesses (readl_relaxed()/writel_relaxed()).

> + if (!FIELD_GET(AL_POS_ERROR_LOG_1_VALID, log1))
> + return 0;
> +
> + log0 = readl(al_pos->mmio_base + AL_POS_ERROR_LOG_0);
> + writel(0, al_pos->mmio_base + AL_POS_ERROR_LOG_1);
> +
> + addr = FIELD_GET(AL_POS_ERROR_LOG_0_ADDR_LOW, log0);
> + addr |= (((u64)FIELD_GET(AL_POS_ERROR_LOG_1_ADDR_HIGH, log1)) << 32);
> + request_id = FIELD_GET(AL_POS_ERROR_LOG_1_REQUEST_ID, log1);
> + bresp = FIELD_GET(AL_POS_ERROR_LOG_1_BRESP, log1);
> +
> + snprintf(msg, sizeof(msg),
> + "addr=0x%llx request_id=0x%x bresp=0x%x\n",
> + addr, request_id, bresp);
> +
> + edac_device_handle_ue(al_pos->edac_dev, 0, 0, msg);
> +
> + return 1;
> +}
> +
> +static void al_pos_edac_check(struct edac_device_ctl_info *edac_dev)
> +{
> + struct al_pos_edac *al_pos = edac_dev->pvt_info;
> +
> + al_pos_handle(al_pos);
> +}
> +
> +static irqreturn_t al_pos_irq_handler(int irq, void *info)
> +{
> + struct platform_device *pdev = info;
> + struct al_pos_edac *al_pos = platform_get_drvdata(pdev);
> +
> + if (al_pos_handle(al_pos))
> + return IRQ_HANDLED;
> + return IRQ_NONE;
> +}
> +
> +static int al_pos_probe(struct platform_device *pdev)
> +{
> + struct edac_device_ctl_info *edac_dev;
> + struct al_pos_edac *al_pos;
> + int ret;
> +
> + edac_dev = edac_device_alloc_ctl_info(sizeof(*al_pos), DRV_NAME, 1,
> + DRV_NAME, 1, 0, NULL, 0,
> + edac_device_alloc_index());
> + if (!edac_dev)
> + return -ENOMEM;
> +
> + al_pos = edac_dev->pvt_info;
> + al_pos->edac_dev = edac_dev;
> + platform_set_drvdata(pdev, al_pos);
> +
> + al_pos->mmio_base = devm_platform_ioremap_resource(pdev, 0);
> + if (IS_ERR(al_pos->mmio_base)) {
> + dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n",
> + PTR_ERR(al_pos->mmio_base));
> + return PTR_ERR(al_pos->mmio_base);
> + }
> +
> + al_pos->irq = platform_get_irq(pdev, 0);
> + if (al_pos->irq <= 0)
> + edac_dev->edac_check = al_pos_edac_check;
> +
> + edac_dev->dev = &pdev->dev;
> + edac_dev->mod_name = DRV_NAME;
> + edac_dev->dev_name = dev_name(&pdev->dev);
> + edac_dev->ctl_name = "POS";
> +
> + ret = edac_device_add_device(edac_dev);
> + if (ret) {
> + dev_err(&pdev->dev, "Failed to add edac device\n");
> + goto err_free_edac;
> + }
> +
> + if (al_pos->irq > 0) {
> + ret = devm_request_irq(&pdev->dev,
> + al_pos->irq,
> + al_pos_irq_handler,
> + 0,
> + pdev->name,
> + pdev);
> + if (ret != 0) {
> + dev_err(&pdev->dev,
> + "failed to register to irq %d (%d)\n",
> + al_pos->irq, ret);
> + goto err_remove_edac;

Would it be worth continuing without interrupts? After all, the
interrupt seems to be an optional part of the device...

Thanks,

M.

--
Jazz is not dead, it just smells funny.