Hi Talel,Unfortunately, we don't have public documentation for it.
On 15/09/2019 07:43, Talel Shenhar wrote:
The Amazon's Annapurna Labs Memory Controller EDAC supports ECC capabilityIs there any documentation for this memory controller?
for error detection and correction (Single bit error correction, Double
detection). This driver introduces EDAC driver for that capability.
diff --git a/drivers/edac/al_mc_edac.c b/drivers/edac/al_mc_edac.c#include <linux/bitops.h> for hweight_long()
new file mode 100644
index 0000000..f9763d4
--- /dev/null
+++ b/drivers/edac/al_mc_edac.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+#include <linux/bitfield.h>
shall be part of v3.
+#include <linux/edac.h>#include <linux/platform_device.h> for platform_get_resource()
+#include <linux/of_irq.h>
+#include "edac_module.h"Is this a fixed property of the memory controller, or is it a limit imposed from somewhere
+/* Registers Values */
+#define AL_MC_MSTR_DEV_CFG_X4 0
+#define AL_MC_MSTR_DEV_CFG_X8 1
+#define AL_MC_MSTR_DEV_CFG_X16 2
+#define AL_MC_MSTR_DEV_CFG_X32 3
+#define AL_MC_MSTR_RANKS_MAX 4
else. (Does it need to come from the DT?)
Shall be part of v3
+#define AL_MC_MSTR_DATA_BUS_WIDTH_X64 0(Some of these could go on the same line, same with UE below)
+
+#define DRV_NAME "al_mc_edac"
+#define AL_MC_EDAC_MSG_MAX 256
+#define AL_MC_EDAC_MSG(message, buffer_size, type, \
+ rank, row, bg, bank, column, syn0, syn1, syn2) \
+ snprintf(message, buffer_size, \
+ "%s rank=0x%x row=0x%x bg=0x%x bank=0x%x col=0x%x " \
+ "syn0: 0x%x syn1: 0x%x syn2: 0x%x", \
+ type == HW_EVENT_ERR_UNCORRECTED ? "UE" : "CE", \
+ rank, row, bg, bank, column, syn0, syn1, syn2)
+
+struct al_mc_edac {
+ void __iomem *mmio_base;
+ int irq_ce;
+ int irq_ue;
+};
+
+static int al_mc_edac_handle_ce(struct mem_ctl_info *mci)
+{
+ struct al_mc_edac *al_mc = mci->pvt_info;
+ u32 eccerrcnt;
+ u16 ce_count;
+ u32 ecccaddr0;
+ u32 ecccaddr1;
+ u32 ecccsyn0;
+ u32 ecccsyn1;
+ u32 ecccsyn2;
+ u8 rank;
+ u32 row;
+ u8 bg;
+ u8 bank;
+ u16 column;
+ char msg[AL_MC_EDAC_MSG_MAX];
+
+ edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+ ce_count, 0, 0, 0, 0, 0, -1, mci->ctl_name, msg);
You used active_ranks as the layer size in al_mc_edac_probe(). Can't you supply the rank here?
(If its not useful, why is it setup like this in al_mc_edac_probe()?)
+ u8 bank;
+ u16 column;
+ char msg[AL_MC_EDAC_MSG_MAX];
+
+ eccerrcnt = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_ERR_COUNT);
+ ue_count = FIELD_GET(AL_MC_ECC_ERR_COUNT_UE, eccerrcnt);
+ if (!ue_count)
+ return 0;
+
+ eccuaddr0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_ADDR0);
+ eccuaddr1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_ADDR1);
+ eccusyn0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND0);
+ eccusyn1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND1);
+ eccusyn2 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND2);
+
+ writel(AL_MC_ECC_CLEAR_UE_COUNT | AL_MC_ECC_CLEAR_UE_ERR,
+ al_mc->mmio_base + AL_MC_ECC_CLEAR);
+
+ dev_dbg(mci->pdev, "eccuaddr0=0x%08x eccuaddr1=0x%08x\n",
+ eccuaddr0, eccuaddr1);
+
+ rank = FIELD_GET(AL_MC_ECC_UE_ADDR0_RANK, eccuaddr0);
+ row = FIELD_GET(AL_MC_ECC_UE_ADDR0_ROW, eccuaddr0);
+
+ bg = FIELD_GET(AL_MC_ECC_UE_ADDR1_BG, eccuaddr1);
+ bank = FIELD_GET(AL_MC_ECC_UE_ADDR1_BANK, eccuaddr1);
+ column = FIELD_GET(AL_MC_ECC_UE_ADDR1_COLUMN, eccuaddr1);
+
+ AL_MC_EDAC_MSG(msg, sizeof(msg), HW_EVENT_ERR_UNCORRECTED,
+ rank, row, bg, bank, column,
+ eccusyn0, eccusyn1, eccusyn2);
+
+ edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+ ue_count, 0, 0, 0, 0, 0, -1, mci->ctl_name, msg);
What happens when this code runs at the same time as the corrected error handler calling
edac_mc_handler_error() with this same mci?
This could happen on a second CPU, or on one cpu if the corrected handler is polled.
edac_mc_handle_error() memset's the edac_raw_error_desc in mci, so it can't be called in
parallel, or twice on the same cpu.
I think you need an irqsave spinlock around the calls to edac_mc_handle_error().
ack, shall add to v3
+As you don't use ue_count, wouldn't this be clearer:
+static irqreturn_t al_mc_edac_irq_handler_ue(int irq, void *info)
+{
+ struct platform_device *pdev = info;
+ struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+ int ue_count;
+
+ ue_count = al_mc_edac_handle_ue(mci);
+ if (ue_count)
+ return IRQ_HANDLED;
+ else
+ return IRQ_NONE;
+}
| if (al_mc_edac_handle_ue(mci))
| return IRQ_HANDLED;
| return IRQ_NONE;
?
ack, shall add to v3
+static int al_mc_edac_probe(struct platform_device *pdev)platform_get_resource() can fail, returning NULL.
+{
+ struct resource *resource;
+ void __iomem *mmio_base;
+ unsigned int active_ranks;
+ struct edac_mc_layer layers[1];
+ struct mem_ctl_info *mci;
+ struct al_mc_edac *al_mc;
+ int ret;
+
+ resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
ack, shall add to v3
+
+ if (al_mc->irq_ue > 0) {
+ ret = devm_request_irq(&pdev->dev,
+ al_mc->irq_ue,
+ al_mc_edac_irq_handler_ue,
+ 0,As you know when your device has triggered the interrupt from the error counter, could
these be IRQF_SHARED?
ack, shall add to v3
+static int al_mc_edac_remove(struct platform_device *pdev)What stops your interrupt firing here? You've free'd the memory it uses.
+{
+ struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+
+ edac_mc_del_mc(&pdev->dev);
+ edac_mc_free(mci);
I think you need to devm_free_irq() the interrupts before you free the memory.
+MODULE_LICENSE("GPL v2");(Kconfig says this is 'bool', so it can't be built as a module, having these is a bit odd)
+MODULE_AUTHOR("Talel Shenhar");
+MODULE_DESCRIPTION("Amazon's Annapurna Lab's Memory Controller EDAC Driver");
Thanks,
James