[PATCH V4 09/10] accel/amdxdna: Add error handling

From: Lizhi Hou
Date: Fri Oct 11 2024 - 19:14:31 EST


When there is a hardware error, the NPU firmware notifies the host through
a mailbox message. The message includes details of the error, such as the
tile and column indexes where the error occurred.

The driver starts a thread to handle the NPU error message. The thread
stops the clients which are using the column where error occurred. Then
the driver resets that column.

Co-developed-by: Min Ma <min.ma@xxxxxxx>
Signed-off-by: Min Ma <min.ma@xxxxxxx>
Signed-off-by: Lizhi Hou <lizhi.hou@xxxxxxx>
---
drivers/accel/amdxdna/Makefile | 1 +
drivers/accel/amdxdna/aie2_error.c | 356 +++++++++++++++++++++++++++
drivers/accel/amdxdna/aie2_message.c | 19 ++
drivers/accel/amdxdna/aie2_pci.c | 32 +++
drivers/accel/amdxdna/aie2_pci.h | 9 +
5 files changed, 417 insertions(+)
create mode 100644 drivers/accel/amdxdna/aie2_error.c

diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
index a688c378761f..ed6f87910880 100644
--- a/drivers/accel/amdxdna/Makefile
+++ b/drivers/accel/amdxdna/Makefile
@@ -2,6 +2,7 @@

amdxdna-y := \
aie2_ctx.o \
+ aie2_error.o \
aie2_message.o \
aie2_pci.o \
aie2_psp.o \
diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
new file mode 100644
index 000000000000..d2787549f3b7
--- /dev/null
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/drm_cache.h>
+#include <drm/drm_device.h>
+#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/dma-mapping.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+
+#include "aie2_msg_priv.h"
+#include "aie2_pci.h"
+#include "amdxdna_mailbox.h"
+#include "amdxdna_pci_drv.h"
+
+struct async_event {
+ struct amdxdna_dev_hdl *ndev;
+ struct async_event_msg_resp resp;
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ u8 *buf;
+ dma_addr_t addr;
+ u32 size;
+};
+
+struct async_events {
+ struct workqueue_struct *wq;
+ u8 *buf;
+ dma_addr_t addr;
+ u32 size;
+ u32 event_cnt;
+ struct async_event event[] __counted_by(event_cnt);
+};
+
+/*
+ * Below enum, struct and lookup tables are porting from XAIE util header file.
+ *
+ * Below data is defined by AIE device and it is used for decode error message
+ * from the device.
+ */
+
+enum aie_module_type {
+ AIE_MEM_MOD = 0,
+ AIE_CORE_MOD,
+ AIE_PL_MOD,
+};
+
+enum aie_error_category {
+ AIE_ERROR_SATURATION = 0,
+ AIE_ERROR_FP,
+ AIE_ERROR_STREAM,
+ AIE_ERROR_ACCESS,
+ AIE_ERROR_BUS,
+ AIE_ERROR_INSTRUCTION,
+ AIE_ERROR_ECC,
+ AIE_ERROR_LOCK,
+ AIE_ERROR_DMA,
+ AIE_ERROR_MEM_PARITY,
+ /* Unknown is not from XAIE, added for better category */
+ AIE_ERROR_UNKNOWN,
+};
+
+/* Don't pack, unless XAIE side changed */
+struct aie_error {
+ u8 row;
+ u8 col;
+ u32 mod_type;
+ u8 event_id;
+};
+
+struct aie_err_info {
+ u32 err_cnt;
+ u32 ret_code;
+ u32 rsvd;
+ struct aie_error payload[] __counted_by(err_cnt);
+};
+
+struct aie_event_category {
+ u8 event_id;
+ enum aie_error_category category;
+};
+
+#define EVENT_CATEGORY(id, cat) { id, cat }
+static const struct aie_event_category aie_ml_mem_event_cat[] = {
+ EVENT_CATEGORY(88U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(90U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(97U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(98U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(99U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(100U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_core_event_cat[] = {
+ EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(58U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
+ EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(62U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(64U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
+ EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
+ EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(72U, AIE_ERROR_BUS),
+};
+
+static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
+ EVENT_CATEGORY(130U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(132U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(133U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(134U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(138U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
+ EVENT_CATEGORY(64U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(67U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(68U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(69U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(70U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(71U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(72U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(73U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
+};
+
+static enum aie_error_category
+aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
+{
+ const struct aie_event_category *lut;
+ int num_entry;
+ int i;
+
+ switch (mod_type) {
+ case AIE_PL_MOD:
+ lut = aie_ml_shim_tile_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
+ break;
+ case AIE_CORE_MOD:
+ lut = aie_ml_core_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
+ break;
+ case AIE_MEM_MOD:
+ if (row == 1) {
+ lut = aie_ml_mem_tile_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
+ } else {
+ lut = aie_ml_mem_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
+ }
+ break;
+ default:
+ return AIE_ERROR_UNKNOWN;
+ }
+
+ for (i = 0; i < num_entry; i++) {
+ if (event_id != lut[i].event_id)
+ continue;
+
+ return lut[i].category;
+ }
+
+ return AIE_ERROR_UNKNOWN;
+}
+
+static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
+{
+ struct aie_error *errs = err_info;
+ u32 err_col = 0; /* assume that AIE has less than 32 columns */
+ int i;
+
+ /* Get err column bitmap */
+ for (i = 0; i < num_err; i++) {
+ struct aie_error *err = &errs[i];
+ enum aie_error_category cat;
+
+ cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
+ XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
+ err->row, err->col, err->mod_type,
+ err->event_id, cat);
+
+ if (err->col >= 32) {
+ XDNA_WARN(ndev->xdna, "Invalid column number");
+ break;
+ }
+
+ err_col |= (1 << err->col);
+ }
+
+ return err_col;
+}
+
+static int aie2_error_async_cb(void *handle, const u32 *data, size_t size)
+{
+ struct async_event_msg_resp *resp;
+ struct async_event *e = handle;
+
+ if (data) {
+ resp = (struct async_event_msg_resp *)data;
+ e->resp.type = resp->type;
+ wmb(); /* Update status in the end, so that no lock for here */
+ e->resp.status = resp->status;
+ }
+ queue_work(e->wq, &e->work);
+ return 0;
+}
+
+static int aie2_error_event_send(struct async_event *e)
+{
+ drm_clflush_virt_range(e->buf, e->size); /* device can access */
+ return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
+ aie2_error_async_cb);
+}
+
+static void aie2_error_worker(struct work_struct *err_work)
+{
+ struct aie_err_info *info;
+ struct amdxdna_dev *xdna;
+ struct async_event *e;
+ u32 max_err;
+ u32 err_col;
+
+ e = container_of(err_work, struct async_event, work);
+
+ xdna = e->ndev->xdna;
+
+ if (e->resp.status == MAX_AIE2_STATUS_CODE)
+ return;
+
+ e->resp.status = MAX_AIE2_STATUS_CODE;
+
+ print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
+ e->buf, 0x100, false);
+
+ info = (struct aie_err_info *)e->buf;
+ XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
+
+ max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
+ if (unlikely(info->err_cnt > max_err)) {
+ WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
+ return;
+ }
+ err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
+ if (!err_col) {
+ XDNA_WARN(xdna, "Did not get error column");
+ return;
+ }
+
+ mutex_lock(&xdna->dev_lock);
+ /* Re-sent this event to firmware */
+ if (aie2_error_event_send(e))
+ XDNA_WARN(xdna, "Unable to register async event");
+ mutex_unlock(&xdna->dev_lock);
+}
+
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct async_event *e;
+ int i, ret;
+
+ drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+ for (i = 0; i < ndev->async_events->event_cnt; i++) {
+ e = &ndev->async_events->event[i];
+ ret = aie2_error_event_send(e);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct async_events *events;
+
+ events = ndev->async_events;
+ destroy_workqueue(events->wq);
+ dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+ events->addr, DMA_FROM_DEVICE);
+ kfree(events);
+}
+
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ u32 total_col = ndev->total_col;
+ u32 total_size = ASYNC_BUF_SIZE * total_col;
+ struct async_events *events;
+ int i, ret;
+
+ events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
+ if (!events)
+ return -ENOMEM;
+
+ events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
+ DMA_FROM_DEVICE, GFP_KERNEL);
+ if (!events->buf) {
+ ret = -ENOMEM;
+ goto free_events;
+ }
+ events->size = total_size;
+ events->event_cnt = total_col;
+
+ events->wq = alloc_ordered_workqueue("async_wq", 0);
+ if (!events->wq) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ for (i = 0; i < events->event_cnt; i++) {
+ struct async_event *e = &events->event[i];
+ u32 offset = i * ASYNC_BUF_SIZE;
+
+ e->ndev = ndev;
+ e->wq = events->wq;
+ e->buf = &events->buf[offset];
+ e->addr = events->addr + offset;
+ e->size = ASYNC_BUF_SIZE;
+ e->resp.status = MAX_AIE2_STATUS_CODE;
+ INIT_WORK(&e->work, aie2_error_worker);
+ }
+
+ ndev->async_events = events;
+
+ XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
+ events->event_cnt, events->size);
+ return 0;
+
+free_buf:
+ dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+ events->addr, DMA_FROM_DEVICE);
+free_events:
+ kfree(events);
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 3dc4a9a8571e..6e65b611b691 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -307,6 +307,25 @@ int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u6
return 0;
}

+int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
+ void *handle, int (*cb)(void*, const u32 *, size_t))
+{
+ struct async_event_msg_req req = { 0 };
+ struct xdna_mailbox_msg msg = {
+ .send_data = (u8 *)&req,
+ .send_size = sizeof(req),
+ .handle = handle,
+ .opcode = MSG_OP_REGISTER_ASYNC_EVENT_MSG,
+ .notify_cb = cb,
+ };
+
+ req.buf_addr = addr;
+ req.buf_size = size;
+
+ XDNA_DBG(ndev->xdna, "Register addr 0x%llx size 0x%x", addr, size);
+ return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT);
+}
+
int aie2_config_cu(struct amdxdna_hwctx *hwctx)
{
struct mailbox_channel *chann = hwctx->priv->mbox_chann;
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 7a1729bed62d..ff872f2389f7 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -180,6 +180,15 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev)
return ret;
}

+ if (!ndev->async_events)
+ return 0;
+
+ ret = aie2_error_async_events_send(ndev);
+ if (ret) {
+ XDNA_ERR(ndev->xdna, "Send async events failed");
+ return ret;
+ }
+
return 0;
}

@@ -472,9 +481,30 @@ static int aie2_init(struct amdxdna_dev *xdna)
goto stop_hw;
}

+ ret = aie2_error_async_events_alloc(ndev);
+ if (ret) {
+ XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret);
+ goto stop_hw;
+ }
+
+ ret = aie2_error_async_events_send(ndev);
+ if (ret) {
+ XDNA_ERR(xdna, "Send async events failed, ret %d", ret);
+ goto async_event_free;
+ }
+
+ /* Issue a command to make sure firmware handled async events */
+ ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver);
+ if (ret) {
+ XDNA_ERR(xdna, "Re-query firmware version failed");
+ goto async_event_free;
+ }
+
release_firmware(fw);
return 0;

+async_event_free:
+ aie2_error_async_events_free(ndev);
stop_hw:
aie2_hw_stop(xdna);
disable_sva:
@@ -490,8 +520,10 @@ static int aie2_init(struct amdxdna_dev *xdna)
static void aie2_fini(struct amdxdna_dev *xdna)
{
struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev);
+ struct amdxdna_dev_hdl *ndev = xdna->dev_handle;

aie2_hw_stop(xdna);
+ aie2_error_async_events_free(ndev);
iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
pci_free_irq_vectors(pdev);
}
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index bbcc3d85e13c..9634a7588650 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -164,6 +164,7 @@ struct amdxdna_dev_hdl {
/* Mailbox and the management channel */
struct mailbox *mbox;
struct mailbox_channel *mgmt_chann;
+ struct async_events *async_events;
};

#define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
@@ -204,6 +205,12 @@ struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct psp_config *
int aie2_psp_start(struct psp_device *psp);
void aie2_psp_stop(struct psp_device *psp);

+/* aie2_error.c */
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
+int aie2_error_async_msg_thread(void *data);
+
/* aie2_message.c */
int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
@@ -218,6 +225,8 @@ int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
+int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
+ void *handle, int (*cb)(void*, const u32 *, size_t));
int aie2_config_cu(struct amdxdna_hwctx *hwctx);
int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
int (*notify_cb)(void *, const u32 *, size_t));
--
2.34.1