[PATCH v2 2/3] habanalabs: Add debugfs node for engines status

From: Tomer Tayar
Date: Mon Jul 01 2019 - 09:59:54 EST


Command submissions sent to the device are composed of command buffers
which are targeted to different device engines, like DMA and compute
entities. When a command submission gets stuck, knowing which engine it
is stuck in is crucial for debugging.
This patch adds a debugfs node that exports this information by
displaying the various engine registers that make up their idle/busy
status.
The information retrieval is based on the is_device_idle ASIC function.
The printout of the first detected busy engine in this function is
removed because it becomes redundant given the more detailed
information provided by the new debugfs node.

Signed-off-by: Tomer Tayar <ttayar@xxxxxxxxx>
---
Changes in v2:
- Move the idle check updates into a separate patch.
- Update Documentation/.../debugfs-driver-habanalabs with the new
debugfs node.

.../ABI/testing/debugfs-driver-habanalabs | 7 +
drivers/misc/habanalabs/debugfs.c | 12 ++
drivers/misc/habanalabs/goya/goya.c | 129 ++++++++++++------
drivers/misc/habanalabs/habanalabs.h | 8 +-
drivers/misc/habanalabs/habanalabs_ioctl.c | 2 +-
5 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 18191c2becab..f0ac14b70ecb 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -51,6 +51,13 @@ Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values

+What: /sys/kernel/debug/habanalabs/hl<n>/engines
+Date: Jul 2019
+KernelVersion: 5.3
+Contact: oded.gabbay@xxxxxxxxx
+Description: Displays the status register values of the device engines and
+ their derived idle status
+
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
Date: Jan 2019
KernelVersion: 5.1
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 17974919b760..6a5dfb14eca1 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -500,6 +500,17 @@ static ssize_t mmu_write(struct file *file, const char __user *buf,
return -EINVAL;
}

+static int engines_show(struct seq_file *s, void *data)
+{
+ struct hl_debugfs_entry *entry = s->private;
+ struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+ struct hl_device *hdev = dev_entry->hdev;
+
+ hdev->asic_funcs->is_device_idle(hdev, s);
+
+ return 0;
+}
+
static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -893,6 +904,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
{"userptr", userptr_show, NULL},
{"vm", vm_show, NULL},
{"mmu", mmu_show, mmu_write},
+ {"engines", engines_show, NULL}
};

static int hl_debugfs_open(struct inode *inode, struct file *file)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 8653aa914724..41e97531f300 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -15,6 +15,7 @@
#include <linux/hwmon.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/iommu.h>
+#include <linux/seq_file.h>

/*
* GOYA security scheme:
@@ -90,6 +91,30 @@
#define GOYA_CB_POOL_CB_CNT 512
#define GOYA_CB_POOL_CB_SIZE 0x20000 /* 128KB */

+#define IS_QM_IDLE(engine, qm_glbl_sts0) \
+ (((qm_glbl_sts0) & engine##_QM_IDLE_MASK) == engine##_QM_IDLE_MASK)
+#define IS_DMA_QM_IDLE(qm_glbl_sts0) IS_QM_IDLE(DMA, qm_glbl_sts0)
+#define IS_TPC_QM_IDLE(qm_glbl_sts0) IS_QM_IDLE(TPC, qm_glbl_sts0)
+#define IS_MME_QM_IDLE(qm_glbl_sts0) IS_QM_IDLE(MME, qm_glbl_sts0)
+
+#define IS_CMDQ_IDLE(engine, cmdq_glbl_sts0) \
+ (((cmdq_glbl_sts0) & engine##_CMDQ_IDLE_MASK) == \
+ engine##_CMDQ_IDLE_MASK)
+#define IS_TPC_CMDQ_IDLE(cmdq_glbl_sts0) \
+ IS_CMDQ_IDLE(TPC, cmdq_glbl_sts0)
+#define IS_MME_CMDQ_IDLE(cmdq_glbl_sts0) \
+ IS_CMDQ_IDLE(MME, cmdq_glbl_sts0)
+
+#define IS_DMA_IDLE(dma_core_sts0) \
+ !((dma_core_sts0) & DMA_CH_0_STS0_DMA_BUSY_MASK)
+
+#define IS_TPC_IDLE(tpc_cfg_sts) \
+ (((tpc_cfg_sts) & TPC_CFG_IDLE_MASK) == TPC_CFG_IDLE_MASK)
+
+#define IS_MME_IDLE(mme_arch_sts) \
+ (((mme_arch_sts) & MME_ARCH_IDLE_MASK) == MME_ARCH_IDLE_MASK)
+
+
static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
"goya cq 0", "goya cq 1", "goya cq 2", "goya cq 3",
"goya cq 4", "goya cpu eq"
@@ -2796,7 +2821,6 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
dma_addr_t fence_dma_addr;
struct hl_cb *cb;
u32 tmp, timeout;
- char buf[16] = {};
int rc;

if (hdev->pldm)
@@ -2804,10 +2828,9 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
else
timeout = HL_DEVICE_TIMEOUT_USEC;

- if (!hdev->asic_funcs->is_device_idle(hdev, buf, sizeof(buf))) {
+ if (!hdev->asic_funcs->is_device_idle(hdev, NULL)) {
dev_err_ratelimited(hdev->dev,
- "Can't send KMD job on QMAN0 because %s is busy\n",
- buf);
+ "Can't send KMD job on QMAN0 because the device is not idle\n");
return -EBUSY;
}

@@ -4891,59 +4914,75 @@ int goya_armcp_info_get(struct hl_device *hdev)
return 0;
}

-static bool goya_is_device_idle(struct hl_device *hdev, char *buf, size_t size)
+static bool goya_is_device_idle(struct hl_device *hdev, struct seq_file *s)
{
- u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg,
- dma_core_sts;
+ const char *fmt = "%-5d%-9s%#-14x%#-16x%#x\n";
+ const char *dma_fmt = "%-5d%-9s%#-14x%#x\n";
+ u32 qm_glbl_sts0, cmdq_glbl_sts0, dma_core_sts0, tpc_cfg_sts,
+ mme_arch_sts;
+ bool is_idle = true, is_eng_idle;
+ u64 offset;
int i;

+ if (s)
+ seq_puts(s, "\nDMA is_idle QM_GLBL_STS0 DMA_CORE_STS0\n"
+ "--- ------- ------------ -------------\n");
+
offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;

for (i = 0 ; i < DMA_MAX_NUM ; i++) {
- dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
- dma_core_sts = mmDMA_CH_0_STS0 + i * offset;
+ qm_glbl_sts0 = RREG32(mmDMA_QM_0_GLBL_STS0 + i * offset);
+ dma_core_sts0 = RREG32(mmDMA_CH_0_STS0 + i * offset);
+ is_eng_idle = IS_DMA_QM_IDLE(qm_glbl_sts0) &&
+ IS_DMA_IDLE(dma_core_sts0);
+ is_idle &= is_eng_idle;

- if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) !=
- DMA_QM_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "DMA%d_QM", i);
-
- if (RREG32(dma_core_sts) & DMA_CH_0_STS0_DMA_BUSY_MASK)
- return HL_ENG_BUSY(buf, size, "DMA%d_CORE", i);
+ if (s)
+ seq_printf(s, dma_fmt, i, is_eng_idle ? "Y" : "N",
+ qm_glbl_sts0, dma_core_sts0);
}

+ if (s)
+ seq_puts(s,
+ "\nTPC is_idle QM_GLBL_STS0 CMDQ_GLBL_STS0 CFG_STATUS\n"
+ "--- ------- ------------ -------------- ----------\n");
+
offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;

for (i = 0 ; i < TPC_MAX_NUM ; i++) {
- tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
- tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
- tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
-
- if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) !=
- TPC_QM_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "TPC%d_QM", i);
-
- if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) !=
- TPC_CMDQ_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "TPC%d_CMDQ", i);
-
- if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) !=
- TPC_CFG_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "TPC%d_CFG", i);
- }
-
- if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) !=
- MME_QM_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "MME_QM");
-
- if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) !=
- MME_CMDQ_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "MME_CMDQ");
-
- if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) !=
- MME_ARCH_IDLE_MASK)
- return HL_ENG_BUSY(buf, size, "MME_ARCH");
-
- return true;
+ qm_glbl_sts0 = RREG32(mmTPC0_QM_GLBL_STS0 + i * offset);
+ cmdq_glbl_sts0 = RREG32(mmTPC0_CMDQ_GLBL_STS0 + i * offset);
+ tpc_cfg_sts = RREG32(mmTPC0_CFG_STATUS + i * offset);
+ is_eng_idle = IS_TPC_QM_IDLE(qm_glbl_sts0) &&
+ IS_TPC_CMDQ_IDLE(cmdq_glbl_sts0) &&
+ IS_TPC_IDLE(tpc_cfg_sts);
+ is_idle &= is_eng_idle;
+
+ if (s)
+ seq_printf(s, fmt, i, is_eng_idle ? "Y" : "N",
+ qm_glbl_sts0, cmdq_glbl_sts0, tpc_cfg_sts);
+ }
+
+ if (s)
+ seq_puts(s,
+ "\nMME is_idle QM_GLBL_STS0 CMDQ_GLBL_STS0 ARCH_STATUS\n"
+ "--- ------- ------------ -------------- -----------\n");
+
+ qm_glbl_sts0 = RREG32(mmMME_QM_GLBL_STS0);
+ cmdq_glbl_sts0 = RREG32(mmMME_CMDQ_GLBL_STS0);
+ mme_arch_sts = RREG32(mmMME_ARCH_STATUS);
+ is_eng_idle = IS_MME_QM_IDLE(qm_glbl_sts0) &&
+ IS_MME_CMDQ_IDLE(cmdq_glbl_sts0) &&
+ IS_MME_IDLE(mme_arch_sts);
+ is_idle &= is_eng_idle;
+
+ if (s) {
+ seq_printf(s, fmt, 0, is_eng_idle ? "Y" : "N", qm_glbl_sts0,
+ cmdq_glbl_sts0, mme_arch_sts);
+ seq_puts(s, "\n");
+ }
+
+ return is_idle;
}

static void goya_hw_queues_lock(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 5e4a631b3d88..2c9ea61099b4 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -557,7 +557,7 @@ struct hl_asic_funcs {
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
- bool (*is_device_idle)(struct hl_device *hdev, char *buf, size_t size);
+ bool (*is_device_idle)(struct hl_device *hdev, struct seq_file *s);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -1112,12 +1112,6 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
(cond) ? 0 : -ETIMEDOUT; \
})

-#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \
- if (buf) \
- snprintf(buf, size, fmt, ##__VA_ARGS__); \
- false; \
- })
-
struct hwmon_chip_info;

/**
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index c641c7eb6f7c..b04585af27ad 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -119,7 +119,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
if ((!max_size) || (!out))
return -EINVAL;

- hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, NULL, 0);
+ hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, NULL);

return copy_to_user(out, &hw_idle,
min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
--
2.17.1