[PATCH 1/2] habanalabs: add debugfs node for configuring CS timeout

From: Oded Gabbay
Date: Sun Sep 12 2021 - 01:29:24 EST


From: Ofir Bitton <obitton@xxxxxxxxx>

Command submission timeout is currently determined during driver
loading time. As some environments requires this timeout to be
modified in runtime, we introduce a new debugfs node that controls
the timeout value without the need to reload the driver.

Signed-off-by: Ofir Bitton <obitton@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
.../ABI/testing/debugfs-driver-habanalabs | 6 +++
drivers/misc/habanalabs/common/debugfs.c | 51 +++++++++++++++++++
2 files changed, 57 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 284e2dfa61cd..63c46d9d538f 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -226,6 +226,12 @@ Description: Gets the state dump occurring on a CS timeout or failure.
Writing an integer X discards X state dumps, so that the
next read would return X+1-st newest state dump.

+What: /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
+Date: Sep 2021
+KernelVersion: 5.16
+Contact: obitton@xxxxxxxxx
+Description: Sets the command submission timeout value in seconds.
+
What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
Date: Mar 2020
KernelVersion: 5.6
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 985f1f3dbd20..1f2a3dc6c4e2 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1167,6 +1167,45 @@ static ssize_t hl_state_dump_write(struct file *f, const char __user *buf,
return count;
}

+static ssize_t hl_timeout_locked_read(struct file *f, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+ char tmp_buf[200];
+ ssize_t rc;
+
+ if (*ppos)
+ return 0;
+
+ sprintf(tmp_buf, "%d\n",
+ jiffies_to_msecs(hdev->timeout_jiffies) / 1000);
+ rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
+ strlen(tmp_buf) + 1);
+
+ return rc;
+}
+
+static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+ u32 value;
+ ssize_t rc;
+
+ rc = kstrtouint_from_user(buf, count, 10, &value);
+ if (rc)
+ return rc;
+
+ if (value)
+ hdev->timeout_jiffies = msecs_to_jiffies(value * 1000);
+ else
+ hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
+ return count;
+}
+
static const struct file_operations hl_data32b_fops = {
.owner = THIS_MODULE,
.read = hl_data_read32,
@@ -1240,6 +1279,12 @@ static const struct file_operations hl_state_dump_fops = {
.write = hl_state_dump_write
};

+static const struct file_operations hl_timeout_locked_fops = {
+ .owner = THIS_MODULE,
+ .read = hl_timeout_locked_read,
+ .write = hl_timeout_locked_write
+};
+
static const struct hl_info_list hl_debugfs_list[] = {
{"command_buffers", command_buffers_show, NULL},
{"command_submission", command_submission_show, NULL},
@@ -1421,6 +1466,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry,
&hl_state_dump_fops);

+ debugfs_create_file("timeout_locked",
+ 0644,
+ dev_entry->root,
+ dev_entry,
+ &hl_timeout_locked_fops);
+
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
--
2.17.1