[PATCH 02/17] habanalabs: expose undefined opcode status via info ioctl

From: Oded Gabbay
Date: Mon Jun 20 2022 - 09:16:46 EST


From: Tal Cohen <talcohen@xxxxxxxxx>

The info ioctl retrieves information on the last undefined opcode
occurred.

Signed-off-by: Tal Cohen <talcohen@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
.../misc/habanalabs/common/habanalabs_ioctl.c | 25 ++++++++++++++++
include/uapi/misc/habanalabs.h | 30 +++++++++++++++++++
2 files changed, 55 insertions(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index c7864d6bb0a1..fe7ed46cd1c5 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -610,6 +610,28 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
}

+static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ u32 max_size = args->return_size;
+ struct hl_info_undefined_opcode_event info = {0};
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ info.timestamp = ktime_to_ns(hdev->last_error.undef_opcode.timestamp);
+ info.engine_id = hdev->last_error.undef_opcode.engine_id;
+ info.cq_addr = hdev->last_error.undef_opcode.cq_addr;
+ info.cq_size = hdev->last_error.undef_opcode.cq_size;
+ info.stream_id = hdev->last_error.undef_opcode.stream_id;
+ info.cb_addr_streams_len = hdev->last_error.undef_opcode.cb_addr_streams_len;
+ memcpy(info.cb_addr_streams, hdev->last_error.undef_opcode.cb_addr_streams,
+ sizeof(info.cb_addr_streams));
+
+ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@@ -718,6 +740,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_RAZWI_EVENT:
return razwi_info(hpriv, args);

+ case HL_INFO_UNDEFINED_OPCODE_EVENT:
+ return undefined_opcode_info(hpriv, args);
+
case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
return dev_mem_alloc_page_sizes_info(hpriv, args);

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index c94b89cf1ec1..5f9a6097f5f3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -352,6 +352,7 @@ enum hl_server_type {
* HL_INFO_REGISTER_EVENTFD - Register eventfd for event notifications.
* HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
* HL_INFO_GET_EVENTS - Retrieve the last occurred events
+ * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -380,6 +381,7 @@ enum hl_server_type {
#define HL_INFO_REGISTER_EVENTFD 28
#define HL_INFO_UNREGISTER_EVENTFD 29
#define HL_INFO_GET_EVENTS 30
+#define HL_INFO_UNDEFINED_OPCODE_EVENT 31

#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -656,6 +658,34 @@ struct hl_info_razwi_event {
__u8 pad[2];
};

+#define MAX_QMAN_STREAMS_INFO 4
+#define OPCODE_INFO_MAX_ADDR_SIZE 8
+/**
+ * struct hl_info_undefined_opcode_event - info about last undefined opcode error
+ * @timestamp: timestamp of the undefined opcode error
+ * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ
+ * entiers. In case all streams array entries are
+ * filled with values, it means the execution was in Lower-CP.
+ * @cq_addr: the address of the current handled command buffer
+ * @cq_size: the size of the current handled command buffer
+ * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
+ * should be equal to 1 incase of undefined opcode
+ * in Upper-CP (specific stream) and equal to 4 incase
+ * of undefined opcode in Lower-CP.
+ * @engine_id: engine-id that the error occurred on
+ * @stream_id: the stream id the error occurred on. In case the stream equals to
+ * MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP.
+ */
+struct hl_info_undefined_opcode_event {
+ __s64 timestamp;
+ __u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE];
+ __u64 cq_addr;
+ __u32 cq_size;
+ __u32 cb_addr_streams_len;
+ __u32 engine_id;
+ __u32 stream_id;
+};
+
/**
* struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
* @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
--
2.25.1