[PATCH 02/10] habanalabs: handle abort scenario for user interrupt

From: Oded Gabbay
Date: Sun Nov 07 2021 - 15:51:53 EST


From: Bharat Jauhari <bjauhari@xxxxxxxxx>

In case of device reset, the driver does a force trigger on all waiting
users to release them from waiting. However, the driver does not handle
error scenario while waiting.

hl_interrupt_wait_ioctl() now exits the wait in case of an error with
abort status.

Signed-off-by: Bharat Jauhari <bjauhari@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
.../habanalabs/common/command_submission.c | 34 +++++++++----------
1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 4c8000fd246c..41b48929cd59 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2768,7 +2768,7 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
u32 timeout_us, u64 user_address,
u64 target_value, u16 interrupt_offset,
- enum hl_cs_wait_status *status,
+ u32 *status,
u64 *timestamp)
{
struct hl_user_pending_interrupt *pend;
@@ -2815,13 +2815,14 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
}

if (completion_value >= target_value) {
- *status = CS_WAIT_STATUS_COMPLETED;
+ *status = HL_WAIT_CS_STATUS_COMPLETED;
/* There was no interrupt, we assume the completion is now. */
pend->fence.timestamp = ktime_get();
- } else
- *status = CS_WAIT_STATUS_BUSY;
+ } else {
+ *status = HL_WAIT_CS_STATUS_BUSY;
+ }

- if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED))
+ if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
goto remove_pending_user_interrupt;

wait_again:
@@ -2850,7 +2851,13 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
}

if (completion_value >= target_value) {
- *status = CS_WAIT_STATUS_COMPLETED;
+ *status = HL_WAIT_CS_STATUS_COMPLETED;
+ } else if (pend->fence.error) {
+ dev_err_ratelimited(hdev->dev,
+ "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
+ pend->fence.error);
+ /* set the command completion status as ABORTED */
+ *status = HL_WAIT_CS_STATUS_ABORTED;
} else {
timeout = completion_rc;
goto wait_again;
@@ -2861,7 +2868,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
interrupt->interrupt_id);
rc = -EINTR;
} else {
- *status = CS_WAIT_STATUS_BUSY;
+ *status = HL_WAIT_CS_STATUS_BUSY;
}

remove_pending_user_interrupt:
@@ -2883,7 +2890,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
struct hl_device *hdev = hpriv->hdev;
struct asic_fixed_properties *prop;
union hl_wait_cs_args *args = data;
- enum hl_cs_wait_status status;
+ u32 status = HL_WAIT_CS_STATUS_BUSY;
u64 timestamp;
int rc;

@@ -2926,22 +2933,13 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
}

memset(args, 0, sizeof(*args));
+ args->out.status = status;

if (timestamp) {
args->out.timestamp_nsec = timestamp;
args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
}

- switch (status) {
- case CS_WAIT_STATUS_COMPLETED:
- args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
- break;
- case CS_WAIT_STATUS_BUSY:
- default:
- args->out.status = HL_WAIT_CS_STATUS_BUSY;
- break;
- }
-
return 0;
}

--
2.25.1