[PATCH 10/15] habanalabs: use graceful hard reset for CS timeouts

From: Oded Gabbay
Date: Thu Oct 27 2022 - 05:11:34 EST


From: Tomer Tayar <ttayar@xxxxxxxxx>

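Use graceful hard reset when detecting a CS timeout that requires a
device reset.

As part of this change, the notifier event mask is accumulated in
cs_timedout() instead of being sent immediately. If a device reset is
required, HL_NOTIFIER_EVENT_DEVICE_RESET is added to the mask and the
mask is passed to hl_device_cond_reset(). Otherwise, a non-zero mask is
sent with hl_notifier_event_send_all() after the CS reference is put.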

Signed-off-by: Tomer Tayar <ttayar@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
.../misc/habanalabs/common/command_submission.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index fa05770865c6..f1c69c8ed74a 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -798,7 +798,7 @@ static void cs_do_release(struct kref *ref)
static void cs_timedout(struct work_struct *work)
{
struct hl_device *hdev;
- u64 event_mask;
+ u64 event_mask = 0x0;
int rc;
struct hl_cs *cs = container_of(work, struct hl_cs,
work_tdr.work);
@@ -830,11 +830,7 @@ static void cs_timedout(struct work_struct *work)
if (rc) {
hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
hdev->captured_err_info.cs_timeout.seq = cs->sequence;
-
- event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
- HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
-
- hl_notifier_event_send_all(hdev, event_mask);
+ event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
}

switch (cs->type) {
@@ -869,8 +865,12 @@ static void cs_timedout(struct work_struct *work)

cs_put(cs);

- if (device_reset)
- hl_device_reset(hdev, HL_DRV_RESET_TDR);
+ if (device_reset) {
+ event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+ hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
+ } else if (event_mask) {
+ hl_notifier_event_send_all(hdev, event_mask);
+ }
}

static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
--
2.25.1