Re: [PATCH V1] accel/amdxdna: Check for device hang on job timeout
From: Lizhi Hou
Date: Mon Apr 13 2026 - 12:47:40 EST
Applied to drm-misc-next
On 4/10/26 13:37, Mario Limonciello wrote:
On 4/9/26 12:58, Lizhi Hou wrote:
Reviewed-by: Mario Limonciello (AMD) <superm1@xxxxxxxxxx>
A job timeout does not necessarily indicate that the device is hung, as
it may still be processing other jobs.
Track whether any jobs have been successfully submitted or completed,
and use this information to determine if the device is making forward
progress. If so, return DRM_GPU_SCHED_STAT_NO_HANG instead of treating
the timeout as a device hang.
In the meanwhile the timeout interval is changed to 2 seconds which meets
the userspace requirement.
Signed-off-by: Lizhi Hou <lizhi.hou@xxxxxxx>
---
drivers/accel/amdxdna/aie2_ctx.c | 36 +++++++++++++++++++++++++++-----
drivers/accel/amdxdna/aie2_pci.h | 6 ++++++
2 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index f97755d60fa3..ddcf06a6b80c 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -27,7 +27,9 @@ static bool force_cmdlist = true;
module_param(force_cmdlist, bool, 0600);
MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
-#define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
+uint tdr_timeout_ms = 2000;
+module_param(tdr_timeout_ms, int, 0400);
+MODULE_PARM_DESC(tdr_timeout_ms, "TDR (Timeout Detection and Recovery) timeout in milliseconds (0 = disable)");
struct aie2_ctx_health {
struct amdxdna_ctx_health header;
@@ -39,6 +41,24 @@ struct aie2_ctx_health {
u32 fatal_error_app_module;
};
+static inline void aie2_tdr_signal(struct amdxdna_dev *xdna)
+{
+ WRITE_ONCE(xdna->dev_handle->tdr_status, AIE2_TDR_SIGNALED);
+}
+
+static bool aie2_tdr_detect(struct amdxdna_dev *xdna)
+{
+ struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
+
+ if (READ_ONCE(ndev->tdr_status) == AIE2_TDR_WAIT) {
+ XDNA_ERR(xdna, "TDR timeout detected");
+ return true;
+ }
+
+ WRITE_ONCE(ndev->tdr_status, AIE2_TDR_WAIT);
+ return false;
+}
+
static void aie2_job_release(struct kref *ref)
{
struct amdxdna_sched_job *job;
@@ -177,6 +197,7 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
+ aie2_tdr_signal(job->hwctx->client->xdna);
job->hwctx->priv->completed++;
dma_fence_signal(fence);
@@ -385,6 +406,8 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
aie2_job_put(job);
mmput(job->mm);
fence = ERR_PTR(ret);
+ } else {
+ aie2_tdr_signal(hwctx->client->xdna);
}
trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
@@ -415,9 +438,12 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
xdna = hwctx->client->xdna;
trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
- job->job_timeout = true;
- mutex_lock(&xdna->dev_lock);
+ guard(mutex)(&xdna->dev_lock);
+
+ if (!aie2_tdr_detect(xdna))
+ return DRM_GPU_SCHED_STAT_NO_HANG;
+
report = kzalloc_obj(*report);
if (!report)
goto reset_hwctx;
@@ -429,10 +455,10 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
job->aie2_job_health = report;
reset_hwctx:
+ job->job_timeout = true;
aie2_hwctx_stop(xdna, hwctx, sched_job);
aie2_hwctx_restart(xdna, hwctx);
- mutex_unlock(&xdna->dev_lock);
return DRM_GPU_SCHED_STAT_RESET;
}
@@ -608,7 +634,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
.ops = &sched_ops,
.num_rqs = DRM_SCHED_PRIORITY_COUNT,
.credit_limit = HWCTX_MAX_CMDS,
- .timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
+ .timeout = msecs_to_jiffies(tdr_timeout_ms),
.name = "amdxdna_js",
.dev = xdna->ddev.dev,
};
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 7c308672b5fe..81564483cb16 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -165,6 +165,11 @@ struct aie2_exec_msg_ops {
u32 (*get_chain_msg_op)(u32 cmd_op);
};
+enum aie2_tdr_status {
+ AIE2_TDR_WAIT,
+ AIE2_TDR_SIGNALED,
+};
+
struct amdxdna_dev_hdl {
struct aie_device aie;
const struct amdxdna_dev_priv *priv;
@@ -197,6 +202,7 @@ struct amdxdna_dev_hdl {
u32 hwctx_num;
struct amdxdna_async_error last_async_err;
+ enum aie2_tdr_status tdr_status;
};
struct aie2_hw_ops {