Re: [PATCH V2 9/9] accel/amdxdna: Fill invalid payload for failed command
From: Mario Limonciello
Date: Fri Feb 27 2026 - 13:48:31 EST
The title is 9/9, but I didn't get copied on 0 through 8. Was that just a mistake when you made the patch you sent out one patch instead of 9?
On 2/26/26 6:48 PM, Lizhi Hou wrote:
Newer userspace applications may read the payload of a failed command
to obtain detailed error information. However, the driver and old firmware
versions may not support returning advanced error information.
In this case, initialize the command payload with an invalid value so
userspace can detect that no detailed error information is available.
Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
Signed-off-by: Lizhi Hou <lizhi.hou@xxxxxxx>
Would it also make sense to also detect the firmware version and indicate in the logs, IOCTL or a sysfs the feature is available for user space to detect?
Nonetheless, this approach is scalable.
Reviewed-by: Mario Limonciello (AMD) <superm1@xxxxxxxxxx>
---
drivers/accel/amdxdna/aie2_ctx.c | 23 ++++++++---------------
drivers/accel/amdxdna/amdxdna_ctx.c | 27 +++++++++++++++++++++++++++
drivers/accel/amdxdna/amdxdna_ctx.h | 3 +++
3 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 622ddbf7fb6f..eb4c9d919885 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -186,13 +186,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32))) {
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
ret = -EINVAL;
goto out;
}
@@ -202,7 +202,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
if (status == AIE2_STATUS_SUCCESS)
amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
else
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
out:
aie2_sched_notify(job);
@@ -244,13 +244,13 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
ret = -EINVAL;
goto out;
}
@@ -270,19 +270,12 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
fail_cmd_idx, fail_cmd_status);
if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
ret = -EINVAL;
- goto out;
+ } else {
+ amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
}
- amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
- if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
- struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
-
- cc->error_index = fail_cmd_idx;
- if (cc->error_index >= cc->command_count)
- cc->error_index = 0;
- }
out:
aie2_sched_notify(job);
return ret;
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index db3aa26fb55f..405d2c62789d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -132,6 +132,33 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
return INVALID_CU_IDX;
}
+int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
+ struct amdxdna_sched_job *job, u32 cmd_idx,
+ enum ert_cmd_state error_state)
+{
+ struct amdxdna_client *client = job->hwctx->client;
+ struct amdxdna_cmd *cmd = abo->mem.kva;
+ struct amdxdna_cmd_chain *cc = NULL;
+
+ cmd->header &= ~AMDXDNA_CMD_STATE;
+ cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, error_state);
+
+ if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN) {
+ cc = amdxdna_cmd_get_payload(abo, NULL);
+ cc->error_index = (cmd_idx < cc->command_count) ? cmd_idx : 0;
+ abo = amdxdna_gem_get_obj(client, cc->data[0], AMDXDNA_BO_CMD);
+ if (!abo)
+ return -EINVAL;
+ cmd = abo->mem.kva;
+ }
+
+ memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+ if (cc)
+ amdxdna_gem_put_obj(abo);
+
+ return 0;
+}
+
/*
* This should be called in close() and remove(). DO NOT call in other syscalls.
* This guarantee that when hwctx and resources will be released, if user
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index 16c85f08f03c..fbdf9d000871 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -167,6 +167,9 @@ amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
+int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
+ struct amdxdna_sched_job *job, u32 cmd_idx,
+ enum ert_cmd_state error_state);
void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
void amdxdna_hwctx_remove_all(struct amdxdna_client *client);