Re: [PATCH V1 4/6] accel/amdxdna: Add command doorbell and wait support

From: Mario Limonciello

Date: Wed May 06 2026 - 12:34:02 EST


On 5/6/26 11:11, Lizhi Hou wrote:

On 5/5/26 13:31, Mario Limonciello wrote:


On 5/5/26 11:09, Lizhi Hou wrote:
From: David Zhang <yidong.zhang@xxxxxxx>

Expose the command doorbell register to userspace on a per-hardware
context basis, enabling applications to notify the firmware of pending
commands via doorbell writes.

Introduce DRM_IOCTL_AMDXDNA_WAIT_CMD to allow userspace to wait for
completion of individual commands.

Co-developed-by: Hayden Laccabue <Hayden.Laccabue@xxxxxxx>
Signed-off-by: Hayden Laccabue <Hayden.Laccabue@xxxxxxx>
Signed-off-by: David Zhang <yidong.zhang@xxxxxxx>
Signed-off-by: Lizhi Hou <lizhi.hou@xxxxxxx>
Reviewed-by: Mario Limonciello (AMD) <superm1@xxxxxxxxxx>
Minor suggestion below.

---
  drivers/accel/amdxdna/aie4_ctx.c        | 75 +++++++++++++++++++++++++
  drivers/accel/amdxdna/aie4_host_queue.h |  2 +
  drivers/accel/amdxdna/aie4_pci.c        | 34 +++++++++++
  drivers/accel/amdxdna/aie4_pci.h        |  3 +
  drivers/accel/amdxdna/amdxdna_ctx.c     | 34 +++++++++++
  drivers/accel/amdxdna/amdxdna_ctx.h     |  4 +-
  drivers/accel/amdxdna/amdxdna_gem.c     |  5 +-
  drivers/accel/amdxdna/amdxdna_pci_drv.c | 18 +++++-
  drivers/accel/amdxdna/amdxdna_pci_drv.h |  3 +
  drivers/accel/amdxdna/npu3_regs.c       |  5 ++
  include/uapi/drm/amdxdna_accel.h        | 22 +++++++-
  11 files changed, 198 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/amdxdna/aie4_ctx.c b/drivers/accel/amdxdna/aie4_ctx.c
index 84ac706d0ffb..8408b0d2696f 100644
--- a/drivers/accel/amdxdna/aie4_ctx.c
+++ b/drivers/accel/amdxdna/aie4_ctx.c
@@ -256,3 +256,78 @@ void aie4_hwctx_fini(struct amdxdna_hwctx *hwctx)
      aie4_hwctx_umq_fini(hwctx);
      kfree(hwctx->priv);
  }
+
+static inline bool valid_queue_index(u64 read, u64 write, u32 capacity)
+{
+    return (write >= read) && ((write - read) <= capacity);
+}
+
+static u64 get_read_index(struct amdxdna_hwctx *hwctx)
+{
+    u64 wi = READ_ONCE(*hwctx->priv->umq_write_index);
+    u64 ri = READ_ONCE(*hwctx->priv->umq_read_index);
+    struct amdxdna_dev *xdna = hwctx->client->xdna;
+
+    /*
+     * CERT cannot update read index as uint64 atomically. Driver may read
+     * half-updated read index when it has bits in high 32bit. In case read
+     * index is not valid, wait for some time and retry once. It should
+     * allow CERT to complete the read index update.
+     */
+    if (!valid_queue_index(ri, wi, CTX_MAX_CMDS)) {
+        XDNA_WARN(xdna, "Invalid index, ri %llu, wi %llu", ri, wi);
+        usleep_range(100, 200);
+        ri = READ_ONCE(*hwctx->priv->umq_read_index);
+        if (!valid_queue_index(ri, wi, CTX_MAX_CMDS)) {
+            XDNA_ERR(xdna, "Invalid index after retry, ri %llu, wi %llu", ri, wi);
+            ri = 0;
+        }
+    }
+
+    return ri;
+}
+
+static inline bool check_cmd_done(struct amdxdna_hwctx *hwctx, u64 seq)
+{
+    u64 read_idx = get_read_index(hwctx);
+
+    return read_idx > seq;
+}
+
+int aie4_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
+{
+    unsigned long wait_jifs = MAX_SCHEDULE_TIMEOUT;
+    struct amdxdna_hwctx_priv *priv = hwctx->priv;
+    struct cert_comp *cert_comp = priv->cert_comp;
+    long ret;

Not sure I see the point in making ret a long. wait_event_interruptible_timeout() return 0 or 1.

Other than 0 or 1, wait_event_interruptible_timeout() can also return the remaining jiffies and -ERESTARTSYS


Ah thanks.

Lizhi


bool val;
val = wait_event_interruptible_timeout()
return val ? 0 : -ETIME;


+
+    if (timeout)
+        wait_jifs = msecs_to_jiffies(timeout);
+
+    ret = wait_event_interruptible_timeout(cert_comp->waitq,
+                           (check_cmd_done(hwctx, seq)),
+                           wait_jifs);
+
+    if (!ret)
+        ret = -ETIME;
+
+    return ret <= 0 ? ret : 0;
+}
+
+int aie4_hwctx_valid_doorbell(struct amdxdna_client *client, u32 vm_pgoff)
+{
+    struct amdxdna_hwctx *hwctx;
+    unsigned long hwctx_id;
+    int idx;
+
+    idx = srcu_read_lock(&client->hwctx_srcu);
+    amdxdna_for_each_hwctx(client, hwctx_id, hwctx) {
+        if (vm_pgoff == (hwctx->doorbell_offset >> PAGE_SHIFT)) {
+            srcu_read_unlock(&client->hwctx_srcu, idx);
+            return 1;
+        }
+    }
+    srcu_read_unlock(&client->hwctx_srcu, idx);
+
+    return 0;
+}
diff --git a/drivers/accel/amdxdna/aie4_host_queue.h b/drivers/accel/amdxdna/aie4_host_queue.h
index eb6a38dfb53e..1b33eda3f727 100644
--- a/drivers/accel/amdxdna/aie4_host_queue.h
+++ b/drivers/accel/amdxdna/aie4_host_queue.h
@@ -8,6 +8,8 @@
    #include <linux/types.h>
  +#define CTX_MAX_CMDS                    32
+
  struct host_queue_header {
      __u64 read_index;
      struct {
diff --git a/drivers/accel/amdxdna/aie4_pci.c b/drivers/accel/amdxdna/aie4_pci.c
index 3be9066b7178..9ff34ce57fcb 100644
--- a/drivers/accel/amdxdna/aie4_pci.c
+++ b/drivers/accel/amdxdna/aie4_pci.c
@@ -503,6 +503,38 @@ static int aie4m_pcidev_init(struct amdxdna_dev *xdna)
      return 0;
  }
  +static int aie4_doorbell_mmap(struct amdxdna_client *client, struct vm_area_struct *vma)
+{
+    struct amdxdna_dev *xdna = client->xdna;
+    struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev);
+    const struct amdxdna_dev_priv *npriv = xdna->dev_info->dev_priv;
+    phys_addr_t res_start;
+    unsigned long pfn;
+    int ret;
+
+    if (!aie4_hwctx_valid_doorbell(client, vma->vm_pgoff)) {
+        XDNA_ERR(xdna, "Invalid doorbell page offset 0x%lx", vma->vm_pgoff);
+        return -EINVAL;
+    }
+
+    if (vma_pages(vma) != 1) {
+        XDNA_ERR(xdna, "can only map one page, got %ld", vma_pages(vma));
+        return -EINVAL;
+    }
+
+    res_start = pci_resource_start(pdev, xdna->dev_info->doorbell_bar) + npriv->doorbell_off;
+    pfn = PHYS_PFN(res_start) + vma->vm_pgoff;
+    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+    vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP);
+    ret = io_remap_pfn_range(vma, vma->vm_start,
+                 pfn,
+                 PAGE_SIZE,
+                 vma->vm_page_prot);
+
+    XDNA_DBG(xdna, "doorbell ret %d", ret);
+    return ret;
+}
+
  static int aie4_pf_init(struct amdxdna_dev *xdna)
  {
      int ret;
@@ -547,4 +579,6 @@ const struct amdxdna_dev_ops aie4_vf_ops = {
      .fini            = aie4_vf_fini,
      .hwctx_init        = aie4_hwctx_init,
      .hwctx_fini        = aie4_hwctx_fini,
+    .mmap            = aie4_doorbell_mmap,
+    .cmd_wait        = aie4_cmd_wait,
  };
diff --git a/drivers/accel/amdxdna/aie4_pci.h b/drivers/accel/amdxdna/aie4_pci.h
index 6103007e6d2f..b69489acd53d 100644
--- a/drivers/accel/amdxdna/aie4_pci.h
+++ b/drivers/accel/amdxdna/aie4_pci.h
@@ -36,6 +36,7 @@ struct amdxdna_dev_priv {
      u32            mbox_bar;
      u32            mbox_rbuf_bar;
      u64            mbox_info_off;
+    u32            doorbell_off;
        struct aie_bar_off_pair    psp_regs_off[PSP_MAX_REGS];
      struct aie_bar_off_pair    smu_regs_off[SMU_MAX_REGS];
@@ -60,6 +61,8 @@ int aie4_suspend_fw(struct amdxdna_dev_hdl *ndev);
  /* aie4_ctx.c */
  int aie4_hwctx_init(struct amdxdna_hwctx *hwctx);
  void aie4_hwctx_fini(struct amdxdna_hwctx *hwctx);
+int aie4_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
+int aie4_hwctx_valid_doorbell(struct amdxdna_client *client, u32 vm_pgoff);
    /* aie4_sriov.c */
  #if IS_ENABLED(CONFIG_PCI_IOV)
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index b5ad60d4b734..b79229a63af3 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -627,3 +627,37 @@ int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_
      XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
      return -EINVAL;
  }
+
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+    struct amdxdna_client *client = filp->driver_priv;
+    struct amdxdna_dev *xdna = to_xdna_dev(dev);
+    struct amdxdna_drm_wait_cmd *args = data;
+    struct amdxdna_hwctx *hwctx;
+    int ret, idx;
+
+    XDNA_DBG(xdna, "PID %d ctx %d timeout set %d ms for cmd %llu",
+         client->pid, args->hwctx, args->timeout, args->seq);
+
+    if (!xdna->dev_info->ops->cmd_wait)
+        return -EOPNOTSUPP;
+
+    idx = srcu_read_lock(&client->hwctx_srcu);
+    hwctx = xa_load(&client->hwctx_xa, args->hwctx);
+    if (!hwctx) {
+        XDNA_DBG(xdna, "PID %d failed to get ctx %d", client->pid, args->hwctx);
+        ret = -EINVAL;
+        goto unlock_ctx_srcu;
+    }
+
+    ret = xdna->dev_info->ops->cmd_wait(hwctx, args->seq, args->timeout);
+
+    XDNA_DBG(xdna, "PID %d ctx %d cmd %lld wait finished, ret %d",
+         client->pid, args->hwctx, args->seq, ret);
+
+    trace_amdxdna_debug_point(current->comm, args->seq, "job returned to user");
+
+unlock_ctx_srcu:
+    srcu_read_unlock(&client->hwctx_srcu, idx);
+    return ret;
+}
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index c5622718b4d5..6e3c6371a088 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -211,12 +211,10 @@ int amdxdna_cmd_submit(struct amdxdna_client *client,
                 u32 *arg_bo_hdls, u32 arg_bo_cnt,
                 u32 hwctx_hdl, u64 *seq);
  -int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
-             u64 seq, u32 timeout);
-
  int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
  int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
  int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
  int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
    #endif /* _AMDXDNA_CTX_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
index ebfc472aa9e7..319d2064fafa 100644
--- a/drivers/accel/amdxdna/amdxdna_gem.c
+++ b/drivers/accel/amdxdna/amdxdna_gem.c
@@ -212,7 +212,8 @@ static bool amdxdna_hmm_invalidate(struct mmu_interval_notifier *mni,
      mmu_interval_set_seq(&mapp->notifier, cur_seq);
      up_write(&xdna->notifier_lock);
  -    xdna->dev_info->ops->hmm_invalidate(abo, cur_seq);
+    if (xdna->dev_info->ops->hmm_invalidate)
+        xdna->dev_info->ops->hmm_invalidate(abo, cur_seq);
        if (range->event == MMU_NOTIFY_UNMAP) {
          down_write(&xdna->notifier_lock);
@@ -295,7 +296,7 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo,
      u32 nr_pages;
      int ret;
  -    if (!xdna->dev_info->ops->hmm_invalidate)
+    if (!amdxdna_pasid_on(abo->client))
          return 0;
        mapp = kzalloc_obj(*mapp);
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
index 39ad081ac082..c0d00db25cde 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -224,6 +224,21 @@ static int amdxdna_drm_set_state_ioctl(struct drm_device *dev, void *data, struc
      return ret;
  }
  +static int amdxdna_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+    struct drm_file *drm_filp = filp->private_data;
+    struct amdxdna_client *client = drm_filp->driver_priv;
+    struct amdxdna_dev *xdna = client->xdna;
+
+    if (likely(vma->vm_pgoff >= DRM_FILE_PAGE_OFFSET_START))
+        return drm_gem_mmap(filp, vma);
+
+    if (!xdna->dev_info->ops->mmap)
+        return -EOPNOTSUPP;
+
+    return xdna->dev_info->ops->mmap(client, vma);
+}
+
  static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
      /* Context */
      DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_HWCTX, amdxdna_drm_create_hwctx_ioctl, 0),
@@ -235,6 +250,7 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
      DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
      /* Execution */
      DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
+    DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
      /* AIE hardware */
      DRM_IOCTL_DEF_DRV(AMDXDNA_GET_INFO, amdxdna_drm_get_info_ioctl, 0),
      DRM_IOCTL_DEF_DRV(AMDXDNA_GET_ARRAY, amdxdna_drm_get_array_ioctl, 0),
@@ -281,7 +297,7 @@ static const struct file_operations amdxdna_fops = {
      .poll        = drm_poll,
      .read        = drm_read,
      .llseek        = noop_llseek,
-    .mmap        = drm_gem_mmap,
+    .mmap        = amdxdna_drm_gem_mmap,
      .show_fdinfo    = drm_show_fdinfo,
      .fop_flags    = FOP_UNSIGNED_OFFSET,
  };
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
index caed11c09e55..471b72299aee 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
@@ -56,12 +56,14 @@ struct amdxdna_dev_ops {
      int (*resume)(struct amdxdna_dev *xdna);
      int (*suspend)(struct amdxdna_dev *xdna);
      int (*sriov_configure)(struct amdxdna_dev *xdna, int num_vfs);
+    int (*mmap)(struct amdxdna_client *client, struct vm_area_struct *vma);
      int (*hwctx_init)(struct amdxdna_hwctx *hwctx);
      void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
      int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
      int (*hwctx_sync_debug_bo)(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl);
      void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
      int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
+    int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
      int (*get_aie_info)(struct amdxdna_client *client, struct amdxdna_drm_get_info *args);
      int (*set_aie_state)(struct amdxdna_client *client, struct amdxdna_drm_set_state *args);
      int (*get_array)(struct amdxdna_client *client, struct amdxdna_drm_get_array *args);
@@ -85,6 +87,7 @@ struct amdxdna_dev_info {
      int                sram_bar;
      int                psp_bar;
      int                smu_bar;
+    int                doorbell_bar;
      int                device_type;
      int                first_col;
      u32                dev_mem_buf_shift;
diff --git a/drivers/accel/amdxdna/npu3_regs.c b/drivers/accel/amdxdna/npu3_regs.c
index 6d5da779232b..d76b2e99c308 100644
--- a/drivers/accel/amdxdna/npu3_regs.c
+++ b/drivers/accel/amdxdna/npu3_regs.c
@@ -14,6 +14,9 @@
  #define NPU3_MBOX_BUFFER_BAR    2
  #define NPU3_MBOX_INFO_OFF    0x0
  +#define NPU3_DOORBELL_BAR       2
+#define NPU3_DOORBELL_OFF       0x0
+
  /* PCIe BAR Index for NPU3 */
  #define NPU3_REG_BAR_INDEX    0
  #define NPU3_PSP_BAR_INDEX      4
@@ -45,6 +48,7 @@ static const struct amdxdna_dev_priv npu3_dev_priv = {
      .mbox_bar        = NPU3_MBOX_BAR,
      .mbox_rbuf_bar        = NPU3_MBOX_BUFFER_BAR,
      .mbox_info_off        = NPU3_MBOX_INFO_OFF,
+    .doorbell_off        = NPU3_DOORBELL_OFF,
      .psp_regs_off   = {
          DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU3_PSP, MPASP_C2PMSG_123_ALT_1),
          DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU3_PSP, MPASP_C2PMSG_156_ALT_1),
@@ -87,6 +91,7 @@ const struct amdxdna_dev_info dev_npu3_pf_info = {
  const struct amdxdna_dev_info dev_npu3_vf_info = {
      .mbox_bar        = NPU3_MBOX_BAR,
      .sram_bar        = NPU3_MBOX_BUFFER_BAR,
+    .doorbell_bar        = NPU3_DOORBELL_BAR,
      .default_vbnv        = "RyzenAI-npu3-vf",
      .device_type        = AMDXDNA_DEV_TYPE_UMQ,
      .dev_priv        = &npu3_dev_vf_priv,
diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
index ad9b33dd7b13..51a507561df6 100644
--- a/include/uapi/drm/amdxdna_accel.h
+++ b/include/uapi/drm/amdxdna_accel.h
@@ -45,7 +45,8 @@ enum amdxdna_drm_ioctl_id {
      DRM_AMDXDNA_EXEC_CMD,
      DRM_AMDXDNA_GET_INFO,
      DRM_AMDXDNA_SET_STATE,
-    DRM_AMDXDNA_GET_ARRAY = 10,
+    DRM_AMDXDNA_WAIT_CMD,
+    DRM_AMDXDNA_GET_ARRAY,
  };
    /**
@@ -274,6 +275,21 @@ struct amdxdna_drm_exec_cmd {
      __u64 seq;
  };
  +/**
+ * struct amdxdna_drm_wait_cmd - Wait execution command.
+ *
+ * @hwctx: Context handle.
+ * @timeout: timeout in ms, 0 implies infinite wait.
+ * @seq: sequence number of the command returned by execute command.
+ *
+ * Wait a command specified by seq to be completed.
+ */
+struct amdxdna_drm_wait_cmd {
+    __u32 hwctx;
+    __u32 timeout;
+    __u64 seq;
+};
+
  /**
   * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware
   * @buffer: The user space buffer that will return the AIE status.
@@ -739,6 +755,10 @@ struct amdxdna_drm_set_power_mode {
      DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_ARRAY, \
           struct amdxdna_drm_get_array)
  +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
+    DRM_IOW(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
+        struct amdxdna_drm_wait_cmd)
+
  #if defined(__cplusplus)
  } /* extern c end */
  #endif