[PATCH] accel/rocket: Add per-task flags and interrupt mask for flexible job handling
From: Ross Cawston
Date: Mon Feb 16 2026 - 13:45:33 EST
The Rocket NPU supports multiple task types:
- Convolutional workloads that use CNA, Core, and DPU blocks
- Standalone post-processing (PPU) tasks such as pooling and element-wise operations
- Pipelined DPU→PPU workloads
The current driver has several limitations that prevent correct execution of
non-convolutional workloads and multi-core operation:
- CNA and Core S_POINTER registers are always initialized, re-arming them
with stale state from previous jobs and corrupting standalone DPU/PPU tasks.
- Completion is hard-coded to wait only for DPU interrupts, causing PPU-only
or DPU→PPU pipeline jobs to time out.
- Ping-pong mode is unconditionally enabled, which is unnecessary for
single-task jobs.
- Non-zero cores hang because the vendor-specific "extra bit" (bit 28 × core
index) in S_POINTER is not set; the BSP sets this via MMIO because userspace
cannot know which core the scheduler will select.
- Timeout and IRQ debugging information is minimal.
This patch introduces two new per-task fields to struct rocket_task:
- u32 int_mask: specifies which block completion interrupts signal task done
(DPU_0|DPU_1 for convolutional/standalone DPU, PPU_0|PPU_1 for PPU tasks).
Zero defaults to DPU_0|DPU_1 for backward compatibility.
- u32 flags: currently used for ROCKET_TASK_NO_CNA_CORE to indicate standalone
DPU/PPU tasks that must not touch CNA/Core state.
Additional changes:
- Only initialize CNA and Core S_POINTER (with the required per-core extra bit)
when ROCKET_TASK_NO_CNA_CORE is not set.
- Set the per-core extra bit via MMIO to fix hangs on non-zero cores.
- Enable ping-pong mode only when the job contains multiple tasks.
- Mask and clear interrupts according to the task's int_mask.
- Accept both DPU and PPU completion interrupts in the IRQ handler.
- Minor error-path fix in GEM object creation (check error after unlocking
mm_lock).
These changes, derived from vendor BSP behavior, enable correct execution
of PPU-only tasks, pipelined workloads, and reliable multi-core operation
while preserving backward compatibility.
---
drivers/accel/rocket/rocket_gem.c | 2 +
drivers/accel/rocket/rocket_job.c | 99 +++++++++++++++++++++++++------
drivers/accel/rocket/rocket_job.h | 2 +
include/uapi/drm/rocket_accel.h | 30 ++++++++++
4 files changed, 115 insertions(+), 18 deletions(-)
diff --git a/drivers/accel/rocket/rocket_gem.c b/drivers/accel/rocket/rocket_gem.c
index 624c4ecf5a34..db1ff3544af2 100644
--- a/drivers/accel/rocket/rocket_gem.c
+++ b/drivers/accel/rocket/rocket_gem.c
@@ -95,6 +95,8 @@ int rocket_ioctl_create_bo(struct drm_device *dev, void *data, struct drm_file *
rkt_obj->size, PAGE_SIZE,
0, 0);
mutex_unlock(&rocket_priv->mm_lock);
+ if (ret)
+ goto err;
ret = iommu_map_sgtable(rocket_priv->domain->domain,
rkt_obj->mm.start,
diff --git a/drivers/accel/rocket/rocket_job.c b/drivers/accel/rocket/rocket_job.c
index acd606160dc9..dd69b195d0e6 100644
--- a/drivers/accel/rocket/rocket_job.c
+++ b/drivers/accel/rocket/rocket_job.c
@@ -96,6 +96,13 @@ rocket_copy_tasks(struct drm_device *dev,
rjob->tasks[i].regcmd = task.regcmd;
rjob->tasks[i].regcmd_count = task.regcmd_count;
+ rjob->tasks[i].int_mask = task.int_mask;
+ rjob->tasks[i].flags = task.flags;
+
+ /* Default to DPU completion if no mask specified */
+ if (!rjob->tasks[i].int_mask)
+ rjob->tasks[i].int_mask = PC_INTERRUPT_MASK_DPU_0 |
+ PC_INTERRUPT_MASK_DPU_1;
}
return 0;
@@ -108,7 +115,6 @@ rocket_copy_tasks(struct drm_device *dev,
static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *job)
{
struct rocket_task *task;
- unsigned int extra_bit;
/* Don't queue the job if a reset is in progress */
if (atomic_read(&core->reset.pending))
@@ -121,29 +127,61 @@ static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *jo
rocket_pc_writel(core, BASE_ADDRESS, 0x1);
- /* From rknpu, in the TRM this bit is marked as reserved */
- extra_bit = 0x10000000 * core->index;
- rocket_cna_writel(core, S_POINTER, CNA_S_POINTER_POINTER_PP_EN(1) |
- CNA_S_POINTER_EXECUTER_PP_EN(1) |
- CNA_S_POINTER_POINTER_PP_MODE(1) |
- extra_bit);
-
- rocket_core_writel(core, S_POINTER, CORE_S_POINTER_POINTER_PP_EN(1) |
- CORE_S_POINTER_EXECUTER_PP_EN(1) |
- CORE_S_POINTER_POINTER_PP_MODE(1) |
- extra_bit);
+ /*
+ * Initialize CNA and Core S_POINTER for ping-pong mode via MMIO.
+ *
+ * Each core needs a per-core extra_bit (bit 28 * core_index) which
+ * the TRM marks as reserved but the BSP rknpu driver sets. Without
+ * it, non-zero cores hang. This MUST be done via MMIO (not regcmd)
+ * because userspace doesn't know which core the scheduler picks.
+ *
+ * DPU/DPU_RDMA and PPU/PPU_RDMA S_POINTERs are set by the regcmd
+ * itself — they don't need the per-core extra_bit.
+ *
+ * For standalone DPU/PPU tasks (element-wise ops, pooling), CNA
+ * and Core have no work. Writing their S_POINTERs would re-arm
+ * them with stale state from the previous conv task, corrupting
+ * the DPU/PPU output. Userspace signals this via the
+ * ROCKET_TASK_NO_CNA_CORE flag.
+ */
+ if (!(task->flags & ROCKET_TASK_NO_CNA_CORE)) {
+ unsigned int extra_bit = 0x10000000 * core->index;
+ rocket_cna_writel(core, S_POINTER,
+ CNA_S_POINTER_POINTER_PP_EN(1) |
+ CNA_S_POINTER_EXECUTER_PP_EN(1) |
+ CNA_S_POINTER_POINTER_PP_MODE(1) |
+ extra_bit);
+
+ rocket_core_writel(core, S_POINTER,
+ CORE_S_POINTER_POINTER_PP_EN(1) |
+ CORE_S_POINTER_EXECUTER_PP_EN(1) |
+ CORE_S_POINTER_POINTER_PP_MODE(1) |
+ extra_bit);
+ }
rocket_pc_writel(core, BASE_ADDRESS, task->regcmd);
rocket_pc_writel(core, REGISTER_AMOUNTS,
PC_REGISTER_AMOUNTS_PC_DATA_AMOUNT((task->regcmd_count + 1) / 2 - 1));
- rocket_pc_writel(core, INTERRUPT_MASK, PC_INTERRUPT_MASK_DPU_0 | PC_INTERRUPT_MASK_DPU_1);
- rocket_pc_writel(core, INTERRUPT_CLEAR, PC_INTERRUPT_CLEAR_DPU_0 | PC_INTERRUPT_CLEAR_DPU_1);
+ /*
+ * Enable interrupts for the last block in this task's pipeline.
+ *
+ * The int_mask field from userspace specifies which block completion
+ * signals that this task is done:
+ * - Conv/DPU tasks: DPU_0 | DPU_1
+ * - PPU tasks (DPU→PPU pipeline): PPU_0 | PPU_1
+ *
+ * Only enabling the terminal block's interrupt prevents the kernel
+ * from stopping the pipeline early (e.g. DPU fires before PPU has
+ * finished writing its output).
+ */
+ rocket_pc_writel(core, INTERRUPT_MASK, task->int_mask);
+ rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff);
rocket_pc_writel(core, TASK_CON, PC_TASK_CON_RESERVED_0(1) |
PC_TASK_CON_TASK_COUNT_CLEAR(1) |
PC_TASK_CON_TASK_NUMBER(1) |
- PC_TASK_CON_TASK_PP_EN(1));
+ PC_TASK_CON_TASK_PP_EN(job->task_count > 1 ? 1 : 0));
rocket_pc_writel(core, TASK_DMA_BASE_ADDR, PC_TASK_DMA_BASE_ADDR_DMA_BASE_ADDR(0x0));
@@ -385,7 +423,23 @@ static enum drm_gpu_sched_stat rocket_job_timedout(struct drm_sched_job *sched_j
struct rocket_device *rdev = job->rdev;
struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
- dev_err(core->dev, "NPU job timed out");
+ {
+ u32 raw = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);
+ u32 status = rocket_pc_readl(core, INTERRUPT_STATUS);
+ u32 mask = rocket_pc_readl(core, INTERRUPT_MASK);
+ u32 op_en = rocket_pc_readl(core, OPERATION_ENABLE);
+ u32 task_status = rocket_pc_readl(core, TASK_STATUS);
+ u32 cna_s_status = rocket_cna_readl(core, S_STATUS);
+ u32 core_s_status = rocket_core_readl(core, S_STATUS);
+ u32 core_misc = readl(core->core_iomem + 0x10); /* MISC_CFG */
+ u32 core_op_en = readl(core->core_iomem + 0x08); /* OPERATION_ENABLE */
+
+ dev_err(core->dev,
+ "NPU job timed out: raw=0x%08x mask=0x%08x op_en=0x%x task_status=0x%x cna_s=0x%x core_s=0x%x core_misc=0x%x core_op_en=0x%x task=%u/%u",
+ raw, mask, op_en, task_status,
+ cna_s_status, core_s_status, core_misc, core_op_en,
+ job->next_task_idx, job->task_count);
+ }
atomic_set(&core->reset.pending, 1);
rocket_reset(core, sched_job);
@@ -424,8 +478,17 @@ static irqreturn_t rocket_job_irq_handler(int irq, void *data)
WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_WRITE_ERROR);
- if (!(raw_status & PC_INTERRUPT_RAW_STATUS_DPU_0 ||
- raw_status & PC_INTERRUPT_RAW_STATUS_DPU_1))
+ /*
+ * Check for any job completion interrupt: DPU or PPU.
+ *
+ * Conv and standalone DPU jobs signal via DPU_0/DPU_1.
+ * PPU pooling jobs signal via PPU_0/PPU_1.
+ * We must recognize both to avoid PPU job timeouts.
+ */
+ if (!(raw_status & (PC_INTERRUPT_RAW_STATUS_DPU_0 |
+ PC_INTERRUPT_RAW_STATUS_DPU_1 |
+ PC_INTERRUPT_RAW_STATUS_PPU_0 |
+ PC_INTERRUPT_RAW_STATUS_PPU_1)))
return IRQ_NONE;
rocket_pc_writel(core, INTERRUPT_MASK, 0x0);
diff --git a/drivers/accel/rocket/rocket_job.h b/drivers/accel/rocket/rocket_job.h
index 4ae00feec3b9..6931dfed8615 100644
--- a/drivers/accel/rocket/rocket_job.h
+++ b/drivers/accel/rocket/rocket_job.h
@@ -13,6 +13,8 @@
struct rocket_task {
u64 regcmd;
u32 regcmd_count;
+ u32 int_mask;
+ u32 flags;
};
struct rocket_job {
diff --git a/include/uapi/drm/rocket_accel.h b/include/uapi/drm/rocket_accel.h
index 14b2e12b7c49..b041bcb05e27 100644
--- a/include/uapi/drm/rocket_accel.h
+++ b/include/uapi/drm/rocket_accel.h
@@ -73,6 +73,11 @@ struct drm_rocket_fini_bo {
__u32 reserved;
};
+/**
+ * Flags for drm_rocket_task.flags
+ */
+#define ROCKET_TASK_NO_CNA_CORE 0x1
+
/**
* struct drm_rocket_task - A task to be run on the NPU
*
@@ -84,6 +89,31 @@ struct drm_rocket_task {
/** Input: Number of commands in the register command buffer */
__u32 regcmd_count;
+
+ /**
+ * Input: Interrupt mask specifying which block completion signals
+ * that this task is done. Uses PC_INTERRUPT_MASK_* bits.
+ *
+ * For conv/DPU tasks: DPU_0 | DPU_1 (0x0300)
+ * For PPU tasks: PPU_0 | PPU_1 (0x0C00)
+ *
+ * If zero, defaults to DPU_0 | DPU_1 for backwards compatibility.
+ */
+ __u32 int_mask;
+
+ /**
+ * Input: Task flags.
+ *
+ * ROCKET_TASK_NO_CNA_CORE: Skip CNA and Core S_POINTER MMIO
+ * writes for this task. Used for standalone DPU element-wise
+ * and PPU pooling tasks that don't use CNA/Core. Without this
+ * flag, CNA/Core get re-armed with stale state from the
+ * previous conv task, corrupting the DPU/PPU output.
+ *
+ * Zero means write CNA/Core S_POINTER (default for conv tasks,
+ * backwards compatible with old userspace).
+ */
+ __u32 flags;
};
/**
--
2.52.0