Re: [PATCH v3] drm/imagination: Fix double call to drm_sched_entity_fini()

From: Alessio Belle

Date: Thu Jun 18 2026 - 15:48:01 EST


Hi Brajesh,

On Thu, 2026-06-11 at 12:47 +0530, Brajesh Gupta wrote:
> Call sequence of double call:
> pvr_context_destroy
>   pvr_context_kill_queues
>     pvr_queue_kill
>       drm_sched_entity_destroy
>         drm_sched_entity_fini // here
>   pvr_context_put
>     kref_put(..., pvr_context_release)
>       pvr_context_destroy_queues
>         pvr_queue_destroy
>           drm_sched_entity_fini // here
>
> Call to drm_sched_entity_destroy() from pvr_context_kill_queues() calls
> drm_sched_entity_flush() + drm_sched_entity_fini().
> drm_sched_entity_flush() ensures all pending jobs are completed and
> drm_sched_entity_fini() ensures no further submission is allowed as
> per expectation from pvr_context_kill_queues(). Drop the extra
> call to drm_sched_entity_fini().
>
> Stack trace for issue with addition of refcounting for DRM entity
> stats in commit fd177135f0e6 ("drm/sched: Account entity GPU time"):
> [ 789.490527] ------------[ cut here ]------------
> [ 789.490559] refcount_t: underflow; use-after-free.
> [ 789.490657] WARNING: lib/refcount.c:28 at refcount_warn_saturate+0xf4/0x144, CPU#0: kworker/u16:1/440
> [ 789.490695] Modules linked in: powervr drm_gpuvm drm_exec gpu_sched drm_shmem_helper xhci_plat_hcd xhci_hcd dwc3 usbcore usb_common snd_soc_simple_card snd_soc_simple_card_utils sa2ul sha512 sha256 dwc3_am62 sha1 authenc rti_wdt libsha512 at24 sch_fq_codel fuse dm_mod ipv6
> [ 789.490798] CPU: 0 UID: 0 PID: 440 Comm: kworker/u16:1 Not tainted 7.0.0-rc7-02049-g5e2c0700091b #22 PREEMPT
> [ 789.490809] Hardware name: Texas Instruments AM625 SK (DT)
> [ 789.490815] Workqueue: powervr-sched pvr_queue_fence_release_work [powervr]
> [ 789.490868] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [ 789.490876] pc : refcount_warn_saturate+0xf4/0x144
> [ 789.490884] lr : refcount_warn_saturate+0xf4/0x144
> [ 789.490892] sp : ffff8000822cbcc0
> [ 789.490895] x29: ffff8000822cbcc0 x28: 0000000000000000 x27: 0000000000000000
> [ 789.490909] x26: 0000000000000000 x25: ffff800081b1e338 x24: ffff000004541405
> [ 789.490922] x23: ffff000004bea950 x22: ffff00000042e400 x21: ffff000007123e30
> [ 789.490935] x20: ffff000007123000 x19: ffff000007a80d50 x18: fffffffffffe7768
> [ 789.490948] x17: 74736574202c6e6f x16: 697461746e656d65 x15: ffff800081b269f0
> [ 789.490962] x14: 0000000000000030 x13: ffff800081b26a70 x12: 0000000000000211
> [ 789.490975] x11: 00000000000000c0 x10: 0000000000000b50 x9 : ffff8000822cbb30
> [ 789.490988] x8 : ffff0000014e7bb0 x7 : ffff00007725e780 x6 : 0000000372a05f49
> [ 789.491001] x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000010
> [ 789.491013] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000014e7000
> [ 789.491027] Call trace:
> [ 789.491032] refcount_warn_saturate+0xf4/0x144 (P)
> [ 789.491043] drm_sched_entity_fini+0x164/0x18c [gpu_sched]
> [ 789.491081] pvr_queue_destroy+0x64/0x134 [powervr]
> [ 789.491110] pvr_context_destroy_queues+0x34/0x64 [powervr]
> [ 789.491138] pvr_context_release+0x70/0xac [powervr]
> [ 789.491166] pvr_context_put.part.0+0x5c/0x7c [powervr]
> [ 789.491193] pvr_context_put+0x14/0x24 [powervr]
> [ 789.491221] pvr_queue_fence_release_work+0x20/0x38 [powervr]
> [ 789.491249] process_one_work+0x160/0x4c4
> [ 789.491264] worker_thread+0x188/0x310
> [ 789.491276] kthread+0x130/0x13c
> [ 789.491287] ret_from_fork+0x10/0x20
> [ 789.491300] ---[ end trace 0000000000000000 ]---
>
> Signed-off-by: Brajesh Gupta <brajesh.gupta@xxxxxxxxxx>
> Fixes: eaf01ee5ba28 ("drm/imagination: Implement job submission and scheduling")
> References: fd177135f0e6 ("drm/sched: Account entity GPU time")
> ---
> Changes in v3:
> - Fixed a typo.
> - Handled missing memory leak for RENDER_CONTEXT.
> - Link to v2: https://lore.kernel.org/r/20260611-b4-sched_fix-v2-1-17a93be86fcd@xxxxxxxxxx
>
> Changes in v2:
> - Fixed memory leak identified in following error path handling of pvr_context_create():
> - pvr_context_create()
> - ...
> - err_destroy_queues:
> - pvr_context_destroy_queues()
> - pvr_queue_destroy()
> - Link to v1: https://lore.kernel.org/r/20260610-b4-sched_fix-v1-1-c5977a6e0b4c@xxxxxxxxxx
> ---
> drivers/gpu/drm/imagination/pvr_context.c | 41 +++++++++++++++++++++++++------
> drivers/gpu/drm/imagination/pvr_queue.c | 4 ---
> 2 files changed, 33 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/imagination/pvr_context.c b/drivers/gpu/drm/imagination/pvr_context.c
> index eba4694400b5..e2e30908dfce 100644
> --- a/drivers/gpu/drm/imagination/pvr_context.c
> +++ b/drivers/gpu/drm/imagination/pvr_context.c
> @@ -161,22 +161,47 @@ ctx_fw_data_init(void *cpu_ptr, void *priv)
> /**
> * pvr_context_destroy_queues() - Destroy all queues attached to a context.
> * @ctx: Context to destroy queues on.
> + * @queue_entity_fini: Corresponding queues of context to be released for
> + * failure path in context creation.
> *
> * Should be called when the last reference to a context object is dropped.
> * It releases all resources attached to the queues bound to this context.
> */
> -static void pvr_context_destroy_queues(struct pvr_context *ctx)
> +static void pvr_context_destroy_queues(struct pvr_context *ctx, bool queue_entity_fini)
> {
> switch (ctx->type) {
> case DRM_PVR_CTX_TYPE_RENDER:
> - pvr_queue_destroy(ctx->queues.fragment);
> - pvr_queue_destroy(ctx->queues.geometry);
> + if (ctx->queues.fragment) {
> + if (queue_entity_fini)
> + drm_sched_entity_fini(&ctx->queues.fragment->entity);
> +
> + pvr_queue_destroy(ctx->queues.fragment);
> + }
> +
> + if (ctx->queues.geometry) {
> + if (queue_entity_fini)
> + drm_sched_entity_fini(&ctx->queues.geometry->entity);
> +
> + pvr_queue_destroy(ctx->queues.geometry);
> + }

Would it make sense to push the new flag queue_entity_fini into
pvr_queue_destroy() and keep calling drm_sched_entity_fini() from there but
conditionally, to avoid all the new ifs around here?

A potential alternative, skimming through the scheduler internals, could be to
replace drm_sched_entity_destroy() in pvr_queue_kill() with
drm_sched_entity_flush() + drm_sched_entity_kill() (not fini()), just not
entirely sure that is expected usage given that the commit message from [1]
seems to suggest kill() is to be used in place of flush() in general (calling
both might not hurt).

Also if this needs backporting, which is unclear since the bug didn't seem to do
anything visible until the referenced issue, you'd have to go with the current
fix for the backports, since drm_sched_entity_kill() hasn't made it to any
kernel release yet.

[1] https://lore.kernel.org/dri-devel/20260415144956.272506-2-phasta@xxxxxxxxxx/

Thanks,
Alessio

> break;
> +
> case DRM_PVR_CTX_TYPE_COMPUTE:
> - pvr_queue_destroy(ctx->queues.compute);
> + if (ctx->queues.compute) {
> + if (queue_entity_fini)
> + drm_sched_entity_fini(&ctx->queues.compute->entity);
> +
> + pvr_queue_destroy(ctx->queues.compute);
> + }
> break;
> +
> case DRM_PVR_CTX_TYPE_TRANSFER_FRAG:
> - pvr_queue_destroy(ctx->queues.transfer);
> + if (ctx->queues.transfer) {
> + if (queue_entity_fini)
> + drm_sched_entity_fini(&ctx->queues.transfer->entity);
> +
> + pvr_queue_destroy(ctx->queues.transfer);
> + }
> break;
> }
> }
> @@ -240,7 +265,7 @@ static int pvr_context_create_queues(struct pvr_context *ctx,
> return -EINVAL;
>
> err_destroy_queues:
> - pvr_context_destroy_queues(ctx);
> + pvr_context_destroy_queues(ctx, true);
> return err;
> }
>
> @@ -349,7 +374,7 @@ int pvr_context_create(struct pvr_file *pvr_file, struct drm_pvr_ioctl_create_co
> pvr_fw_object_destroy(ctx->fw_obj);
>
> err_destroy_queues:
> - pvr_context_destroy_queues(ctx);
> + pvr_context_destroy_queues(ctx, true);
>
> err_free_ctx_id:
> /*
> @@ -384,7 +409,7 @@ pvr_context_release(struct kref *ref_count)
> spin_unlock(&pvr_dev->ctx_list_lock);
>
> xa_erase(&pvr_dev->ctx_ids, ctx->ctx_id);
> - pvr_context_destroy_queues(ctx);
> + pvr_context_destroy_queues(ctx, false);
> pvr_fw_object_destroy(ctx->fw_obj);
> kfree(ctx->data);
> pvr_vm_context_put(ctx->vm_ctx);
> diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c
> index 7ed60e1c1a86..15cd8adc7528 100644
> --- a/drivers/gpu/drm/imagination/pvr_queue.c
> +++ b/drivers/gpu/drm/imagination/pvr_queue.c
> @@ -1445,15 +1445,11 @@ void pvr_queue_kill(struct pvr_queue *queue)
> */
> void pvr_queue_destroy(struct pvr_queue *queue)
> {
> - if (!queue)
> - return;
> -
> mutex_lock(&queue->ctx->pvr_dev->queues.lock);
> list_del_init(&queue->node);
> mutex_unlock(&queue->ctx->pvr_dev->queues.lock);
>
> drm_sched_fini(&queue->scheduler);
> - drm_sched_entity_fini(&queue->entity);
>
> if (WARN_ON(queue->last_queued_job_scheduled_fence))
> dma_fence_put(queue->last_queued_job_scheduled_fence);
>
> ---
> base-commit: 61de054a772a1feda6364931ab1baf9038abf1c8
> change-id: 20260610-b4-sched_fix-ac3b920f475b
>
> Best regards,