Re: [PATCH] drm/etnaviv: print offender task information on hangcheck recovery
From: Lucas Stach
Date: Fri Aug 19 2022 - 13:13:11 EST
Am Mittwoch, dem 22.06.2022 um 10:52 +0200 schrieb Lucas Stach:
> Hi Christian,
>
> Am Freitag, dem 03.06.2022 um 14:37 +0200 schrieb Christian Gmeiner:
> > Track the pid per submit, so we can print the name and cmdline of
> > the task which submitted the batch that caused the gpu to hang.
> >
> I really like the idea. I think the pid handling could be integrated
> into the scheduler, so we don't have to carry it on each submit, but
> not requesting any changes right now. I'm leaning toward taking this
> patch as-is and doing the scheduler integration as a second step.
>
Applied to etnaviv/next.
Regards,
Lucas
>
> > Signed-off-by: Christian Gmeiner <christian.gmeiner@xxxxxxxxx>
> > ---
> > drivers/gpu/drm/etnaviv/etnaviv_gem.h | 1 +
> > drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++++
> > drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 18 +++++++++++++++++-
> > drivers/gpu/drm/etnaviv/etnaviv_gpu.h | 2 +-
> > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +-
> > 5 files changed, 26 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
> > index 63688e6e4580..baa81cbf701a 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
> > @@ -96,6 +96,7 @@ struct etnaviv_gem_submit {
> > int out_fence_id;
> > struct list_head node; /* GPU active submit list */
> > struct etnaviv_cmdbuf cmdbuf;
> > + struct pid *pid; /* submitting process */
> > bool runtime_resumed;
> > u32 exec_state;
> > u32 flags;
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> > index 1ac916b24891..1491159d0d20 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> > @@ -399,6 +399,9 @@ static void submit_cleanup(struct kref *kref)
> > mutex_unlock(&submit->gpu->fence_lock);
> > dma_fence_put(submit->out_fence);
> > }
> > +
> > + put_pid(submit->pid);
> > +
> > kfree(submit->pmrs);
> > kfree(submit);
> > }
> > @@ -422,6 +425,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
> > struct sync_file *sync_file = NULL;
> > struct ww_acquire_ctx ticket;
> > int out_fence_fd = -1;
> > + struct pid *pid = get_pid(task_pid(current));
> > void *stream;
> > int ret;
> >
> > @@ -519,6 +523,8 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
> > goto err_submit_ww_acquire;
> > }
> >
> > + submit->pid = pid;
> > +
> > ret = etnaviv_cmdbuf_init(priv->cmdbuf_suballoc, &submit->cmdbuf,
> > ALIGN(args->stream_size, 8) + 8);
> > if (ret)
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > index 37018bc55810..7d9bf4673e2d 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
> > @@ -1045,12 +1045,28 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
> > }
> > #endif
> >
> > -void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
> > +void etnaviv_gpu_recover_hang(struct etnaviv_gem_submit *submit)
> > {
> > + struct etnaviv_gpu *gpu = submit->gpu;
> > + char *comm = NULL, *cmd = NULL;
> > + struct task_struct *task;
> > unsigned int i;
> >
> > dev_err(gpu->dev, "recover hung GPU!\n");
> >
> > + task = get_pid_task(submit->pid, PIDTYPE_PID);
> > + if (task) {
> > + comm = kstrdup(task->comm, GFP_KERNEL);
> > + cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);
> > + put_task_struct(task);
> > + }
> > +
> > + if (comm && cmd)
> > + dev_err(gpu->dev, "offending task: %s (%s)\n", comm, cmd);
> > +
> > + kfree(cmd);
> > + kfree(comm);
> > +
> > if (pm_runtime_get_sync(gpu->dev) < 0)
> > goto pm_put;
> >
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
> > index 85eddd492774..b3a0941d56fd 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
> > @@ -168,7 +168,7 @@ bool etnaviv_fill_identity_from_hwdb(struct etnaviv_gpu *gpu);
> > int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m);
> > #endif
> >
> > -void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu);
> > +void etnaviv_gpu_recover_hang(struct etnaviv_gem_submit *submit);
> > void etnaviv_gpu_retire(struct etnaviv_gpu *gpu);
> > int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu,
> > u32 fence, struct drm_etnaviv_timespec *timeout);
> > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > index 72e2553fbc98..d29f467eee13 100644
> > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> > @@ -67,7 +67,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
> >
> > /* get the GPU back into the init state */
> > etnaviv_core_dump(submit);
> > - etnaviv_gpu_recover_hang(gpu);
> > + etnaviv_gpu_recover_hang(submit);
> >
> > drm_sched_resubmit_jobs(&gpu->sched);
> >
>
>