Re: [PATCH v2 1/3] drm/panthor: introduce job cycle and timestamp accounting

From: Liviu Dudau
Date: Wed Apr 24 2024 - 11:49:09 EST


Hi Adrián,

On Tue, Apr 23, 2024 at 10:32:34PM +0100, Adrián Larumbe wrote:
> Enable calculations of job submission times in clock cycles and wall
> time. This is done by expanding the boilerplate command stream when running
> a job to include instructions that compute said times right before an after
> a user CS.
>
> Those numbers are stored in the queue's group's sync objects BO, right
> after them. Because the queues in a group might have a different number of
> slots, one must keep track of the overall slot tally when reckoning the
> offset of a queue's time sample structs, one for each slot.
>
> NUM_INSTRS_PER_SLOT had to be increased to 32 because of adding new FW
> instructions for storing and subtracting the cycle counter and timestamp
> register, and it must always remain a power of two.
>
> This commit is done in preparation for enabling DRM fdinfo support in the
> Panthor driver, which depends on the numbers calculated herein.
>
> Signed-off-by: Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx>
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 158 ++++++++++++++++++++----
> 1 file changed, 134 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index b3a51a6de523..320dfa0388ba 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -93,6 +93,9 @@
> #define MIN_CSGS 3
> #define MAX_CSG_PRIO 0xf
>
> +#define NUM_INSTRS_PER_SLOT 32
> +#define SLOTSIZE (NUM_INSTRS_PER_SLOT * sizeof(u64))
> +
> struct panthor_group;
>
> /**
> @@ -466,6 +469,9 @@ struct panthor_queue {
> */
> struct list_head in_flight_jobs;
> } fence_ctx;
> +
> + /** @time_offset: Offset of panthor_job_times structs in group's syncobj bo. */
> + unsigned long time_offset;

Minor nitpick: I would call it job_times_offset. Or stats_offset. One letter difference
versus syncobjs' times_offset gets harder to read sometimes.

> };
>
> /**
> @@ -580,7 +586,17 @@ struct panthor_group {
> * One sync object per queue. The position of the sync object is
> * determined by the queue index.
> */
> - struct panthor_kernel_bo *syncobjs;
> +
> + struct {
> + /** @bo: Kernel BO holding the sync objects. */
> + struct panthor_kernel_bo *bo;
> +
> + /**
> + * @times_offset: Beginning of panthor_job_times struct samples after
> + * the group's array of sync objects.
> + */
> + size_t times_offset;
> + } syncobjs;
>
> /** @state: Group state. */
> enum panthor_group_state state;
> @@ -639,6 +655,18 @@ struct panthor_group {
> struct list_head wait_node;
> };
>
> +struct panthor_job_times {
> + struct {
> + u64 before;
> + u64 after;
> + } cycles;
> +
> + struct {
> + u64 before;
> + u64 after;
> + } time;
> +};
> +
> /**
> * group_queue_work() - Queue a group work
> * @group: Group to queue the work for.
> @@ -718,6 +746,9 @@ struct panthor_job {
> /** @queue_idx: Index of the queue inside @group. */
> u32 queue_idx;
>
> + /** @ringbuf_idx: Index of the ringbuffer inside @queue. */
> + u32 ringbuf_idx;
> +
> /** @call_info: Information about the userspace command stream call. */
> struct {
> /** @start: GPU address of the userspace command stream. */
> @@ -833,7 +864,7 @@ static void group_release_work(struct work_struct *work)
>
> panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->suspend_buf);
> panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->protm_suspend_buf);
> - panthor_kernel_bo_destroy(group->vm, group->syncobjs);
> + panthor_kernel_bo_destroy(group->vm, group->syncobjs.bo);
>
> panthor_vm_put(group->vm);
> kfree(group);
> @@ -1924,8 +1955,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> }
> }
>
> -#define NUM_INSTRS_PER_SLOT 16
> -
> static void
> group_term_post_processing(struct panthor_group *group)
> {
> @@ -1962,7 +1991,7 @@ group_term_post_processing(struct panthor_group *group)
> spin_unlock(&queue->fence_ctx.lock);
>
> /* Manually update the syncobj seqno to unblock waiters. */
> - syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
> + syncobj = group->syncobjs.bo->kmap + (i * sizeof(*syncobj));
> syncobj->status = ~0;
> syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno);
> sched_queue_work(group->ptdev->scheduler, sync_upd);
> @@ -2729,7 +2758,7 @@ static void group_sync_upd_work(struct work_struct *work)
> if (!queue)
> continue;
>
> - syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj));
> + syncobj = group->syncobjs.bo->kmap + (queue_idx * sizeof(*syncobj));
>
> spin_lock(&queue->fence_ctx.lock);
> list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> @@ -2764,15 +2793,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> struct panthor_scheduler *sched = ptdev->scheduler;
> u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> + u32 ringbuf_index = ringbuf_insert / (SLOTSIZE);
> u64 addr_reg = ptdev->csif_info.cs_reg_count -
> ptdev->csif_info.unpreserved_cs_reg_count;
> u64 val_reg = addr_reg + 2;
> - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> + u64 cycle_reg = addr_reg;
> + u64 time_reg = val_reg;
> + u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) +
> + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> + u64 times_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + queue->time_offset +
> + (ringbuf_index * sizeof(struct panthor_job_times));
> +
> u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> struct dma_fence *done_fence;
> int ret;
>
> + drm_WARN_ON(&ptdev->base, ringbuf_insert >= ringbuf_size);

I think we should return an error here, rather than warn. What would be the point to continue?

> +
> u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> /* MOV32 rX+2, cs.latest_flush */
> (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> @@ -2780,6 +2817,18 @@ queue_run_job(struct drm_sched_job *sched_job)
> /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
>
> + /* MOV48 rX:rX+1, cycles_offset */
> + (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.before)),
> +
> + /* MOV48 rX:rX+1, time_offset */
> + (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.before)),
> +
> + /* STORE_STATE cycles */
> + (40ull << 56) | (cycle_reg << 40) | (1ll << 32),
> +
> + /* STORE_STATE timer */
> + (40ull << 56) | (time_reg << 40) | (0ll << 32),
> +
> /* MOV48 rX:rX+1, cs.start */
> (1ull << 56) | (addr_reg << 48) | job->call_info.start,
>
> @@ -2792,6 +2841,18 @@ queue_run_job(struct drm_sched_job *sched_job)
> /* CALL rX:rX+1, rX+2 */
> (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
>
> + /* MOV48 rX:rX+1, cycles_offset */
> + (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.after)),
> +
> + /* MOV48 rX:rX+1, time_offset */
> + (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.after)),
> +
> + /* STORE_STATE cycles */
> + (40ull << 56) | (cycle_reg << 40) | (1ll << 32),
> +
> + /* STORE_STATE timer */
> + (40ull << 56) | (time_reg << 40) | (0ll << 32),
> +
> /* MOV48 rX:rX+1, sync_addr */
> (1ull << 56) | (addr_reg << 48) | sync_addr,
>
> @@ -2846,6 +2907,7 @@ queue_run_job(struct drm_sched_job *sched_job)
>
> job->ringbuf.start = queue->iface.input->insert;
> job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> + job->ringbuf_idx = ringbuf_index;
>
> /* Make sure the ring buffer is updated before the INSERT
> * register.
> @@ -2936,7 +2998,8 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
>
> static struct panthor_queue *
> group_create_queue(struct panthor_group *group,
> - const struct drm_panthor_queue_create *args)
> + const struct drm_panthor_queue_create *args,
> + unsigned int slots_so_far)
> {
> struct drm_gpu_scheduler *drm_sched;
> struct panthor_queue *queue;
> @@ -2987,9 +3050,12 @@ group_create_queue(struct panthor_group *group,
> goto err_free_queue;
> }
>
> + queue->time_offset = group->syncobjs.times_offset +
> + (slots_so_far * sizeof(struct panthor_job_times));
> +
> ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> group->ptdev->scheduler->wq, 1,
> - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> + args->ringbuf_size / SLOTSIZE,
> 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
> group->ptdev->reset.wq,
> NULL, "panthor-queue", group->ptdev->base.dev);
> @@ -3017,7 +3083,9 @@ int panthor_group_create(struct panthor_file *pfile,
> struct panthor_scheduler *sched = ptdev->scheduler;
> struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
> struct panthor_group *group = NULL;
> + unsigned int total_slots;
> u32 gid, i, suspend_size;
> + size_t syncobj_bo_size;
> int ret;
>
> if (group_args->pad)
> @@ -3083,33 +3151,75 @@ int panthor_group_create(struct panthor_file *pfile,
> goto err_put_group;
> }
>
> - group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm,
> - group_args->queues.count *
> - sizeof(struct panthor_syncobj_64b),
> - DRM_PANTHOR_BO_NO_MMAP,
> - DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> - DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> - PANTHOR_VM_KERNEL_AUTO_VA);
> - if (IS_ERR(group->syncobjs)) {
> - ret = PTR_ERR(group->syncobjs);
> + /*
> + * Need to add size for the panthor_job_times structs, as many as the sum
> + * of the number of job slots for every single queue ringbuffer.
> + */
> + for (i = 0, total_slots = 0; i < group_args->queues.count; i++)
> + total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
> +
> + syncobj_bo_size = (group_args->queues.count * sizeof(struct panthor_syncobj_64b))
> + + (total_slots * sizeof(struct panthor_job_times));
> +
> + /*
> + * Memory layout of group's syncobjs BO
> + * group->syncobjs.bo {
> + * struct panthor_syncobj_64b sync1;
> + * struct panthor_syncobj_64b sync2;
> + * ...
> + * As many as group_args->queues.count
> + * ...
> + * struct panthor_syncobj_64b syncn;
> + * struct panthor_job_times queue1_slot1
> + * struct panthor_job_times queue1_slot2
> + * ...
> + * As many as queue[i].ringbuf_size / SLOTSIZE
> + * ...
> + * struct panthor_job_times queue1_slotP
> + * ...
> + * As many as group_args->queues.count
> + * ...
> + * struct panthor_job_times queueN_slot1
> + * struct panthor_job_times queueN_slot2
> + * ...
> + * As many as queue[n].ringbuf_size / SLOTSIZE
> + * struct panthor_job_times queueN_slotQ
> + *
> + * Linearly, group->syncobjs.bo = {syncojb1,..,syncobjN,
> + * {queue1 = {js1,..,jsP},..,queueN = {js1,..,jsQ}}}
> + * }
> + *
> + */
> +
> + group->syncobjs.bo = panthor_kernel_bo_create(ptdev, group->vm,
> + syncobj_bo_size,
> + DRM_PANTHOR_BO_NO_MMAP,
> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> + PANTHOR_VM_KERNEL_AUTO_VA);
> + if (IS_ERR(group->syncobjs.bo)) {
> + ret = PTR_ERR(group->syncobjs.bo);
> goto err_put_group;
> }
>
> - ret = panthor_kernel_bo_vmap(group->syncobjs);
> + ret = panthor_kernel_bo_vmap(group->syncobjs.bo);
> if (ret)
> goto err_put_group;
>
> - memset(group->syncobjs->kmap, 0,
> - group_args->queues.count * sizeof(struct panthor_syncobj_64b));
> + memset(group->syncobjs.bo->kmap, 0, syncobj_bo_size);
> +
> + group->syncobjs.times_offset =
> + group_args->queues.count * sizeof(struct panthor_syncobj_64b);
>
> - for (i = 0; i < group_args->queues.count; i++) {
> - group->queues[i] = group_create_queue(group, &queue_args[i]);
> + for (i = 0, total_slots = 0; i < group_args->queues.count; i++) {
> + group->queues[i] = group_create_queue(group, &queue_args[i], total_slots);
> if (IS_ERR(group->queues[i])) {
> ret = PTR_ERR(group->queues[i]);
> group->queues[i] = NULL;
> goto err_put_group;
> }
>
> + total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
> group->queue_count++;
> }
>
> --
> 2.44.0
>

With those comments addressed,
Reviewed-by: Liviu Dudau <liviu.dudau@xxxxxxx>

Best regards,
Liviu


--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯