Re: [PATCH] drm/panfrost: Queue jobs on the hardware

From: Steven Price
Date: Wed Aug 21 2019 - 05:00:57 EST


On 20/08/2019 06:23, Tomeu Vizoso wrote:
> On 8/16/19 11:31 AM, Steven Price wrote:
>> The hardware has a set of '_NEXT' registers that can hold a second job
>> while the first is executing. Make use of these registers to enqueue a
>> second job per slot.
>
> I like this in principle, but upon some quick testing I found that Mesa
> is around 10% slower with this patch (when using the performance governor).
>
> There's also the question of how this affects the utilization
> calculation in the devfreq code.

Yes - as far as I can tell the devfreq code is already broken as it is
using a per-JS utilisation metric. I'll try to find some time to fix
that up as well before reposting.

Steve

> I will be trying to find time to understand why Mesa is slower and not
> faster, but TBH performance doesn't have top priority for me yet. Would
> be great if somebody else could look at it.
>
> Thanks,
>
> Tomeu
>
>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
>> ---
>> Note that this is based on top of Rob Herring's "per FD address space"
>> patch[1].
>>
>> [1]
>> https://marc.info/?i=20190813150115.30338-1-robh%20()%20kernel%20!%20org
>>
>> Â drivers/gpu/drm/panfrost/panfrost_device.h |Â 4 +-
>> Â drivers/gpu/drm/panfrost/panfrost_job.cÂÂÂ | 76 ++++++++++++++++++----
>> Â drivers/gpu/drm/panfrost/panfrost_mmu.cÂÂÂ |Â 2 +-
>> Â 3 files changed, 67 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h
>> b/drivers/gpu/drm/panfrost/panfrost_device.h
>> index f503c566e99f..0153defd6085 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_device.h
>> +++ b/drivers/gpu/drm/panfrost/panfrost_device.h
>> @@ -55,7 +55,7 @@ struct panfrost_devfreq_slot {
>> ÂÂÂÂÂ ktime_t busy_time;
>> ÂÂÂÂÂ ktime_t idle_time;
>> ÂÂÂÂÂ ktime_t time_last_update;
>> -ÂÂÂ bool busy;
>> +ÂÂÂ int busy;
>> Â };
>> Â Â struct panfrost_device {
>> @@ -80,7 +80,7 @@ struct panfrost_device {
>> Â ÂÂÂÂÂ struct panfrost_job_slot *js;
>> Â -ÂÂÂ struct panfrost_job *jobs[NUM_JOB_SLOTS];
>> +ÂÂÂ struct panfrost_job *jobs[NUM_JOB_SLOTS][2];
>> ÂÂÂÂÂ struct list_head scheduled_jobs;
>> Â ÂÂÂÂÂ struct panfrost_perfcnt *perfcnt;
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c
>> b/drivers/gpu/drm/panfrost/panfrost_job.c
>> index 05c85f45a0de..b2b5027af976 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
>> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
>> @@ -138,6 +138,37 @@ static void panfrost_job_write_affinity(struct
>> panfrost_device *pfdev,
>> ÂÂÂÂÂ job_write(pfdev, JS_AFFINITY_NEXT_HI(js), affinity >> 32);
>> Â }
>> Â +static int panfrost_job_count(struct panfrost_device *pfdev, int slot)
>> +{
>> +ÂÂÂ if (pfdev->jobs[slot][0] == NULL)
>> +ÂÂÂÂÂÂÂ return 0;
>> +ÂÂÂ if (pfdev->jobs[slot][1] == NULL)
>> +ÂÂÂÂÂÂÂ return 1;
>> +ÂÂÂ return 2;
>> +}
>> +
>> +static struct panfrost_job *panfrost_dequeue_job(
>> +ÂÂÂÂÂÂÂ struct panfrost_device *pfdev, int slot)
>> +{
>> +ÂÂÂ struct panfrost_job *job = pfdev->jobs[slot][0];
>> +
>> +ÂÂÂ pfdev->jobs[slot][0] = pfdev->jobs[slot][1];
>> +ÂÂÂ pfdev->jobs[slot][1] = NULL;
>> +
>> +ÂÂÂ return job;
>> +}
>> +
>> +static void panfrost_enqueue_job(struct panfrost_device *pfdev, int
>> slot,
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ struct panfrost_job *job)
>> +{
>> +ÂÂÂ if (pfdev->jobs[slot][0] == NULL) {
>> +ÂÂÂÂÂÂÂ pfdev->jobs[slot][0] = job;
>> +ÂÂÂÂÂÂÂ return;
>> +ÂÂÂ }
>> +ÂÂÂ WARN_ON(pfdev->jobs[slot][1] != NULL);
>> +ÂÂÂ pfdev->jobs[slot][1] = job;
>> +}
>> +
>> Â static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>> Â {
>> ÂÂÂÂÂ struct panfrost_device *pfdev = job->pfdev;
>> @@ -150,13 +181,16 @@ static void panfrost_job_hw_submit(struct
>> panfrost_job *job, int js)
>> ÂÂÂÂÂ if (ret < 0)
>> ÂÂÂÂÂÂÂÂÂ return;
>> Â -ÂÂÂ if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
>> -ÂÂÂÂÂÂÂ goto end;
>> -
>> ÂÂÂÂÂ cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu);
>> Â -ÂÂÂ panfrost_devfreq_record_transition(pfdev, js);
>> ÂÂÂÂÂ spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
>> +ÂÂÂ panfrost_enqueue_job(pfdev, js, job);
>> +
>> +ÂÂÂ if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js))))
>> +ÂÂÂÂÂÂÂ goto end;
>> +
>> +ÂÂÂ if (panfrost_job_count(pfdev, js) == 1)
>> +ÂÂÂÂÂÂÂ panfrost_devfreq_record_transition(pfdev, js);
>> Â ÂÂÂÂÂ job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
>> ÂÂÂÂÂ job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
>> @@ -186,9 +220,9 @@ static void panfrost_job_hw_submit(struct
>> panfrost_job *job, int js)
>> Â ÂÂÂÂÂ job_write(pfdev, JS_COMMAND_NEXT(js), JS_COMMAND_START);
>> Â +end:
>> ÂÂÂÂÂ spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
>> Â -end:
>> ÂÂÂÂÂ pm_runtime_mark_last_busy(pfdev->dev);
>> ÂÂÂÂÂ pm_runtime_put_autosuspend(pfdev->dev);
>> Â }
>> @@ -336,8 +370,6 @@ static struct dma_fence *panfrost_job_run(struct
>> drm_sched_job *sched_job)
>> ÂÂÂÂÂ if (unlikely(job->base.s_fence->finished.error))
>> ÂÂÂÂÂÂÂÂÂ return NULL;
>> Â -ÂÂÂ pfdev->jobs[slot] = job;
>> -
>> ÂÂÂÂÂ fence = panfrost_fence_create(pfdev, slot);
>> ÂÂÂÂÂ if (IS_ERR(fence))
>> ÂÂÂÂÂÂÂÂÂ return NULL;
>> @@ -421,21 +453,36 @@ static irqreturn_t panfrost_job_irq_handler(int
>> irq, void *data)
>> ÂÂÂÂÂ struct panfrost_device *pfdev = data;
>> ÂÂÂÂÂ u32 status = job_read(pfdev, JOB_INT_STAT);
>> ÂÂÂÂÂ int j;
>> +ÂÂÂ unsigned long flags;
>> Â ÂÂÂÂÂ dev_dbg(pfdev->dev, "jobslot irq status=%x\n", status);
>> Â ÂÂÂÂÂ if (!status)
>> ÂÂÂÂÂÂÂÂÂ return IRQ_NONE;
>> Â +ÂÂÂ spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
>> +
>> ÂÂÂÂÂ pm_runtime_mark_last_busy(pfdev->dev);
>> Â ÂÂÂÂÂ for (j = 0; status; j++) {
>> ÂÂÂÂÂÂÂÂÂ u32 mask = MK_JS_MASK(j);
>> +ÂÂÂÂÂÂÂ int jobs = panfrost_job_count(pfdev, j);
>> +ÂÂÂÂÂÂÂ int active;
>> Â ÂÂÂÂÂÂÂÂÂ if (!(status & mask))
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ continue;
>> Â ÂÂÂÂÂÂÂÂÂ job_write(pfdev, JOB_INT_CLEAR, mask);
>> +ÂÂÂÂÂÂÂ active = (job_read(pfdev, JOB_INT_JS_STATE) &
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂ JOB_INT_MASK_DONE(j)) ? 1 : 0;
>> +
>> +ÂÂÂÂÂÂÂ if (!(status & JOB_INT_MASK_ERR(j))) {
>> +ÂÂÂÂÂÂÂÂÂÂÂ /* Recheck RAWSTAT to check if there's a newly
>> +ÂÂÂÂÂÂÂÂÂÂÂÂ * failed job (since JOB_INT_STAT was read)
>> +ÂÂÂÂÂÂÂÂÂÂÂÂ */
>> +ÂÂÂÂÂÂÂÂÂÂÂ status |= job_read(pfdev, JOB_INT_RAWSTAT) &
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ JOB_INT_MASK_ERR(j);
>> +ÂÂÂÂÂÂÂ }
>> Â ÂÂÂÂÂÂÂÂÂ if (status & JOB_INT_MASK_ERR(j)) {
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);
>> @@ -447,20 +494,25 @@ static irqreturn_t panfrost_job_irq_handler(int
>> irq, void *data)
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ job_read(pfdev, JS_TAIL_LO(j)));
>> Â ÂÂÂÂÂÂÂÂÂÂÂÂÂ drm_sched_fault(&pfdev->js->queue[j].sched);
>> +ÂÂÂÂÂÂÂÂÂÂÂ jobs --;
>> ÂÂÂÂÂÂÂÂÂ }
>> Â -ÂÂÂÂÂÂÂ if (status & JOB_INT_MASK_DONE(j)) {
>> -ÂÂÂÂÂÂÂÂÂÂÂ struct panfrost_job *job = pfdev->jobs[j];
>> +ÂÂÂÂÂÂÂ while (jobs -- > active) {
>> +ÂÂÂÂÂÂÂÂÂÂÂ struct panfrost_job *job =
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ panfrost_dequeue_job(pfdev, j);
>> Â -ÂÂÂÂÂÂÂÂÂÂÂ pfdev->jobs[j] = NULL;
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ panfrost_mmu_as_put(pfdev, &job->file_priv->mmu);
>> -ÂÂÂÂÂÂÂÂÂÂÂ panfrost_devfreq_record_transition(pfdev, j);
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ dma_fence_signal(job->done_fence);
>> ÂÂÂÂÂÂÂÂÂ }
>> Â +ÂÂÂÂÂÂÂ if (!active)
>> +ÂÂÂÂÂÂÂÂÂÂÂ panfrost_devfreq_record_transition(pfdev, j);
>> +
>> ÂÂÂÂÂÂÂÂÂ status &= ~mask;
>> ÂÂÂÂÂ }
>> Â +ÂÂÂ spin_unlock_irqrestore(&pfdev->hwaccess_lock, flags);
>> +
>> ÂÂÂÂÂ return IRQ_HANDLED;
>> Â }
>> Â @@ -491,7 +543,7 @@ int panfrost_job_init(struct panfrost_device
>> *pfdev)
>> Â ÂÂÂÂÂÂÂÂÂ ret = drm_sched_init(&js->queue[j].sched,
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ &panfrost_sched_ops,
>> -ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ 1, 0, msecs_to_jiffies(500),
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ 2, 0, msecs_to_jiffies(500),
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ "pan_js");
>> ÂÂÂÂÂÂÂÂÂ if (ret) {
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ dev_err(pfdev->dev, "Failed to create scheduler: %d.",
>> ret);
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c
>> b/drivers/gpu/drm/panfrost/panfrost_mmu.c
>> index f22d8f02568d..c25fd88ef437 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
>> +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
>> @@ -147,7 +147,7 @@ u32 panfrost_mmu_as_get(struct panfrost_device
>> *pfdev, struct panfrost_mmu *mmu)
>> ÂÂÂÂÂ as = mmu->as;
>> ÂÂÂÂÂ if (as >= 0) {
>> ÂÂÂÂÂÂÂÂÂ int en = atomic_inc_return(&mmu->as_count);
>> -ÂÂÂÂÂÂÂ WARN_ON(en >= NUM_JOB_SLOTS);
>> +ÂÂÂÂÂÂÂ WARN_ON(en >= NUM_JOB_SLOTS*2);
>> Â ÂÂÂÂÂÂÂÂÂ list_move(&mmu->list, &pfdev->as_lru_list);
>> ÂÂÂÂÂÂÂÂÂ goto out;
>>
> _______________________________________________
> dri-devel mailing list
> dri-devel@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/dri-devel