[PATCH] drm/panfrost: Handle resetting on timeout better

From: Steven Price
Date: Fri Sep 27 2019 - 06:42:40 EST


Panfrost uses multiple schedulers (one for each slot, so 2 in reality),
and on a timeout has to stop all the schedulers to safely perform a
reset. However more than one scheduler can trigger a timeout at the same
time. This race condition results in jobs being freed while they are
still in use.

Modify drm_sched_stop() to call cancel_delayed_work_sync() when stopping
a different scheduler to the one belonging to the passed in job.
panfrost_job_timedout() is also modified to only allow one thread at a
time to handle the reset. Any subsequent threads simply return assuming
that the first thread will handle the situation.

Signed-off-by: Steven Price <steven.price@xxxxxxx>
---
drivers/gpu/drm/panfrost/panfrost_device.h | 2 ++
drivers/gpu/drm/panfrost/panfrost_job.c | 11 ++++++++++-
drivers/gpu/drm/scheduler/sched_main.c | 5 ++++-
3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index f503c566e99f..6441c7fba6c4 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -99,6 +99,8 @@ struct panfrost_device {
unsigned long cur_volt;
struct panfrost_devfreq_slot slot[NUM_JOB_SLOTS];
} devfreq;
+
+ bool is_resetting;
};

struct panfrost_mmu {
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 05c85f45a0de..1b2019e08b43 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -388,13 +388,21 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)

mutex_lock(&pfdev->reset_lock);

+ if (pfdev->is_resetting) {
+ mutex_unlock(&pfdev->reset_lock);
+ return;
+ }
+ pfdev->is_resetting = true;
+
+ mutex_unlock(&pfdev->reset_lock);
+
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);

if (sched_job)
drm_sched_increase_karma(sched_job);

- /* panfrost_core_dump(pfdev); */
+ mutex_lock(&pfdev->reset_lock);

panfrost_devfreq_record_transition(pfdev, js);
panfrost_device_reset(pfdev);
@@ -406,6 +414,7 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);

+ pfdev->is_resetting = false;
mutex_unlock(&pfdev->reset_lock);
}

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 148468447ba9..bc6d1862ec8a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -415,7 +415,10 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
* this TDR finished and before the newly restarted jobs had a
* chance to complete.
*/
- cancel_delayed_work(&sched->work_tdr);
+ if (bad->sched != sched)
+ cancel_delayed_work_sync(&sched->work_tdr);
+ else
+ cancel_delayed_work(&sched->work_tdr);
}

EXPORT_SYMBOL(drm_sched_stop);
--
2.20.1