[RFC PATCH] drm/sched: Fix teardown leaks with refcounting

From: Philipp Stanner
Date: Tue Sep 03 2024 - 05:46:39 EST


The GPU scheduler currently does not ensure that its pending_list is
empty before performing various other teardown tasks in
drm_sched_fini().

If there are still jobs in the pending_list, this is problematic because
after scheduler teardown, no one will call backend_ops.free_job()
anymore. This would, consequently, result in memory leaks.

One way to solves this is to implement reference counting for struct
drm_gpu_scheduler itself. Each job added to the pending_list takes a
reference, each one removed drops a reference.

This approach would keep the scheduler running even after users have
called drm_sched_fini(), and it would ultimately stop after the last job
has been removed from pending_list.

Add reference counting to struct drm_gpu_scheduler. Move the teardown
logic to __drm_sched_fini() and have drm_sched_fini() just also drop a
reference.

Suggested-by: Danilo Krummrich <dakr@xxxxxxxxxx>
Signed-off-by: Philipp Stanner <pstanner@xxxxxxxxxx>
---
Hi all,

since the scheduler has many stake holders, I want this solution
discussed as an RFC first.

The advantage of this version would be that it does not block
drm_sched_fini(), but the price paid for that is that the scheduler
keeps running until all jobs are gone.

Cheers,
P.
---
drivers/gpu/drm/scheduler/sched_main.c | 17 ++++++++++++++++-
include/drm/gpu_scheduler.h | 5 +++++
2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 7e90c9f95611..62b453c8ed76 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -99,6 +99,8 @@ int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
module_param_named(sched_policy, drm_sched_policy, int, 0444);

+static void __drm_sched_fini(struct kref *jobs_remaining);
+
static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
{
u32 credits;
@@ -540,6 +542,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)

spin_lock(&sched->job_list_lock);
list_add_tail(&s_job->list, &sched->pending_list);
+ kref_get(&sched->jobs_remaining);
drm_sched_start_timeout(sched);
spin_unlock(&sched->job_list_lock);
}
@@ -564,6 +567,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
* is parked at which point it's safe.
*/
list_del_init(&job->list);
+ kref_put(&sched->jobs_remaining, __drm_sched_fini);
spin_unlock(&sched->job_list_lock);

status = job->sched->ops->timedout_job(job);
@@ -637,6 +641,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
*/
spin_lock(&sched->job_list_lock);
list_del_init(&s_job->list);
+ kref_put(&sched->jobs_remaining, __drm_sched_fini);
spin_unlock(&sched->job_list_lock);

/*
@@ -1084,6 +1089,7 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
/* remove job from pending_list */
list_del_init(&job->list);
+ kref_put(&sched->jobs_remaining, __drm_sched_fini);

/* cancel this job's TO timer */
cancel_delayed_work(&sched->work_tdr);
@@ -1303,6 +1309,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
init_waitqueue_head(&sched->job_scheduled);
INIT_LIST_HEAD(&sched->pending_list);
spin_lock_init(&sched->job_list_lock);
+ kref_init(&sched->jobs_remaining);
atomic_set(&sched->credit_count, 0);
INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
@@ -1334,11 +1341,14 @@ EXPORT_SYMBOL(drm_sched_init);
*
* Tears down and cleans up the scheduler.
*/
-void drm_sched_fini(struct drm_gpu_scheduler *sched)
+static void __drm_sched_fini(struct kref *jobs_remaining)
{
+ struct drm_gpu_scheduler *sched;
struct drm_sched_entity *s_entity;
int i;

+ sched = container_of(jobs_remaining, struct drm_gpu_scheduler, jobs_remaining);
+
drm_sched_wqueue_stop(sched);

for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
@@ -1368,6 +1378,11 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
kfree(sched->sched_rq);
sched->sched_rq = NULL;
}
+
+void drm_sched_fini(struct drm_gpu_scheduler *sched)
+{
+ kref_put(&sched->jobs_remaining, __drm_sched_fini);
+}
EXPORT_SYMBOL(drm_sched_fini);

/**
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..b7fad8294861 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -29,6 +29,7 @@
#include <linux/completion.h>
#include <linux/xarray.h>
#include <linux/workqueue.h>
+#include <linux/kref.h>

#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)

@@ -495,6 +496,9 @@ struct drm_sched_backend_ops {
* waits on this wait queue until all the scheduled jobs are
* finished.
* @job_id_count: used to assign unique id to the each job.
+ * @jobs_remaining: submitted jobs take a refcount of the scheduler to prevent it
+ * from terminating before the last job has been removed from
+ * @pending_list.
* @submit_wq: workqueue used to queue @work_run_job and @work_free_job
* @timeout_wq: workqueue used to queue @work_tdr
* @work_run_job: work which calls run_job op of each scheduler.
@@ -525,6 +529,7 @@ struct drm_gpu_scheduler {
struct drm_sched_rq **sched_rq;
wait_queue_head_t job_scheduled;
atomic64_t job_id_count;
+ struct kref jobs_remaining;
struct workqueue_struct *submit_wq;
struct workqueue_struct *timeout_wq;
struct work_struct work_run_job;
--
2.46.0