Re: [PATCH v3] drm/sched: Fix kernel NULL pointer dereference error

From: Christian König
Date: Tue Oct 18 2022 - 08:35:12 EST



Am 18.10.22 um 14:20 schrieb Yadav, Arvind:
[SNIP]

+    drm_sched_fence_finished(s_fence);
+    dma_fence_put(&s_fence->finished);
+    wake_up_interruptible(&sched->wake_up_worker);
+}
+
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
+                  struct drm_sched_fence *s_fence)
+{
+    return dma_fence_add_callback(fence, &s_fence->cb,
+                      drm_sched_job_done_cb);
+}
+
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence)
+{
+    return dma_fence_remove_callback(s_fence->parent,
+                     &s_fence->cb);
+}

Do we really need separate functions for that?

We can use  'drm_sched_fence_set_parent' but we need to add extra NULL check before

adding in the callback otherwise add-callback will throw the warning message every time.

If I add NULL check then will never get any callback warning message for setting NULL parent fence.

So I have kept separate functions.

I rather prefer having a single function and allowing the parent fence to be set to NULL.

Alternatively we could have a drm_sched_fence_set_parent() and drm_sched_fence_clear_parent() function if you really think it's cleaner that way.

atomic_dec(&sched->hw_rq_count);
@@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
              continue;
            if (fence) {
-            r = dma_fence_add_callback(fence, &s_job->cb,
-                           drm_sched_job_done_cb);
+            r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence);
              if (r == -ENOENT)
-                drm_sched_job_done(s_job);
+                drm_sched_job_done(s_job->s_fence);
              else if (r)
                  DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",

Completely nuke that here. All of this should be done in the single drm_sched_fence_set_parent() function.

And an error message is completely superfluous. We just need to handle the case that the callback can't be installed because the fence is already signaled.

I will do the changes as per your review comments, Thank you for the review.

Just to clarify, you should nuke the error message. Error handling is rather pointless here.

Thanks,
Christian.


Thanks,

~Arvind

Regards,
Christian.

                        r);
          } else
-            drm_sched_job_done(s_job);
+            drm_sched_job_done(s_job->s_fence);
      }
        if (full_recovery) {
@@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param)
          drm_sched_fence_scheduled(s_fence);
            if (!IS_ERR_OR_NULL(fence)) {
-            s_fence->parent = dma_fence_get(fence);
-            /* Drop for original kref_init of the fence */
-            dma_fence_put(fence);
-
-            r = dma_fence_add_callback(fence, &sched_job->cb,
-                           drm_sched_job_done_cb);
+            r = drm_sched_fence_set_parent(fence, s_fence);
              if (r == -ENOENT)
-                drm_sched_job_done(sched_job);
+                drm_sched_job_done(s_fence);
              else if (r)
                  DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
                        r);
@@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param)
              if (IS_ERR(fence))
dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
  -            drm_sched_job_done(sched_job);
+            drm_sched_job_done(s_fence);
          }
            wake_up(&sched->job_scheduled);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1f7d9dd1a444..7258e2fa195f 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -281,6 +281,10 @@ struct drm_sched_fence {
           * @owner: job owner for debugging
           */
      void                *owner;
+    /**
+     * @cb: callback
+     */
+    struct dma_fence_cb cb;
  };
    struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
@@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
   *         be scheduled further.
   * @s_priority: the priority of the job.
   * @entity: the entity to which this job belongs.
- * @cb: the callback for the parent fence in s_fence.
   *
   * A job is created by the driver using drm_sched_job_init(), and
   * should call drm_sched_entity_push_job() once it wants the scheduler
@@ -325,7 +328,6 @@ struct drm_sched_job {
      atomic_t            karma;
      enum drm_sched_priority        s_priority;
      struct drm_sched_entity         *entity;
-    struct dma_fence_cb        cb;
      /**
       * @dependencies:
       *
@@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
  void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
  void drm_sched_fence_finished(struct drm_sched_fence *fence);
  +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
+                  struct drm_sched_fence *s_fence);
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence);
+int drm_sched_fence_set_parent(struct dma_fence *fence,
+                   struct drm_sched_fence *s_fence);
+
  unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
  void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
                          unsigned long remaining);