Re: [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects
From: Thomas Hellström
Date: Wed Aug 17 2022 - 03:35:48 EST
On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
> The i915_gem_object_trylock we had in the grab_vma() makes it return
> false
> when the vma->obj is already locked. In this case we'll skip this vma
> during eviction, and eventually might be forced to return -ENOSPC
> even
> though we could've evicted this vma if we waited for the lock a bit.
>
> To fix this, replace the i915_gem_object_trylock with
> i915_gem_object_lock.
> And because we have to worry about the potential deadlock now,
> bubble-up
> the error code, so it will be correctly handled by the WW mechanism.
>
> This fixes the issue
> https://gitlab.freedesktop.org/drm/intel/-/issues/6564
>
> Fixes: 7e00897be8bf ("drm/i915: Add object locking to
> i915_gem_evict_for_node and i915_gem_evict_something, v2.")
> Signed-off-by: Sviatoslav Peleshko
> <sviatoslav.peleshko@xxxxxxxxxxxxxxx>
> ---
> drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
> --
> 1 file changed, 46 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index f025ee4fa526..9d43f213f68f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
> return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
> }
>
> -static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> +static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> {
> + int ret = 0;
> +
> /*
> * We add the extra refcount so the object doesn't drop to
> zero until
> * after ungrab_vma(), this way trylock is always paired with
> unlock.
> */
> if (i915_gem_object_get_rcu(vma->obj)) {
> - if (!i915_gem_object_trylock(vma->obj, ww)) {
> + ret = i915_gem_object_lock(vma->obj, ww);
Isn't the vm->mutex held here? If so, then this would be a lockdep
violation.
/Thomas
> + if (ret)
> i915_gem_object_put(vma->obj);
> - return false;
> - }
> } else {
> /* Dead objects don't need pins */
> atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
> }
>
> - return true;
> + return ret;
> }
>
> -static void ungrab_vma(struct i915_vma *vma)
> +static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> {
> if (dying_vma(vma))
> return;
>
> - i915_gem_object_unlock(vma->obj);
> + if (!ww)
> + i915_gem_object_unlock(vma->obj);
> +
> i915_gem_object_put(vma->obj);
> }
>
> -static bool
> +static int
> mark_free(struct drm_mm_scan *scan,
> struct i915_gem_ww_ctx *ww,
> struct i915_vma *vma,
> unsigned int flags,
> struct list_head *unwind)
> {
> + int err;
> +
> if (i915_vma_is_pinned(vma))
> - return false;
> + return -ENOSPC;
>
> - if (!grab_vma(vma, ww))
> - return false;
> + err = grab_vma(vma, ww);
> + if (err)
> + return err;
>
> list_add(&vma->evict_link, unwind);
> - return drm_mm_scan_add_block(scan, &vma->node);
> + if (!drm_mm_scan_add_block(scan, &vma->node))
> + return -ENOSPC;
> +
> + return 0;
> }
>
> static bool defer_evict(struct i915_vma *vma)
> @@ -150,6 +159,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> enum drm_mm_insert_mode mode;
> struct i915_vma *active;
> int ret;
> + int err = 0;
>
> lockdep_assert_held(&vm->mutex);
> trace_i915_gem_evict(vm, min_size, alignment, flags);
> @@ -210,17 +220,23 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> continue;
> }
>
> - if (mark_free(&scan, ww, vma, flags, &eviction_list))
> + err = mark_free(&scan, ww, vma, flags,
> &eviction_list);
> + if (!err)
> goto found;
> + if (err == -EDEADLK)
> + break;
> }
>
> /* Nothing found, clean up and bail out! */
> list_for_each_entry_safe(vma, next, &eviction_list,
> evict_link) {
> ret = drm_mm_scan_remove_block(&scan, &vma->node);
> BUG_ON(ret);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> + if (err == -EDEADLK)
> + return err;
> +
> /*
> * Can we unpin some objects such as idle hw contents,
> * or pending flips? But since only the GGTT has global
> entries
> @@ -267,7 +283,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> __i915_vma_pin(vma);
> } else {
> list_del(&vma->evict_link);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
> }
>
> @@ -277,17 +293,21 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
> __i915_vma_unpin(vma);
> if (ret == 0)
> ret = __i915_vma_unbind(vma);
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
> vma = container_of(node, struct i915_vma, node);
>
> /* If we find any non-objects (!vma), we cannot evict
> them */
> - if (vma->node.color != I915_COLOR_UNEVICTABLE &&
> - grab_vma(vma, ww)) {
> - ret = __i915_vma_unbind(vma);
> - ungrab_vma(vma);
> + if (vma->node.color != I915_COLOR_UNEVICTABLE) {
> + ret = grab_vma(vma, ww);
> + if (!ret) {
> + ret = __i915_vma_unbind(vma);
> + ungrab_vma(vma, ww);
> + } else if (ret != -EDEADLK) {
> + ret = -ENOSPC;
> + }
> } else {
> ret = -ENOSPC;
> }
> @@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
> break;
> }
>
> - if (!grab_vma(vma, ww)) {
> - ret = -ENOSPC;
> + ret = grab_vma(vma, ww);
> + if (ret) {
> + if (ret != -EDEADLK)
> + ret = -ENOSPC;
> +
> break;
> }
>
> @@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
> if (ret == 0)
> ret = __i915_vma_unbind(vma);
>
> - ungrab_vma(vma);
> + ungrab_vma(vma, ww);
> }
>
> return ret;