[PATCH v4 2/2] drm/amdgpu: Create an option to disable soft recovery

From: André Almeida
Date: Mon Sep 11 2023 - 17:37:56 EST


Create a module option to disable soft recoveries on amdgpu, making
every recovery go through the device reset path. This option makes
easier to force device resets for testing and debugging purposes.

Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx>
Reviewed-by: Christian König <christian.koenig@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 7 +++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 7 ++++++-
3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 37eb9b3790a0..f30490abb3fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1103,6 +1103,7 @@ struct amdgpu_device {
/* Debug */
bool debug_vm;
bool debug_largebar;
+ bool debug_disable_soft_recovery;
};

static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 830146bd61c0..3ab7eac131e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -124,6 +124,7 @@
enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_VM = BIT(0),
AMDGPU_DEBUG_LARGEBAR = BIT(1),
+ AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
};

unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -945,6 +946,7 @@ MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics
* - 0x2: Enable simulating large-bar capability on non-large bar system. This
* limits the VRAM size reported to ROCm applications to the visible
* size, usually 256MB.
+ * - 0x4: Disable GPU soft recovery, always do a full reset
*/
MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
@@ -2064,6 +2066,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: enabled simulating large-bar capability on non-large bar system\n");
adev->debug_largebar = true;
}
+
+ if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
+ pr_info("debug: soft reset for GPU recovery disabled\n");
+ adev->debug_disable_soft_recovery = true;
+ }
}

static int amdgpu_pci_probe(struct pci_dev *pdev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index da26c555af24..231d49132a56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -433,7 +433,12 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
struct dma_fence *fence)
{
- ktime_t deadline = ktime_add_us(ktime_get(), 10000);
+ ktime_t deadline;
+
+ if (unlikely(ring->adev->debug_disable_soft_recovery))
+ return false;
+
+ deadline = ktime_add_us(ktime_get(), 10000);

if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
return false;
--
2.42.0