[PATCH 6.19 138/844] drm/amdkfd: Handle GPU reset and drain retry fault race
From: Sasha Levin
Date: Sat Feb 28 2026 - 13:01:11 EST
From: Philip Yang <Philip.Yang@xxxxxxx>
[ Upstream commit 5b57c3c3f22336e8fd5edb7f0fef3c7823f8eac1 ]
Only check and drain the IH1 ring if the retry CAM is not enabled.
If the GPU is under reset, do not access the IH rings to drain retry faults.
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx>
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 79ea138897fcf..a10cf8650c92b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -33,6 +33,7 @@
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
+#include "amdgpu_reset.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
@@ -2349,6 +2350,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
+ if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem))
+ continue;
+
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
pdd->dev->adev->irq.retry_cam_enabled ?
&pdd->dev->adev->irq.ih :
@@ -2358,6 +2362,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
&pdd->dev->adev->irq.ih_soft);
+ up_read(&pdd->dev->adev->reset_domain->sem);
pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
}
@@ -2541,7 +2546,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
adev = pdd->dev->adev;
/* Check and drain ih1 ring if cam not available */
- if (adev->irq.ih1.ring_size) {
+ if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) {
ih = &adev->irq.ih1;
checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
if (ih->rptr != checkpoint_wptr) {
--
2.51.0