[PATCH v6 6/6] drm/amdgpu: Wire up dmem cgroup reclaim for VRAM manager
From: Thomas Hellström
Date: Thu Jun 11 2026 - 13:37:48 EST
Register the VRAM manager with the dmem cgroup reclaim infrastructure
so that lowering dmem.max below current VRAM usage triggers TTM
eviction rather than failing with -EBUSY.
Guard place->flags in amdgpu_ttm_bo_eviction_valuable() against NULL,
as the TTM reclaim path passes a NULL place in cgroup drain mode.
v3:
- Rebased on the fix for the uninitialized list and buddy allocator on
the drmm_cgroup_register_region() error path.
v5:
- Rebased on the introduction of struct dmem_cgroup_init.
- Clear the reclaim callback in amdgpu_vram_mgr_fini() to prevent a
use-after-free if cgroup reclaim is triggered after driver unbind
while userspace holds an open DRM file descriptor. (Sashiko-bot)
- Switch from drmm_cgroup_register_region() to the raw
dmem_cgroup_register_region() and store the region in
amdgpu_vram_mgr.cg_region. Call dmem_cgroup_unregister_region()
in amdgpu_vram_mgr_fini() after ttm_resource_manager_evict_all()
to drain in-flight reclaim callbacks, and clear man->cg afterwards.
This is required because amdgpu's vram manager fini is called
explicitly during driver unbind, which may precede the DRM device
release and thus precede any drmm-based cleanup. (Sashiko-bot)
v6:
- Fix mgr->cg_region never being assigned, which caused
dmem_cgroup_unregister_region() in fini to silently no-op on NULL
and leak the region. (Sashiko-bot)
- Reorder fini to call set_used(false) and evict_all() before
dmem_cgroup_unregister_region(), so ttm_resource_free() can
uncharge via man->cg during eviction; clear man->cg after
unregister. (Sashiko-bot)
Assisted-by: GitHub_Copilot:claude-sonnet-4.6
Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 31 ++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 2 ++
3 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 2740de94e93c..8cbcd33f51a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1488,7 +1488,7 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
if (amdkfd_fence_check_mm(f, current->mm) &&
- !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
+ !(place && (place->flags & TTM_PL_FLAG_CONTIGUOUS)))
return false;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 08f05c3aed1d..2250bab0970d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -906,6 +906,10 @@ static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
.debug = amdgpu_vram_mgr_debug
};
+static const struct dmem_cgroup_ops amdgpu_vram_mgr_dmem_ops = {
+ .reclaim = ttm_resource_manager_dmem_reclaim,
+};
+
/**
* amdgpu_vram_mgr_init - init VRAM manager and DRM MM
*
@@ -917,6 +921,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
{
struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
struct ttm_resource_manager *man = &mgr->manager;
+ struct dmem_cgroup_region *cg;
int err;
ttm_resource_manager_init(man, &adev->mman.bdev,
@@ -933,12 +938,16 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
if (err)
return err;
- man->cg = drmm_cgroup_register_region(adev_to_drm(adev), "vram",
- &(struct dmem_cgroup_init){
- .size = adev->gmc.real_vram_size,
- });
- if (IS_ERR(man->cg))
- return PTR_ERR(man->cg);
+ cg = dmem_cgroup_register_region(&(struct dmem_cgroup_init){
+ .size = adev->gmc.real_vram_size,
+ .ops = &amdgpu_vram_mgr_dmem_ops,
+ .reclaim_priv = man,
+ }, "vram");
+ if (IS_ERR(cg))
+ return PTR_ERR(cg);
+
+ mgr->cg_region = cg;
+ ttm_resource_manager_set_dmem_region(man, cg);
ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_VRAM, &mgr->manager);
ttm_resource_manager_set_used(man, true);
@@ -966,6 +975,16 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
+ /*
+ * Drain any in-flight dmem cgroup reclaim callbacks and remove the
+ * region from the global list. This must happen after evict_all()
+ * so that ttm_resource_free() can still uncharge via man->cg while
+ * BOs are being evicted.
+ */
+ dmem_cgroup_unregister_region(mgr->cg_region);
+ mgr->cg_region = NULL;
+ man->cg = NULL;
+
mutex_lock(&mgr->lock);
list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks)
kfree(rsv);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
index 429a21a2e9b2..07103cddb335 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h
@@ -36,6 +36,8 @@ struct amdgpu_vram_mgr {
atomic64_t vis_usage;
u64 default_page_size;
struct list_head allocated_vres_list;
+ /** @cg_region: dmem cgroup region for VRAM; unregistered in fini. */
+ struct dmem_cgroup_region *cg_region;
};
struct amdgpu_vres_task {
--
2.54.0