[PATCH v0] kernel: resource: add devm_request_free_mem_region_from_end()
From: Bert Karwatzki
Date: Thu Mar 27 2025 - 08:03:37 EST
devm_request_free_mem_region() places resources at the end of the
physical address space, DIRECT_MAP_PHYSMEM_END. If the dma mask
of a pci device is smaller than DIRECT_MAP_PHYSMEM_END then this
resource is not dma accessible by the device, which can cause the
device to fall back to using 32bit dma, which can lead to severe
performance impacts.
This adds the devm_request_free_mem_region_from_end() function, which
allows the caller to select the endpoint from which to place the
resource independently of DIRECT_MAP_PHYSMEM_END.
Link: https://lore.kernel.org/all/20250322122351.3268-1-spasswolf@xxxxxx/
Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 ++-
include/linux/ioport.h | 3 +++
kernel/resource.c | 31 ++++++++++++++++++------
3 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index d05d199b5e44..e1942fef3637 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -1042,7 +1042,8 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
pgmap->type = MEMORY_DEVICE_COHERENT;
} else {
- res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+ res = devm_request_free_mem_region_from_end(adev->dev,
+ &iomem_resource, size, dma_get_mask(adev->dev));
if (IS_ERR(res))
return PTR_ERR(res);
pgmap->range.start = res->start;
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5385349f0b8a..a9a765721ab4 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -407,6 +407,9 @@ walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size);
+struct resource *devm_request_free_mem_region_from_end(struct device *dev,
+ struct resource *base, unsigned long size,
+ resource_size_t seek_end);
struct resource *request_free_mem_region(struct resource *base,
unsigned long size, const char *name);
struct resource *alloc_free_mem_region(struct resource *base,
diff --git a/kernel/resource.c b/kernel/resource.c
index 12004452d999..82f40407c02d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1875,12 +1875,14 @@ EXPORT_SYMBOL(resource_list_free);
#endif
static resource_size_t gfr_start(struct resource *base, resource_size_t size,
- resource_size_t align, unsigned long flags)
+ resource_size_t align, resource_size_t seek_end,
+ unsigned long flags)
{
if (flags & GFR_DESCENDING) {
resource_size_t end;
end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
+ end = min_t(resource_size_t, end, seek_end);
return end - size + 1;
}
@@ -1920,8 +1922,8 @@ static void remove_free_mem_region(void *_res)
static struct resource *
get_free_mem_region(struct device *dev, struct resource *base,
resource_size_t size, const unsigned long align,
- const char *name, const unsigned long desc,
- const unsigned long flags)
+ resource_size_t seek_end, const char *name,
+ const unsigned long desc, const unsigned long flags)
{
resource_size_t addr;
struct resource *res;
@@ -1946,7 +1948,7 @@ get_free_mem_region(struct device *dev, struct resource *base,
}
write_lock(&resource_lock);
- for (addr = gfr_start(base, size, align, flags);
+ for (addr = gfr_start(base, size, align, seek_end, flags);
gfr_continue(base, addr, align, flags);
addr = gfr_next(addr, align, flags)) {
if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
@@ -2021,17 +2023,30 @@ struct resource *devm_request_free_mem_region(struct device *dev,
unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
- dev_name(dev),
+ DIRECT_MAP_PHYSMEM_END, dev_name(dev),
IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+struct resource *devm_request_free_mem_region_from_end(struct device *dev,
+ struct resource *base, unsigned long size,
+ resource_size_t seek_end)
+{
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+ seek_end, dev_name(dev),
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region_from_end);
+
struct resource *request_free_mem_region(struct resource *base,
unsigned long size, const char *name)
{
unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
- return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+ return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN,
+ DIRECT_MAP_PHYSMEM_END, name,
IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
}
EXPORT_SYMBOL_GPL(request_free_mem_region);
@@ -2055,8 +2070,8 @@ struct resource *alloc_free_mem_region(struct resource *base,
/* Default of ascending direction and insert resource */
unsigned long flags = 0;
- return get_free_mem_region(NULL, base, size, align, name,
- IORES_DESC_NONE, flags);
+ return get_free_mem_region(NULL, base, size, align, DIRECT_MAP_PHYSMEM_END,
+ name, IORES_DESC_NONE, flags);
}
EXPORT_SYMBOL_GPL(alloc_free_mem_region);
#endif /* CONFIG_GET_FREE_REGION */
--
2.49.0
One of the problems (I'm sure there are more ...) with this patch is that
it uses dma_get_mask(adev->dev) as the endpoint from which to place the
memory. However, dma_get_mask(adev->dev) returns the dma mask of the
discrete GPU, while what is actually needed to avoid the bug is the dma
mask of the built-in GPU. In my case both are equal (44 bits), but I'm
not sure whether they are equal in all cases.
> So this patch does the trick for Bert, and I'm wondering what the best
> fix here would be overall, because it's a tricky situation.
>
> Am I correct in assuming that with enough physical memory this bug
> would trigger, with and without nokaslr?
I think the bug triggers as soon as DIRECT_MAP_PHYSMEM_END is larger
than the dma mask of the integrated GPU.
> I *think* the best approach going forward would be to add the above
> quirk to the x86 memory setup code, but also issue a kernel warning at
> that point with all the relevant information included, so that the
> driver's use_dma32 bug can at least be indicated?
>
> That might also trigger for other systems, because if this scenario is
> so spurious, I doubt it's the only affected driver ...
>
> Thanks,
>
> Ingo
Or one could make the endpoint from which the memory resource will be
placed selectable.
Bert Karwatzki