[Some people who received this message don't often get email from vivek.kasireddy@xxxxxxxxx. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
Hi Huan,
Currently, udmabuf handles folio by creating an unpin list to record
each folio obtained from the list and unpinning them when released. To
maintain this approach, many data structures have been established.
However, maintaining this type of data structure requires a significant
amount of memory and traversing the list is a substantial overhead,
which is not friendly to the CPU cache, TLB, and so on.

Have you tried to quantify this overhead?

Agree.

Using pages is a step backwards, given the trend towards embracing folios.
Therefore, this patch removes the relationship between the folio and its
offset in the linear address mapping.
As an alternative, udmabuf both maintain the folio array and page array,
folio array use to unpin, and the page array is used as before to handle
the requirements for the page.
Moreover, the feedback from the former hugetlb maintainer (Mike Kravetz)
was to not use subpages (or tail pages) of a hugetlb folio directly in udmabuf
driver as it would cause problems, particularly when hugetlb vmemmap
optimization (HVO) is enabled. AFAIU, if HVO is enabled by default, a tail page's
struct page pointer may not be available (as it may very well be freed to
save memory). Given all of this, it made sense to convert the udmabuf driver
to only use the head pages of a folio along with the offsets of tail pages.

I haven't considered that HVO would have an impact on this.
So, udmabuf's folios only save the folio struct, foliocount point
the size of array. pages save page in folios, number offset given by
create list, pagecount point the size of array.
Even if we restore the pages structure, its memory usage should be
smaller than the combined memory usage of offsets(8 bytes in 64bit
machine)
and udmabuf_folio structures(24 bytes in 64bit machine).
By doing this, we can accept the overhead of the udmabuf_folio structure
and the performance loss of traversing the list during unpinning.

Does your use-case involve frequent pinning/unpinning operations? Note
that this would be considered "shortterm" pin, which is different from
the way the folios are currently pinned in udmabuf driver, which is
considered "longterm" pin.

Could you please describe it in detail? I didn't understand.
However, one optimization I can think of, for memfds backed by shmem, is
to not use unpin_list completely. This way you can probably avoid creating
udmabuf_folio objects and having to traverse the list. But this would require
differentiating udmabufs backed by shmem vs hugetlb folios, which is not
great in my opinion and may not work if THP is enabled.
Thanks,
Vivek
Signed-off-by: Huan Yang <link@xxxxxxxx>
---
drivers/dma-buf/udmabuf.c | 167 ++++++++++++++------------------------
1 file changed, 61 insertions(+), 106 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
index 9737f063b6b3..442ed99d8b33 100644
--- a/drivers/dma-buf/udmabuf.c
+++ b/drivers/dma-buf/udmabuf.c
@@ -25,17 +25,24 @@ module_param(size_limit_mb, int, 0644);
MODULE_PARM_DESC(size_limit_mb, "Max size of a dmabuf, in megabytes.
Default is 64.");
struct udmabuf {
+ /**
+ * Each page used by udmabuf in the folio. When obtaining a page
from a
+ * folio, it does not necessarily begin from the head page. This is
+ * determined by the offset of the memfd when udmabuf created.
+ */
pgoff_t pagecount;
+ struct page **pages;
+
+ /**
+ * Each folio in memfd, when a udmabuf is created, it is pinned to
+ * ensure that the folio is not moved or reclaimed.
+ * folio array used to unpin all when releasing.
+ */
+ pgoff_t foliocount;
struct folio **folios;
+
struct sg_table *sg;
struct miscdevice *device;
- pgoff_t *offsets;
- struct list_head unpin_list;
-};
-
-struct udmabuf_folio {
- struct folio *folio;
- struct list_head list;
};
static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct
*vma)
@@ -51,9 +58,7 @@ static int mmap_udmabuf(struct dma_buf *buf, struct
vm_area_struct *vma)
for (pgoff = vma->vm_pgoff, end = vma->vm_end, addr = vma-
vm_start;addr < end; pgoff++, addr += PAGE_SIZE) {
- struct page *page =
- folio_page(ubuf->folios[pgoff],
- ubuf->offsets[pgoff] >> PAGE_SHIFT);
+ struct page *page = ubuf->pages[pgoff];
ret = remap_pfn_range(vma, addr, page_to_pfn(page),
PAGE_SIZE,
vma->vm_page_prot);
@@ -67,22 +72,11 @@ static int mmap_udmabuf(struct dma_buf *buf,
struct vm_area_struct *vma)
static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map)
{
struct udmabuf *ubuf = buf->priv;
- struct page **pages;
void *vaddr;
- pgoff_t pg;
dma_resv_assert_held(buf->resv);
- pages = kvmalloc_array(ubuf->pagecount, sizeof(*pages),
GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
-
- for (pg = 0; pg < ubuf->pagecount; pg++)
- pages[pg] = folio_page(ubuf->folios[pg],
- ubuf->offsets[pg] >> PAGE_SHIFT);
-
- vaddr = vm_map_ram(pages, ubuf->pagecount, -1);
- kvfree(pages);
+ vaddr = vm_map_ram(ubuf->pages, ubuf->pagecount, -1);
if (!vaddr)
return -EINVAL;
@@ -104,30 +98,25 @@ static struct sg_table *get_sg_table(struct device
*dev, struct dma_buf *buf,
{
struct udmabuf *ubuf = buf->priv;
struct sg_table *sg;
- struct scatterlist *sgl;
- unsigned int i = 0;
int ret;
sg = kzalloc(sizeof(*sg), GFP_KERNEL);
if (!sg)
return ERR_PTR(-ENOMEM);
- ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
+ ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
+ 0, ubuf->pagecount << PAGE_SHIFT,
+ GFP_KERNEL);
if (ret < 0)
- goto err_alloc;
-
- for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)
- sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
- ubuf->offsets[i]);
+ goto err;
ret = dma_map_sgtable(dev, sg, direction, 0);
if (ret < 0)
- goto err_map;
+ goto err;
return sg;
-err_map:
+err:
sg_free_table(sg);
-err_alloc:
kfree(sg);
return ERR_PTR(ret);
}
@@ -153,34 +142,6 @@ static void unmap_udmabuf(struct
dma_buf_attachment *at,
return put_sg_table(at->dev, sg, direction);
}
-static void unpin_all_folios(struct list_head *unpin_list)
-{
- struct udmabuf_folio *ubuf_folio;
-
- while (!list_empty(unpin_list)) {
- ubuf_folio = list_first_entry(unpin_list,
- struct udmabuf_folio, list);
- unpin_folio(ubuf_folio->folio);
-
- list_del(&ubuf_folio->list);
- kfree(ubuf_folio);
- }
-}
-
-static int add_to_unpin_list(struct list_head *unpin_list,
- struct folio *folio)
-{
- struct udmabuf_folio *ubuf_folio;
-
- ubuf_folio = kzalloc(sizeof(*ubuf_folio), GFP_KERNEL);
- if (!ubuf_folio)
- return -ENOMEM;
-
- ubuf_folio->folio = folio;
- list_add_tail(&ubuf_folio->list, unpin_list);
- return 0;
-}
-
static void release_udmabuf(struct dma_buf *buf)
{
struct udmabuf *ubuf = buf->priv;
@@ -189,9 +150,9 @@ static void release_udmabuf(struct dma_buf *buf)
if (ubuf->sg)
put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
- unpin_all_folios(&ubuf->unpin_list);
- kvfree(ubuf->offsets);
+ unpin_folios(ubuf->folios, ubuf->foliocount);
kvfree(ubuf->folios);
+ kvfree(ubuf->pages);
kfree(ubuf);
}
@@ -289,19 +250,18 @@ static long udmabuf_create(struct miscdevice
*device,
struct udmabuf_create_list *head,
struct udmabuf_create_item *list)
{
- pgoff_t pgoff, pgcnt, pglimit, pgbuf = 0;
- long nr_folios, ret = -EINVAL;
+ pgoff_t pgoff, pgcnt, pglimit, nr_pages;
+ long nr_folios = 0, ret = -EINVAL;
struct file *memfd = NULL;
struct folio **folios;
struct udmabuf *ubuf;
- u32 i, j, k, flags;
+ u32 i, flags;
loff_t end;
ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
if (!ubuf)
return -ENOMEM;
- INIT_LIST_HEAD(&ubuf->unpin_list);
pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
for (i = 0; i < head->count; i++) {
if (!IS_ALIGNED(list[i].offset, PAGE_SIZE))
@@ -322,64 +282,58 @@ static long udmabuf_create(struct miscdevice
*device,
ret = -ENOMEM;
goto err;
}
- ubuf->offsets =
- kvcalloc(ubuf->pagecount, sizeof(*ubuf->offsets),
GFP_KERNEL);
- if (!ubuf->offsets) {
+ folios = ubuf->folios;
+
+ ubuf->pages = kvmalloc_array(ubuf->pagecount, sizeof(*ubuf-
pages),+ GFP_KERNEL);
+ if (!ubuf->pages) {
ret = -ENOMEM;
goto err;
}
- pgbuf = 0;
- for (i = 0; i < head->count; i++) {
+ for (i = 0, nr_pages = 0; i < head->count; i++) {
+ u32 j, pg;
+
memfd = fget(list[i].memfd);
ret = check_memfd_seals(memfd);
if (ret < 0)
goto err;
pgcnt = list[i].size >> PAGE_SHIFT;
- folios = kvmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL);
- if (!folios) {
- ret = -ENOMEM;
- goto err;
- }
end = list[i].offset + (pgcnt << PAGE_SHIFT) - 1;
- ret = memfd_pin_folios(memfd, list[i].offset, end,
- folios, pgcnt, &pgoff);
+ ret = memfd_pin_folios(memfd, list[i].offset, end, folios,
+ pgcnt, &pgoff);
if (ret <= 0) {
- kvfree(folios);
- if (!ret)
- ret = -EINVAL;
+ ret = ret ?: -EINVAL;
goto err;
}
- nr_folios = ret;
- pgoff >>= PAGE_SHIFT;
- for (j = 0, k = 0; j < pgcnt; j++) {
- ubuf->folios[pgbuf] = folios[k];
- ubuf->offsets[pgbuf] = pgoff << PAGE_SHIFT;
-
- if (j == 0 || ubuf->folios[pgbuf-1] != folios[k]) {
- ret = add_to_unpin_list(&ubuf->unpin_list,
- folios[k]);
- if (ret < 0) {
- kfree(folios);
- goto err;
- }
- }
-
- pgbuf++;
- if (++pgoff == folio_nr_pages(folios[k])) {
- pgoff = 0;
- if (++k == nr_folios)
- break;
+ /**
+ * Iter the pinned folios and record them for later unpin
+ * when releasing.
+ * memfd may start from any offset, so we need check it
+ * carefully at first.
+ */
+ for (j = 0, pgoff >>= PAGE_SHIFT, pg = 0; j < ret;
+ ++j, pgoff = 0) {
+ pgoff_t k;
+ struct folio *folio = folios[j];
+
+ for (k = pgoff; k < folio_nr_pages(folio); ++k) {
+ ubuf->pages[nr_pages++] = folio_page(folio,
k);
+
+ if (++pg >= pgcnt)
+ goto end;
}
}
-
- kvfree(folios);
+end:
+ folios += ret;
+ nr_folios += ret;
fput(memfd);
memfd = NULL;
}
+ ubuf->foliocount = nr_folios;
flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0;
ret = export_udmabuf(ubuf, device, flags);
@@ -391,8 +345,9 @@ static long udmabuf_create(struct miscdevice
*device,
err:
if (memfd)
fput(memfd);
- unpin_all_folios(&ubuf->unpin_list);
- kvfree(ubuf->offsets);
+ if (nr_folios)
+ unpin_folios(ubuf->folios, nr_folios);
+ kvfree(ubuf->pages);
kvfree(ubuf->folios);
kfree(ubuf);
return ret;
--
2.45.2