[RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
From: sw . prabhu6
Date: Mon Jun 08 2026 - 02:30:15 EST
From: Swarna Prabhu <sw.prabhu6@xxxxxxxxx>
The io_uring SQE buffer registration path pins user pages at 4K
granularity. If the first pinned page belongs to a hugetlb folio and
pages[nr_pages - 1] belongs to the same folio, store a single page
entry, report *npages = 1, and drop nr_pages - 1 of the pin references
that were taken earlier.
io_uring can already identify and coalesce fixed buffers backed by
multiple huge pages via io_check_coalesce_buffer(). However, that
function has to iterate over the entire page array; this patch bypasses
those additional checks for this case. The fast path reduces the overall
registration time for SQE buffers that are backed by huge pages.
Measured with fio on bare metal, with the buffers backed by 1024
boot-allocated 2MB hugetlb pages and the CPU cores' governor set for
maximum performance (hugepages=1024,hugepage_size=2M):
fio --ioengine=io_uring --rw=randwrite --bs=1M --size=2G --iodepth=256
--direct=1 --numjobs=5 --fixedbufs=1 --registerfiles=1 --iomem=mmaphuge
--hugepage-size=2M.
Avg across 3 runs:
Metric                              Upstream(7.1-rc1)  Patched    Delta
Reg time (io_sqe_buffer_register):  3797ns             2970ns     -21.8%
Total reg for workload:             14.35ms            11.34ms    -21.9%
fio write bandwidth:                1416MiB/s          1416MiB/s  No regression
Signed-off-by: Swarna Prabhu <s.prabhu@xxxxxxxxxxx>
---
io_uring/memmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
io_uring/memmap.h | 3 +++
io_uring/rsrc.c | 9 +++++--
3 files changed, 74 insertions(+), 4 deletions(-)
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4f9b439319c4..957e67d2d8e8 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -37,11 +37,11 @@ static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
return true;
}
-struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+ unsigned long *nr_pages_out)
{
unsigned long start, end, nr_pages;
struct page **pages;
- int ret;
if (check_add_overflow(uaddr, len, &end))
return ERR_PTR(-EOVERFLOW);
@@ -60,6 +60,20 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
if (!pages)
return ERR_PTR(-ENOMEM);
+ *nr_pages_out = nr_pages;
+ return pages;
+}
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+ unsigned long nr_pages;
+ struct page **pages;
+ int ret;
+
+ pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+ if (IS_ERR(pages))
+ return pages;
+
ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
pages);
/* success, mapped all pages */
@@ -79,6 +93,54 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
return ERR_PTR(ret);
}
+/*
+ * Pin the user range [uaddr, uaddr + len) for long-term write access and,
+ * when the whole range is one ascending run of pages inside a single hugetlb
+ * folio, collapse the result to a single entry.
+ *
+ * On success a kvfree()-able page array is returned and *npages holds the
+ * number of valid entries. In the collapsed case *npages is 1, pages[0] is
+ * the page backing uaddr (it may be a tail page of the folio) and only one
+ * pin reference on the folio is kept; the other nr_pages - 1 are dropped
+ * here. On failure an ERR_PTR() is returned and no pins are held.
+ */
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages)
+{
+	unsigned long nr_pages, first_idx, last_idx;
+	struct page **pages;
+	struct folio *folio;
+	int ret;
+
+	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return pages;
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				  pages);
+	if (ret != nr_pages) {
+		/* partial map, or didn't map anything */
+		if (ret >= 0) {
+			/* if we did partial map, release any pages we did get */
+			if (ret)
+				unpin_user_pages(pages, ret);
+			ret = -EFAULT;
+		}
+		kvfree(pages);
+		return ERR_PTR(ret);
+	}
+
+	/* success, mapped all pages */
+	*npages = nr_pages;
+	if (nr_pages < 2)
+		return pages;
+
+	folio = page_folio(pages[0]);
+	if (!folio_test_hugetlb(folio) ||
+	    page_folio(pages[nr_pages - 1]) != folio)
+		return pages;
+
+	/*
+	 * Sharing a folio is not enough: the same hugetlb folio can be mapped
+	 * at adjacent virtual addresses, so the first and last entries may
+	 * both be in it while the range wraps from its end back to its start.
+	 * Only collapse when the last page sits exactly nr_pages - 1 pages
+	 * after the first one; otherwise a single entry starting at pages[0]
+	 * would describe memory beyond the end of the folio. A wrapped range
+	 * makes the unsigned subtraction underflow and fails the comparison.
+	 */
+	first_idx = folio_page_idx(folio, pages[0]);
+	last_idx = folio_page_idx(folio, pages[nr_pages - 1]);
+	if (last_idx - first_idx != nr_pages - 1)
+		return pages;
+
+	/*
+	 * Keep one pin for the single remaining entry. The original array is
+	 * returned as-is with only its first slot meaningful, so no second
+	 * allocation (and no allocation-failure fallback) is needed.
+	 */
+	unpin_user_folio(folio, nr_pages - 1);
+	*npages = 1;
+	return pages;
+}
+
enum {
/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
IO_REGION_F_VMAP = 1,
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f4cfbb6b9a1f..cc41af3fae61 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -7,7 +7,10 @@
#define IORING_OFF_ZCRX_SHIFT 16
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+ unsigned long *nr_pages_out);
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6..e117b10bef0b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -771,7 +771,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
struct io_rsrc_node *node;
unsigned long off;
size_t size;
- int ret, nr_pages, i;
+ int ret, nr_pages, i, orig_nr_pages;
struct io_imu_folio_data data;
bool coalesced = false;
@@ -792,7 +792,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
- pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+ orig_nr_pages = ((unsigned long)iov->iov_base + iov->iov_len
+ + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ orig_nr_pages -= (unsigned long)iov->iov_base >> PAGE_SHIFT;
+ pages = io_pin_pages_fast_path((unsigned long) iov->iov_base, iov->iov_len,
&nr_pages);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
@@ -826,6 +829,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
if (coalesced)
imu->folio_shift = data.folio_shift;
+ else if (nr_pages == 1 && orig_nr_pages > 1)
+ imu->folio_shift = folio_shift(page_folio(pages[0]));
refcount_set(&imu->refs, 1);
off = (unsigned long)iov->iov_base & ~PAGE_MASK;
--
2.39.5