Re: [PATCH v2] fuse: back uncached readdir buffers with pages
From: Miklos Szeredi
Date: Wed Apr 29 2026 - 03:31:06 EST
On Wed, 29 Apr 2026 at 01:30, Matthew R. Ochs <mochs@xxxxxxxxxx> wrote:
> The larger buffer is also currently supplied as a kvec output argument.
> For virtiofs, kvec arguments are copied through req->argbuf, which is
> allocated with kmalloc(..., GFP_ATOMIC). A large readdir buffer can
> therefore require a multi-megabyte contiguous atomic allocation and fail
> with -ENOMEM.
Shouldn't this be max_read? Here "read" and "write" refer to
direction of I/O on the filesystem, not on the fuse device (see
fuse/file.c)
> @@ -343,17 +344,45 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> struct fuse_mount *fm = get_fuse_mount(inode);
> struct fuse_conn *fc = fm->fc;
> struct fuse_io_args ia = {};
> - struct fuse_args *args = &ia.ap.args;
> + struct fuse_args_pages *ap = &ia.ap;
> + struct fuse_args *args = &ap->args;
> + struct page **pages;
> void *buf;
> - size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT);
> + size_t max_bufsize = min_t(size_t, (size_t)fc->max_pages << PAGE_SHIFT,
No need to cast if using the min_t variant.
> + fc->max_write);
> + size_t count = ctx->count > 0 ? ctx->count : PAGE_SIZE;
This is open coding the max_t(size_t, count, PAGE_SIZE) in the next
line. Just delete.
> + size_t bufsize = min_t(size_t, max_t(size_t, count, PAGE_SIZE),
> + max_bufsize);
What's wrong with the clamp() construct used originally?
> + unsigned int nr_pages = DIV_ROUND_UP(bufsize, PAGE_SIZE);
> u64 attr_version = 0, evict_ctr = 0;
> bool locked;
> + unsigned int nr_alloc = 0;
> + unsigned int i;
>
> - buf = kvmalloc(bufsize, GFP_KERNEL);
> - if (!buf)
> + pages = kvcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
struct page **pages __free(kvfree) = kvcalloc(nr_pages,
sizeof(*pages), GFP_KERNEL);
> + if (!pages)
> return -ENOMEM;
>
> - args->out_args[0].value = buf;
> + while (nr_alloc < nr_pages) {
> + unsigned int last = nr_alloc;
> +
> + nr_alloc = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
> + if (nr_alloc == last)
> + goto nomem;
> + }
I'd try this without the loop for less complexity. Falling back to a
shorter read shouldn't be a problem, as long as this doesn't happen
very often.
> +
> + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs);
> + if (!ap->folios)
> + goto nomem;
> +
> + for (i = 0; i < nr_pages; i++) {
> + ap->folios[i] = page_folio(pages[i]);
> + ap->descs[i].length = min_t(size_t,
> + bufsize - (size_t)i * PAGE_SIZE,
> + PAGE_SIZE);
> + }
> + ap->num_folios = nr_pages;
> + args->out_pages = true;
>
> plus = fuse_use_readdirplus(inode, ctx);
> if (plus) {
> @@ -372,17 +401,35 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>
> if (ff->open_flags & FOPEN_CACHE_DIR)
> fuse_readdir_cache_end(file, ctx->pos);
> - } else if (plus) {
> - res = parse_dirplusfile(buf, res, file, ctx, attr_version,
> - evict_ctr);
> } else {
> - res = parse_dirfile(buf, res, file, ctx);
> + buf = vm_map_ram(pages, nr_pages, -1);
> + if (!buf) {
> + res = -ENOMEM;
> + } else {
> + if (plus)
> + res = parse_dirplusfile(buf, res, file, ctx,
> + attr_version,
> + evict_ctr);
> + else
> + res = parse_dirfile(buf, res, file, ctx);
> +
> + vm_unmap_ram(buf, nr_pages);
> + }
> }
> }
>
> - kvfree(buf);
> fuse_invalidate_atime(inode);
> +
> +out:
> + kfree(ap->folios);
> + for (i = 0; i < nr_alloc; i++)
> + __free_page(pages[i]);
release_pages()
> + kvfree(pages);
> return res;
> +
> +nomem:
> + res = -ENOMEM;
> + goto out;
Usual pattern is to just do res = -ENOMEM before each goto out (or
just the first if nothing else modifies res), so no double jump unless
absolutely necessary.
Thanks,
Miklos