Re: [PATCH v4 1/2] virtiofs: use pages instead of pointer for kernel direct IO

From: Hou Tao
Date: Tue Sep 03 2024 - 23:50:48 EST


Hi,

On 9/3/2024 4:44 PM, Jingbo Xu wrote:
>
> On 8/31/24 5:37 PM, Hou Tao wrote:
>> From: Hou Tao <houtao1@xxxxxxxxxx>
>>
>> When trying to insert a 10MB kernel module kept in a virtio-fs with cache
>> disabled, the following warning was reported:
>>

SNIP
>>
>> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem")
>> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> Tested-by: Jingbo Xu <jefflexu@xxxxxxxxxxxxxxxxx>

Thanks for the test.
>
>
>> ---
>> fs/fuse/file.c | 62 +++++++++++++++++++++++++++++++--------------
>> fs/fuse/fuse_i.h | 6 +++++
>> fs/fuse/virtio_fs.c | 1 +
>> 3 files changed, 50 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index f39456c65ed7..331208d3e4d1 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
>> args->out_args[0].size = count;
>> }
>>
>> -

SNIP
>> static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
>> size_t *nbytesp, int write,
>> - unsigned int max_pages)
>> + unsigned int max_pages,
>> + bool use_pages_for_kvec_io)
>> {
>> + bool flush_or_invalidate = false;
>> size_t nbytes = 0; /* # bytes already packed in req */
>> ssize_t ret = 0;
>>
>> - /* Special case for kernel I/O: can copy directly into the buffer */
>> + /* Special case for kernel I/O: can copy directly into the buffer.
>> + * However if the implementation of fuse_conn requires pages instead of
>> + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
>> + */
>> if (iov_iter_is_kvec(ii)) {
>> - unsigned long user_addr = fuse_get_user_addr(ii);
>> - size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>> + void *user_addr = (void *)fuse_get_user_addr(ii);
>>
>> - if (write)
>> - ap->args.in_args[1].value = (void *) user_addr;
>> - else
>> - ap->args.out_args[0].value = (void *) user_addr;
>> + if (!use_pages_for_kvec_io) {
>> + size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>>
>> - iov_iter_advance(ii, frag_size);
>> - *nbytesp = frag_size;
>> - return 0;
>> + if (write)
>> + ap->args.in_args[1].value = user_addr;
>> + else
>> + ap->args.out_args[0].value = user_addr;
>> +
>> + iov_iter_advance(ii, frag_size);
>> + *nbytesp = frag_size;
>> + return 0;
>> + }
>> +
>> + if (is_vmalloc_addr(user_addr)) {
>> + ap->args.vmap_base = user_addr;
>> + flush_or_invalidate = true;
> Could we move flush_kernel_vmap_range() upon here, so that
> flush_or_invalidate is not needed anymore and the code looks cleaner?

flush_kernel_vmap_range() needs to know the length of the flushed area,
if moving it here(), the length will be unknown.
>
>> + }
>> }
>>
>> while (nbytes < *nbytesp && ap->num_pages < max_pages) {
>> @@ -1513,6 +1533,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
>> (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
>> }
>>
>> + if (write && flush_or_invalidate)
>> + flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
>> +
>> + ap->args.invalidate_vmap = !write && flush_or_invalidate;
> How about initializing vmap_base only when the data buffer is vmalloced
> and it's a read request? In this case invalidate_vmap is no longer needed.

You mean using the value of vmap_base to indicate whether invalidation
is needed or not, right ? I prefer to keep it, because the extra
variable invalidate_vmap indicates the required action for the vmap area
and it doesn't increase the size of fuse_args.

>
>> ap->args.is_pinned = iov_iter_extract_will_pin(ii);
>> ap->args.user_pages = true;
>> if (write)
>> @@ -1581,7 +1605,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
>> size_t nbytes = min(count, nmax);
>>
>> err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
>> - max_pages);
>> + max_pages, fc->use_pages_for_kvec_io);
>> if (err && !nbytes)
>> break;
>>
>> @@ -1595,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
>> }
>>
>> if (!io->async || nres < 0) {
>> - fuse_release_user_pages(&ia->ap, io->should_dirty);
>> + fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
>> fuse_io_free(ia);
>> }
>> ia = NULL;
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index f23919610313..79add14c363f 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -309,9 +309,12 @@ struct fuse_args {
>> bool may_block:1;
>> bool is_ext:1;
>> bool is_pinned:1;
>> + bool invalidate_vmap:1;
>> struct fuse_in_arg in_args[3];
>> struct fuse_arg out_args[2];
>> void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
>> + /* Used for kvec iter backed by vmalloc address */
>> + void *vmap_base;
>> };
>>
>> struct fuse_args_pages {
>> @@ -860,6 +863,9 @@ struct fuse_conn {
>> /** Passthrough support for read/write IO */
>> unsigned int passthrough:1;
>>
>> + /* Use pages instead of pointer for kernel I/O */
>> + unsigned int use_pages_for_kvec_io:1;
> Maybe we need a better (actually shorter) name for this flag. kvec_pages?

Naming is hard. The name "use_pages_for_kvec_io" is verbose indeed.
kvec_pages is better. Will update it in the next spin.
>
>> +
>> /** Maximum stack depth for passthrough backing files */
>> int max_stack_depth;
>>
>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>> index dd5260141615..43d66ab5e891 100644
>> --- a/fs/fuse/virtio_fs.c
>> +++ b/fs/fuse/virtio_fs.c
>> @@ -1568,6 +1568,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
>> fc->delete_stale = true;
>> fc->auto_submounts = true;
>> fc->sync_fs = true;
>> + fc->use_pages_for_kvec_io = true;
>>
>> /* Tell FUSE to split requests that exceed the virtqueue's size */
>> fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,