Re: [PATCH v12 01/10] vfs, iomap: Fix generic_file_splice_read() to avoid reversion of ITER_PIPE

From: David Howells
Date: Wed Feb 08 2023 - 11:10:55 EST


Christoph Hellwig <hch@xxxxxxxxxxxxx> wrote:

> We'll have to fix reverting of pipe buffers, just as I already pointed
> out in your cifs series that tries to play the same game.

Fixing ITER_PIPE reversion isn't very straightforward, unfortunately. It's
possible for a partial direct I/O read to use reversion to trim the iterator
(thereby discarding anon pages from a pipe) and then fall back to reading the
rest by buffered I/O. I suppose I could set a flag on the pipe_buffers
indicating whether iov_iter_revert() is allowed to free them (if they were
spliced in from the pagecache, then they must be freed by reverting them).

How about one of two different solutions?

(1) Repurpose the function I proposed for generic_file_splice_read() but only
for splicing from O_DIRECT files; reading from non-O_DIRECT files would
use an ITER_PIPE as upstream.

(2) Repurpose the function I proposed for generic_file_splice_read() but only
for splicing from O_DIRECT files, as (1), but also replace the splice
from a buffered file with something like the patch below. This uses
filemap_get_pages() to do the reading and to get a bunch of folios from
the pagecache that we can then splice into the pipe directly.

David
---
splice: Do splice read from a buffered file without using ITER_PIPE

Provide a function to do splice read from a buffered file, pulling the
folios out of the pagecache directly by calling filemap_get_pages() to do
any required reading and then pasting the returned folios into the pipe.

A helper function is provided to do the actual folio pasting and will
handle multipage folios by splicing as many of the relevant subpages as
will fit into the pipe.

The ITER_BVEC-based splicing previously added is then only used for
splicing from O_DIRECT files.

The code is loosely based on filemap_read() and might belong in
mm/filemap.c with that as it needs to use filemap_get_pages().

filemap_get_pages() and some of the functions it uses are changed to take
the required byte count rather than an iterator (which was only being used
for iter->count). This should be split into a separate file.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---
fs/splice.c | 177 +++++++++++++++++++++++++++++++++++++++++++-----
include/linux/pagemap.h | 2
mm/filemap.c | 23 +++---
3 files changed, 176 insertions(+), 26 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index fba93f4a4f9e..3ccecfa50eda 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
@@ -282,22 +283,13 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
kfree(spd->partial);
}

-/**
- * generic_file_splice_read - splice data from file to a pipe
- * @in: file to splice from
- * @ppos: position in @in
- * @pipe: pipe to splice to
- * @len: number of bytes to splice
- * @flags: splice modifier flags
- *
- * Description:
- * Will read pages from given file and fill them into a pipe. Can be
- * used as long as it has more or less sane ->read_iter().
- *
+/*
+ * Splice data from an O_DIRECT file into pages and then add them to the output
+ * pipe.
*/
-ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+static ssize_t generic_file_direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
{
LIST_HEAD(pages);
struct iov_iter to;
@@ -383,6 +375,161 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
kfree(bv);
return ret;
}
+
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+static size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio,
+ loff_t fpos, size_t size)
+{
+ struct page *page;
+ size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+ page = folio_page(folio, offset / PAGE_SIZE);
+ size = min(size, folio_size(folio) - offset);
+ offset %= PAGE_SIZE;
+
+ while (spliced < size &&
+ !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = &pipe->bufs[pipe->head & (pipe->ring_size - 1)];
+ size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &page_cache_pipe_buf_ops,
+ .page = page,
+ .offset = offset,
+ .len = part,
+ };
+ folio_get(folio);
+ pipe->head++;
+ page++;
+ spliced += part;
+ offset = 0;
+ }
+
+ return spliced;
+}
+
+/*
+ * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
+ * a pipe.
+ */
+static ssize_t generic_file_buffered_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ struct folio_batch fbatch;
+ size_t total_spliced = 0, used, npages;
+ loff_t isize, end_offset;
+ bool writably_mapped;
+ int i, error = 0;
+
+ struct kiocb iocb = {
+ .ki_filp = in,
+ .ki_pos = *ppos,
+ };
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ folio_batch_init(&fbatch);
+
+ do {
+ cond_resched();
+
+ if (*ppos >= i_size_read(file_inode(in)))
+ break;
+
+ iocb.ki_pos = *ppos;
+ error = filemap_get_pages(&iocb, len, &fbatch);
+ if (error < 0)
+ break;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(file_inode(in));
+ if (unlikely(*ppos >= isize))
+ break;
+ end_offset = min_t(loff_t, isize, *ppos + len);
+
+ /*
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
+ */
+ writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t n;
+
+ if (folio_pos(folio) >= end_offset)
+ goto out;
+ folio_mark_accessed(folio);
+
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
+
+ n = splice_folio_into_pipe(pipe, folio, *ppos, len);
+ if (!n)
+ goto out;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ goto out;
+ }
+
+ folio_batch_release(&fbatch);
+ } while (len);
+
+out:
+ folio_batch_release(&fbatch);
+ file_accessed(in);
+
+ return total_spliced ? total_spliced : error;
+}
+
+/**
+ * generic_file_splice_read - splice data from file to a pipe
+ * @in: file to splice from
+ * @ppos: position in @in
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given file and fill them into a pipe. Can be
+ * used as long as it has more or less sane ->read_iter().
+ *
+ */
+ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes))
+ return 0;
+ if (unlikely(!len))
+ return 0;
+ if (in->f_flags & O_DIRECT)
+ return generic_file_direct_splice_read(in, ppos, pipe, len, flags);
+ return generic_file_buffered_splice_read(in, ppos, pipe, len, flags);
+}
EXPORT_SYMBOL(generic_file_splice_read);

const struct pipe_buf_operations default_pipe_buf_ops = {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 29e1f9e76eb6..317fbc34849f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -748,6 +748,8 @@ struct page *read_cache_page(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
+int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch);

static inline struct page *read_mapping_page(struct address_space *mapping,
pgoff_t index, struct file *file)
diff --git a/mm/filemap.c b/mm/filemap.c
index f72e4875bfcb..f8b28352b038 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2440,10 +2440,8 @@ static int filemap_read_folio(struct file *file, filler_t filler,
}

static bool filemap_range_uptodate(struct address_space *mapping,
- loff_t pos, struct iov_iter *iter, struct folio *folio)
+ loff_t pos, size_t count, struct folio *folio)
{
- int count;
-
if (folio_test_uptodate(folio))
return true;
if (!mapping->a_ops->is_partially_uptodate)
@@ -2451,7 +2449,6 @@ static bool filemap_range_uptodate(struct address_space *mapping,
if (mapping->host->i_blkbits >= folio_shift(folio))
return false;

- count = iter->count;
if (folio_pos(folio) > pos) {
count -= folio_pos(folio) - pos;
pos = 0;
@@ -2463,7 +2460,7 @@ static bool filemap_range_uptodate(struct address_space *mapping,
}

static int filemap_update_page(struct kiocb *iocb,
- struct address_space *mapping, struct iov_iter *iter,
+ struct address_space *mapping, size_t count,
struct folio *folio)
{
int error;
@@ -2498,7 +2495,7 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;

error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio))
goto unlock;

error = -EAGAIN;
@@ -2574,8 +2571,12 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
return 0;
}

-static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
- struct folio_batch *fbatch)
+/*
+ * Extract some folios from the pagecache of a file, reading those pages from
+ * the backing store if necessary and waiting for them.
+ */
+int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
@@ -2585,7 +2586,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio;
int err = 0;

- last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
@@ -2618,7 +2619,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
if ((iocb->ki_flags & IOCB_WAITQ) &&
folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
- err = filemap_update_page(iocb, mapping, iter, folio);
+ err = filemap_update_page(iocb, mapping, count, folio);
if (err)
goto err;
}
@@ -2688,7 +2689,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;

- error = filemap_get_pages(iocb, iter, &fbatch);
+ error = filemap_get_pages(iocb, iov_iter_count(iter), &fbatch);
if (error < 0)
break;