[RFC PATCH 1/2] iov_iter: Add a function to extract a page list from an iterator

From: David Howells
Date: Mon Oct 31 2022 - 10:50:50 EST


Add a function, iov_iter_extract_pages(), to extract a list of pages from
an iterator. The pages may be returned with a reference added or a pin
added or neither, depending on the type of iterator and the direction of
transfer.

If the transfer is *into* an ITER_IOVEC or ITER_UBUF iterator, the pages
will have pins obtained on them, but not references, to avoid fork()
CoW'ing the pages incorrectly whilst DMA read is in progess.

If the transfer is *out of* an ITER_IOVEC or ITER_UBUF iterator, the pages
will have references obtained on them, but not pins. DMA write is assumed
not to change the page contents.

For any other sort of iterator, no refs or pins are obtained on the page,
the assumption being that the caller will manage retention.

An additional function, iov_iter_extract_mode() is provided to indicate
what the mode of retention employed will be for that iterator. This is
indicated as 0 (no special retention), FOLL_GET (will get page refs) or
FOLL_PIN (will get page pins). This can be used by the caller to work out
how to dispose of the pages afterwards: nothing, put_page() or
unpin_user_page().

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
cc: Christoph Hellwig <hch@xxxxxx>
cc: John Hubbard <jhubbard@xxxxxxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: linux-fsdevel@xxxxxxxxxxxxxxx
cc: linux-mm@xxxxxxxxxxxxxxx
---

include/linux/uio.h | 26 ++++
lib/iov_iter.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 359 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 87fc3d0dda98..1c9b9e8ba777 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -354,4 +354,30 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
};
}

+ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
+ size_t maxsize, unsigned int maxpages,
+ size_t *offset0);
+
+/**
+ * iov_iter_extract_mode - Indicate how pages from the iterator will be retained
+ * @iter: The iterator
+ *
+ * Examine the indicator and indicate with FOLL_PIN, FOLL_GET or 0 as to how,
+ * if at all, pages extracted from the iterator will be retained by the
+ * extraction function.
+ *
+ * FOLL_GET indicates that the pages will have a reference taken on them that
+ * the caller must put. This can be done for DMA/async DIO write from a page.
+ *
+ * FOLL_PIN indicates that the pages will have a pin placed in them that the
+ * caller must unpin. This is must be done for DMA/async DIO read to a page to
+ * avoid CoW problems in fork.
+ *
+ * 0 indicates that no measures are taken and that it's up to the caller to
+ * retain the pages.
+ */
+#define iov_iter_extract_mode(iter) \
+ (!user_backed_iter(iter) ? 0 : \
+ (iter)->data_source ? FOLL_GET : FOLL_PIN)
+
#endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 98e8425b060d..96bfb117f19a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1898,3 +1898,336 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
i->iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}
+
+/*
+ * Extract a list of contiguous pages from an ITER_PIPE iterator. This does
+ * not get references of its own on the pages, nor does it get a pin on them.
+ * If there's a partial page, it adds that first and will then allocate and add
+ * pages into the pipe to make up the buffer space to the amount required.
+ *
+ * The caller must hold the pipe locked and only transferring into a pipe is
+ * supported.
+ */
+static ssize_t iov_iter_extract_pipe_pages(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ unsigned int nr, offset, chunk, j;
+ struct page **p;
+ size_t left;
+
+ if (!sanity(i))
+ return -EFAULT;
+
+ offset = pipe_npages(i, &nr);
+ if (!nr)
+ return -EFAULT;
+ *offset0 = offset;
+
+ maxpages = min_t(size_t, nr, maxpages);
+ maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+ if (!maxpages)
+ return -ENOMEM;
+ p = *pages;
+
+ left = maxsize;
+ for (j = 0; j < maxpages; j++) {
+ struct page *page = append_pipe(i, left, &offset);
+ if (!page)
+ break;
+ chunk = min_t(size_t, left, PAGE_SIZE - offset);
+ left -= chunk;
+ *p++ = page;
+ }
+ if (!j)
+ return -EFAULT;
+ return maxsize - left;
+}
+
+/*
+ * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
+ * get references on the pages, nor does it get a pin on them.
+ */
+static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ struct page *page, **p;
+ unsigned int nr = 0, offset;
+ loff_t pos = i->xarray_start + i->iov_offset;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ XA_STATE(xas, i->xarray, index);
+
+ offset = pos & ~PAGE_MASK;
+ *offset0 = offset;
+
+ maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+ if (!maxpages)
+ return -ENOMEM;
+ p = *pages;
+
+ rcu_read_lock();
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas))) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ p[nr++] = find_subpage(page, xas.xa_index);
+ if (nr == maxpages)
+ break;
+ }
+ rcu_read_unlock();
+
+ maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+ i->iov_offset += maxsize;
+ i->count -= maxsize;
+ return maxsize;
+}
+
+/*
+ * Extract a list of contiguous pages from an ITER_BVEC iterator. This does
+ * not get references on the pages, nor does it get a pin on them.
+ */
+static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ struct page **p, *page;
+ size_t skip = i->iov_offset, offset;
+ int k;
+
+ maxsize = min(maxsize, i->bvec->bv_len - skip);
+ skip += i->bvec->bv_offset;
+ page = i->bvec->bv_page + skip / PAGE_SIZE;
+ offset = skip % PAGE_SIZE;
+ *offset0 = offset;
+
+ maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+ if (!maxpages)
+ return -ENOMEM;
+ p = *pages;
+ for (k = 0; k < maxpages; k++)
+ p[k] = page + k;
+
+ maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
+ i->count -= maxsize;
+ i->iov_offset += maxsize;
+ if (i->iov_offset == i->bvec->bv_len) {
+ i->iov_offset = 0;
+ i->bvec++;
+ i->nr_segs--;
+ }
+ return maxsize;
+}
+
+/*
+ * Get the first segment from an ITER_UBUF or ITER_IOVEC iterator. The
+ * iterator must not be empty.
+ */
+static unsigned long iov_iter_extract_first_user_segment(const struct iov_iter *i,
+ size_t *size)
+{
+ size_t skip;
+ long k;
+
+ if (iter_is_ubuf(i))
+ return (unsigned long)i->ubuf + i->iov_offset;
+
+ for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
+ size_t len = i->iov[k].iov_len - skip;
+
+ if (unlikely(!len))
+ continue;
+ if (*size > len)
+ *size = len;
+ return (unsigned long)i->iov[k].iov_base + skip;
+ }
+ BUG(); // if it had been empty, we wouldn't get called
+}
+
+/*
+ * Extract a list of contiguous pages from a user iterator and get references
+ * on them. This should only be used iff the iterator is user-backed
+ * (IOBUF/UBUF) and data is being transferred out of the buffer described by
+ * the iterator (ie. this is the source).
+ *
+ * The pages are returned with incremented refcounts that the caller must undo
+ * once the transfer is complete, but no additional pins are obtained.
+ *
+ * This is only safe to be used where background IO/DMA is not going to be
+ * modifying the buffer, and so won't cause a problem with CoW on fork.
+ */
+static ssize_t iov_iter_extract_user_pages_and_get(struct iov_iter *i,
+ struct page ***pages,
+ size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ unsigned long addr;
+ unsigned int gup_flags = FOLL_GET;
+ size_t offset;
+ int res;
+
+ if (WARN_ON_ONCE(iov_iter_rw(i) != WRITE))
+ return -EFAULT;
+
+ if (i->nofault)
+ gup_flags |= FOLL_NOFAULT;
+
+ addr = iov_iter_extract_first_user_segment(i, &maxsize);
+ *offset0 = offset = addr % PAGE_SIZE;
+ addr &= PAGE_MASK;
+ maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+ if (!maxpages)
+ return -ENOMEM;
+ res = get_user_pages_fast(addr, maxpages, gup_flags, *pages);
+ if (unlikely(res <= 0))
+ return res;
+ maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
+ iov_iter_advance(i, maxsize);
+ return maxsize;
+}
+
+/*
+ * Extract a list of contiguous pages from a user iterator and get a pin on
+ * each of them. This should only be used iff the iterator is user-backed
+ * (IOBUF/UBUF) and data is being transferred into the buffer described by the
+ * iterator (ie. this is the destination).
+ *
+ * It does not get refs on the pages, but the pages must be unpinned by the
+ * caller once the transfer is complete.
+ *
+ * This is safe to be used where background IO/DMA *is* going to be modifying
+ * the buffer; using a pin rather than a ref makes sure that CoW happens
+ * correctly in the parent during fork.
+ */
+static ssize_t iov_iter_extract_user_pages_and_pin(struct iov_iter *i,
+ struct page ***pages,
+ size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ unsigned long addr;
+ unsigned int gup_flags = FOLL_PIN | FOLL_WRITE;
+ size_t offset;
+ int res;
+
+ if (WARN_ON_ONCE(iov_iter_rw(i) != READ))
+ return -EFAULT;
+
+ if (i->nofault)
+ gup_flags |= FOLL_NOFAULT;
+
+ addr = first_iovec_segment(i, &maxsize);
+ *offset0 = offset = addr % PAGE_SIZE;
+ addr &= PAGE_MASK;
+ maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+ if (!maxpages)
+ return -ENOMEM;
+ res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
+ if (unlikely(res <= 0))
+ return res;
+ maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
+ iov_iter_advance(i, maxsize);
+ return maxsize;
+}
+
+static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ unsigned int maxpages,
+ size_t *offset0)
+{
+ switch (iov_iter_extract_mode(i)) {
+ case FOLL_GET:
+ return iov_iter_extract_user_pages_and_get(i, pages, maxsize,
+ maxpages, offset0);
+ case FOLL_PIN:
+ return iov_iter_extract_user_pages_and_pin(i, pages, maxsize,
+ maxpages, offset0);
+ default:
+ BUG();
+ }
+}
+
+/**
+ * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
+ * @i: The iterator to extract from
+ * @pages: Where to return the list of pages
+ * @maxsize: The maximum amount of iterator to extract
+ * @maxpages: The maximum size of the list of pages
+ * @offset0: Where to return the starting offset into (*@pages)[0]
+ *
+ * Extract a list of contiguous pages from the current point of the iterator,
+ * advancing the iterator. The maximum number of pages and the maximum amount
+ * of page contents can be set.
+ *
+ * If *@pages is NULL, a page list will be allocated to the required size and
+ * *@pages will be set to its base. If *@pages is not NULL, it will be assumed
+ * that the caller allocated a page list at least @maxpages in size and this
+ * will be filled in.
+ *
+ * Extra refs or pins on the pages may be obtained as follows:
+ *
+ * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF) and data is to be
+ * transferred /OUT OF/ the described buffer, refs will be taken on the
+ * pages, but pins will not be added. This can be used for DMA from a
+ * page; it cannot be used for DMA to a page, as it may cause page-COW
+ * problems in fork.
+ *
+ * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF) and data is to be
+ * transferred /INTO/ the described buffer, pins will be added to the
+ * pages, but refs will not be taken. This must be used for DMA to a
+ * page.
+ *
+ * (*) If the iterator is ITER_PIPE, this must describe a destination for the
+ * data. Additional pages may be allocated and added to the pipe (which
+ * will hold the refs), but neither refs nor pins will be obtained for the
+ * caller. The caller must hold the pipe lock.
+ *
+ * (*) If the iterator is ITER_BVEC or ITER_XARRAY, the pages are merely
+ * listed; no extra refs or pins are obtained.
+ *
+ * Note also:
+ *
+ * (*) Use with ITER_KVEC is not supported as that may refer to memory that
+ * doesn't have associated page structs.
+ *
+ * (*) Use with ITER_DISCARD is not supported as that has no content.
+ *
+ * On success, the function sets *@pages to the new pagelist, if allocated, and
+ * sets *offset0 to the offset into the first page and returns the amount of
+ * buffer space added represented by the page list.
+ *
+ * It may also return -ENOMEM and -EFAULT.
+ */
+ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
+ size_t maxsize, unsigned int maxpages,
+ size_t *offset0)
+{
+ maxsize = min3(maxsize, i->count, MAX_RW_COUNT);
+ if (!maxsize)
+ return 0;
+
+ if (likely(user_backed_iter(i)))
+ return iov_iter_extract_user_pages(i, pages, maxsize,
+ maxpages, offset0);
+ if (iov_iter_is_bvec(i))
+ return iov_iter_extract_bvec_pages(i, pages, maxsize,
+ maxpages, offset0);
+ if (iov_iter_is_pipe(i))
+ return iov_iter_extract_pipe_pages(i, pages, maxsize,
+ maxpages, offset0);
+ if (iov_iter_is_xarray(i))
+ return iov_iter_extract_xarray_pages(i, pages, maxsize,
+ maxpages, offset0);
+ return -EFAULT;
+}
+EXPORT_SYMBOL(iov_iter_extract_pages);