[PATCH v6 18/34] dio: Pin pages rather than ref'ing if appropriate

From: David Howells
Date: Mon Jan 16 2023 - 18:14:42 EST


Convert the generic direct-I/O code to use iov_iter_extract_pages() instead
of iov_iter_get_pages(). As appropriate to the iterator, this will pin the
pages or leave them unaltered rather than getting a ref on them.

The pages need to be pinned for DIO-read rather than having refs taken on
them to prevent VM copy-on-write from malfunctioning during a concurrent
fork() (the result of the I/O would otherwise end up visible only to the
child process and not to the parent).

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Jan Kara <jack@xxxxxxx>
cc: Christoph Hellwig <hch@xxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: Logan Gunthorpe <logang@xxxxxxxxxxxx>
cc: linux-fsdevel@xxxxxxxxxxxxxxx
cc: linux-block@xxxxxxxxxxxxxxx
---

fs/direct-io.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b1e26a706e31..b4d2c9f85a5b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -142,9 +142,11 @@ struct dio {

/*
* pages[] (and any fields placed after it) are not zeroed out at
- * allocation time. Don't add new fields after pages[] unless you
- * wish that they not be zeroed.
+ * allocation time. Don't add new fields after pages[] unless you wish
+ * that they not be zeroed. Pages may have a ref taken, a pin emplaced
+ * or no retention measures.
*/
+ unsigned int cleanup_mode; /* How pages should be cleaned up (0/FOLL_GET/PIN) */
union {
struct page *pages[DIO_PAGES]; /* page buffer */
struct work_struct complete_work;/* deferred AIO completion */
@@ -167,12 +169,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
const enum req_op dio_op = dio->opf & REQ_OP_MASK;
+ unsigned int gup_flags =
+ op_is_write(dio_op) ? FOLL_SOURCE_BUF : FOLL_DEST_BUF;
+ struct page **pages = dio->pages;
ssize_t ret;

- ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
- &sdio->from,
- op_is_write(dio_op) ?
- FOLL_SOURCE_BUF : FOLL_DEST_BUF);
+ ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, DIO_PAGES,
+ gup_flags, &sdio->from);

if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
struct page *page = ZERO_PAGE(0);
@@ -183,7 +186,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
*/
if (dio->page_errors == 0)
dio->page_errors = ret;
- get_page(page);
+ dio->cleanup_mode = 0;
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
@@ -197,6 +200,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
sdio->head = 0;
sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+ dio->cleanup_mode =
+ iov_iter_extract_mode(sdio->iter, gup_flags);
return 0;
}
return ret;
@@ -400,6 +405,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* we request a valid number of vectors.
*/
bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
+ if (!(dio->cleanup_mode & FOLL_GET))
+ bio_clear_flag(bio, BIO_PAGE_REFFED);
+ if (dio->cleanup_mode & FOLL_PIN)
+ bio_set_flag(bio, BIO_PAGE_PINNED);
bio->bi_iter.bi_sector = first_sector;
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
@@ -443,13 +452,18 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
sdio->logical_offset_in_bio = 0;
}

+static void dio_cleanup_page(struct dio *dio, struct page *page)
+{
+ page_put_unpin(page, dio->cleanup_mode);
+}
+
/*
* Release any resources in case of a failure
*/
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
while (sdio->head < sdio->tail)
- put_page(dio->pages[sdio->head++]);
+ dio_cleanup_page(dio, dio->pages[sdio->head++]);
}

/*
@@ -704,7 +718,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
*
* Return zero on success. Non-zero means the caller needs to start a new BIO.
*/
-static inline int dio_bio_add_page(struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
{
int ret;

@@ -771,11 +785,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
goto out;
}

- if (dio_bio_add_page(sdio) != 0) {
+ if (dio_bio_add_page(dio, sdio) != 0) {
dio_bio_submit(dio, sdio);
ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret == 0) {
- ret = dio_bio_add_page(sdio);
+ ret = dio_bio_add_page(dio, sdio);
BUG_ON(ret != 0);
}
}
@@ -832,13 +846,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
*/
if (sdio->cur_page) {
ret = dio_send_cur_page(dio, sdio, map_bh);
- put_page(sdio->cur_page);
+ dio_cleanup_page(dio, sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
return ret;
}

- get_page(page); /* It is in dio */
+ ret = try_grab_page(page, dio->cleanup_mode); /* It is in dio */
+ if (ret < 0)
+ return ret;
+
sdio->cur_page = page;
sdio->cur_page_offset = offset;
sdio->cur_page_len = len;
@@ -853,7 +870,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
ret = dio_send_cur_page(dio, sdio, map_bh);
if (sdio->bio)
dio_bio_submit(dio, sdio);
- put_page(sdio->cur_page);
+ dio_cleanup_page(dio, sdio->cur_page);
sdio->cur_page = NULL;
}
return ret;
@@ -954,7 +971,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,

ret = get_more_blocks(dio, sdio, map_bh);
if (ret) {
- put_page(page);
+ dio_cleanup_page(dio, page);
goto out;
}
if (!buffer_mapped(map_bh))
@@ -999,7 +1016,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,

/* AKPM: eargh, -ENOTBLK is a hack */
if (dio_op == REQ_OP_WRITE) {
- put_page(page);
+ dio_cleanup_page(dio, page);
return -ENOTBLK;
}

@@ -1012,7 +1029,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
if (sdio->block_in_file >=
i_size_aligned >> blkbits) {
/* We hit eof */
- put_page(page);
+ dio_cleanup_page(dio, page);
goto out;
}
zero_user(page, from, 1 << blkbits);
@@ -1052,7 +1069,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
sdio->next_block_for_io,
map_bh);
if (ret) {
- put_page(page);
+ dio_cleanup_page(dio, page);
goto out;
}
sdio->next_block_for_io += this_chunk_blocks;
@@ -1068,7 +1085,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
}

/* Drop the ref which was taken in get_user_pages() */
- put_page(page);
+ dio_cleanup_page(dio, page);
}
out:
return ret;
@@ -1288,7 +1305,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
if (retval == 0)
retval = ret2;
- put_page(sdio.cur_page);
+ dio_cleanup_page(dio, sdio.cur_page);
sdio.cur_page = NULL;
}
if (sdio.bio)