[RFC PATCH v2 4/5] mm: Make __swap_writepage() do async DIO if asked for it

From: David Howells
Date: Thu Aug 12 2021 - 16:22:51 EST


Make __swap_writepage()'s DIO path do sync DIO if the writeback control's
sync mode is WB_SYNC_ALL and async DIO if not.

Note that this causes hanging processes in sunrpc if the swapfile is on
NFS. I'm not sure whether it's due to mis-scheduling or something else.

Suggested-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
cc: linux-nfs@xxxxxxxxxxxxxxx
---

mm/page_io.c | 145 +++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 102 insertions(+), 43 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 92ec4a7b0545..dae7bbd7a842 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -300,6 +300,105 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
#define bio_associate_blkg_from_page(bio, page) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */

+/*
+ * Common completion handling for a DIO write of a swap page to a swapfile.
+ *
+ * @iocb: the kiocb the write was issued with; ->ki_swap_page is the page
+ *        that was written.
+ * @ret:  result of the write; only a value equal to thp_size() of the page
+ *        is treated as success.
+ * @ret2: unused here.
+ *
+ * Writeback on the page is ended whether the write succeeded or not.
+ */
+static void __swapfile_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct page *swap_page = iocb->ki_swap_page;
+
+	if (ret != thp_size(swap_page)) {
+		/*
+		 * In the case of swap-over-nfs, this can be a temporary
+		 * failure if the system has limited memory for allocating
+		 * transmit buffers.  So: mark the page dirty again, clear
+		 * its Reclaim flag (avoiding folio_rotate_reclaimable),
+		 * rate-limit the message, and do not flag PageError as the
+		 * normal direct-to-bio case does.
+		 */
+		set_page_dirty(swap_page);
+		ClearPageReclaim(swap_page);
+		pr_err_ratelimited("Write error (%ld) on dio swapfile (%llu)\n",
+				   ret, page_file_offset(swap_page));
+	} else {
+		count_vm_event(PSWPOUT);
+	}
+	end_page_writeback(swap_page);
+}
+
+/*
+ * Completion handler for a swapfile write issued through a swapfile_kiocb:
+ * run the common write-completion handling for the page, then drop one
+ * reference on the swapfile_kiocb in which @iocb is embedded.
+ */
+static void swapfile_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	__swapfile_write_complete(iocb, ret, ret2);
+	swapfile_put_kiocb(container_of(iocb, struct swapfile_kiocb, iocb));
+}
+
+/*
+ * Write a swap page to a swapfile with a synchronous DIO request.
+ *
+ * @sis:  swap area being written to; its ->swap_file is the target file.
+ * @page: the page to write; it is marked under writeback and unlocked
+ *        before the I/O is issued.
+ * @wbc:  writeback control (not consulted here).
+ *
+ * The kiocb lives on the stack and the completion handling is run inline
+ * once ->direct_IO() returns.
+ *
+ * Returns 0 if ->direct_IO() returned a positive count, otherwise whatever
+ * ->direct_IO() returned.
+ */
+static int swapfile_write_sync(struct swap_info_struct *sis,
+			       struct page *page, struct writeback_control *wbc)
+{
+	struct file *file = sis->swap_file;
+	size_t len = thp_size(page);
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_len		= len,
+		.bv_offset	= 0,
+	};
+	struct iov_iter iter;
+	struct kiocb kiocb;
+	int ret;
+
+	/* init_sync_kiocb() resets the kiocb, so fill in the rest after it. */
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_flags = IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
+	kiocb.ki_pos = page_file_offset(page);
+	kiocb.ki_swap_page = page;
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, len);
+
+	set_page_writeback(page);
+	unlock_page(page);
+
+	ret = file->f_mapping->a_ops->direct_IO(&kiocb, &iter);
+	__swapfile_write_complete(&kiocb, ret, 0);
+	if (ret > 0)
+		return 0;
+	return ret;
+}
+
+/*
+ * Write a swap page to a swapfile using DIO.
+ *
+ * @sis:  swap area being written to; its ->swap_file is the target file.
+ * @page: the locked page to write.  It is unlocked on every path out of
+ *        this function.
+ * @wbc:  writeback control; WB_SYNC_ALL selects the synchronous path,
+ *        anything else issues the write asynchronously.
+ *
+ * For the asynchronous case a swapfile_kiocb is allocated with two
+ * references: one is dropped by the completion handler and one by this
+ * function once the request has been issued.
+ *
+ * Returns 0 if the write completed or was successfully queued, -ENOMEM if
+ * the request could not be allocated, or the error from ->direct_IO().
+ */
+static int swapfile_write(struct swap_info_struct *sis,
+			  struct page *page, struct writeback_control *wbc)
+{
+	struct swapfile_kiocb *ki;
+	struct file *swap_file = sis->swap_file;
+	struct bio_vec bv = {
+		.bv_page = page,
+		.bv_len = thp_size(page),
+		.bv_offset = 0
+	};
+	struct iov_iter from;
+	int ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return swapfile_write_sync(sis, page, wbc);
+
+	/*
+	 * We are writing memory out, so the allocation must not recurse into
+	 * I/O to make progress.
+	 */
+	ki = kzalloc(sizeof(*ki), GFP_NOIO);
+	if (!ki) {
+		/*
+		 * No I/O was started: keep the data by redirtying the page
+		 * and unlock it, so that the caller does not get an error
+		 * back with the page still locked.
+		 */
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	/* One ref for the completion handler, one for us. */
+	refcount_set(&ki->ki_refcnt, 2);
+	/*
+	 * The iterator must span the whole (possibly compound) page, matching
+	 * bv_len above and the thp_size() check made on completion.
+	 */
+	iov_iter_bvec(&from, WRITE, &bv, 1, thp_size(page));
+	init_sync_kiocb(&ki->iocb, swap_file);
+	ki->iocb.ki_swap_page = page;
+	ki->iocb.ki_pos = page_file_offset(page);
+	ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
+	ki->iocb.ki_complete = swapfile_write_complete;
+	/*
+	 * NOTE(review): presumably this file ref is released by
+	 * swapfile_put_kiocb() on the final put - confirm.
+	 */
+	get_file(swap_file);
+
+	set_page_writeback(page);
+	unlock_page(page);
+	ret = swap_file->f_mapping->a_ops->direct_IO(&ki->iocb, &from);
+
+	if (ret == -EIOCBQUEUED) {
+		/*
+		 * Successfully queued: ->ki_complete() will finish the page
+		 * off later.  This is not a write error, so don't report it
+		 * to the caller as one.
+		 */
+		ret = 0;
+	} else {
+		/* Completed (or failed) inline; run the completion now. */
+		swapfile_write_complete(&ki->iocb, ret, 0);
+		if (ret > 0)
+			ret = 0;
+	}
+	swapfile_put_kiocb(ki);
+	return ret;
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
@@ -307,47 +406,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc)
struct swap_info_struct *sis = page_swap_info(page);

VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct kiocb kiocb;
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from;
-
- iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
- init_sync_kiocb(&kiocb, swap_file);
- kiocb.ki_pos = page_file_offset(page);
- kiocb.ki_flags = IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
-
- set_page_writeback(page);
- unlock_page(page);
- ret = mapping->a_ops->direct_IO(&kiocb, &from);
- if (ret == PAGE_SIZE) {
- count_vm_event(PSWPOUT);
- ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * folio_rotate_reclaimable but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
- set_page_dirty(page);
- ClearPageReclaim(page);
- pr_err_ratelimited("Write error (%d) on dio swapfile (%llu)\n",
- ret, page_file_offset(page));
- }
- end_page_writeback(page);
- return ret;
- }
+ if (data_race(sis->flags & SWP_FS_OPS))
+ return swapfile_write(sis, page, wbc);

ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
@@ -410,7 +470,6 @@ static int swapfile_read_sync(struct swap_info_struct *sis, struct page *page)
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_swap_page = page;
kiocb.ki_pos = page_file_offset(page);
- kiocb.ki_filp = swap_file;
kiocb.ki_flags = IOCB_DIRECT | IOCB_SWAP;
/* Should set IOCB_HIPRI too, but the box becomes unresponsive whilst
* putting out occasional messages about the NFS sunrpc scheduling
@@ -449,8 +508,8 @@ static int swapfile_read(struct swap_info_struct *sis, struct page *page,
ki->iocb.ki_swap_page = page;
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_SWAP;
ki->iocb.ki_pos = page_file_offset(page);
- ki->iocb.ki_filp = get_file(swap_file);
ki->iocb.ki_complete = swapfile_read_complete;
+ get_file(swap_file);

iov_iter_bvec(&to, READ, &bv, 1, thp_size(page));
ret = swap_file->f_mapping->a_ops->direct_IO(&ki->iocb, &to);