[PATCH RFC 11/11] fs: move generic_file_write_iter() family to fs/read_write.c

From: Tal Zussman

Date: Wed May 20 2026 - 16:55:47 EST


Move the VFS-level generic write path out of mm/filemap.c into
fs/read_write.c next to the just-relocated read path:

- generic_file_write_iter()
- __generic_file_write_iter()
- generic_file_direct_write()
- generic_perform_write()
- kiocb_invalidate_pages()
- kiocb_invalidate_post_direct_write()
- dio_warn_stale_pagecache()

The kiocb_invalidate_* prototypes move from <linux/pagemap.h> to
<linux/fs.h>, joining kiocb_write_and_wait() and the other generic
read/write declarations. Drop extern from the three write prototypes
touched in <linux/fs.h>: __generic_file_write_iter(),
generic_file_write_iter() and generic_file_direct_write(). Reflow the
generic_file_direct_write() definition so its signature fits on one
line.

Signed-off-by: Tal Zussman <tz2294@xxxxxxxxxxxx>
---
fs/read_write.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 8 +-
include/linux/pagemap.h | 2 -
mm/filemap.c | 277 ------------------------------------------------
4 files changed, 281 insertions(+), 282 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 59ceea85c163..cea5f79fdacf 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1071,6 +1071,282 @@ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);

+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ return filemap_invalidate_pages(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1,
+ iocb->ki_flags & IOCB_NOWAIT);
+}
+EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
+
+/*
+ * Warn about a page cache invalidation failure during a direct I/O write.
+ */
+static void dio_warn_stale_pagecache(struct file *filp)
+{
+ static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+ char pathname[128];
+ char *path;
+
+ errseq_set(&filp->f_mapping->wb_err, -EIO);
+ if (__ratelimit(&_rs)) {
+ path = file_path(filp, pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
+ pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+ current->comm);
+ }
+}
+
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ if (mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+ dio_warn_stale_pagecache(iocb->ki_filp);
+}
+
+ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ size_t write_len = iov_iter_count(from);
+ ssize_t written;
+
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ written = kiocb_invalidate_pages(iocb, write_len);
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ return written;
+ }
+
+ written = mapping->a_ops->direct_IO(iocb, from);
+
+ /*
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
+ *
+ * Most of the time we do not need this since dio_complete() will do
+ * the invalidation for us. However there are some file systems that
+ * do not end up with dio_complete() being called, so let's not break
+ * them by removing it completely.
+ *
+ * Noticeable example is a blkdev_direct_IO().
+ *
+ * Skip invalidation for async writes or if mapping has no pages.
+ */
+ if (written > 0) {
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+
+ kiocb_invalidate_post_direct_write(iocb, written);
+ pos += written;
+ write_len -= written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ iocb->ki_pos = pos;
+ }
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, write_len - iov_iter_count(from));
+ return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos = iocb->ki_pos;
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ size_t chunk = mapping_max_folio_size(mapping);
+ long status = 0;
+ ssize_t written = 0;
+
+ do {
+ struct folio *folio;
+ size_t offset; /* Offset into folio */
+ size_t bytes; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata = NULL;
+
+ bytes = iov_iter_count(i);
+retry:
+ offset = pos & (chunk - 1);
+ bytes = min(chunk - offset, bytes);
+ balance_dirty_pages_ratelimited(mapping);
+
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+
+ status = a_ops->write_begin(iocb, mapping, pos, bytes,
+ &folio, &fsdata);
+ if (unlikely(status < 0))
+ break;
+
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
+ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
+ flush_dcache_folio(folio);
+
+ status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
+ folio, fsdata);
+ if (unlikely(status != copied)) {
+ iov_iter_revert(i, copied - max(status, 0L));
+ if (unlikely(status < 0))
+ break;
+ }
+ cond_resched();
+
+ if (unlikely(status == 0)) {
+ /*
+ * A short copy made ->write_end() reject the
+ * thing entirely. Might be memory poisoning
+ * halfway through, might be a race with munmap,
+ * might be severe memory pressure.
+ */
+ if (chunk > PAGE_SIZE)
+ chunk /= 2;
+ if (copied) {
+ bytes = copied;
+ goto retry;
+ }
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
+ } else {
+ pos += status;
+ written += status;
+ }
+ } while (iov_iter_count(i));
+
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
+}
+EXPORT_SYMBOL(generic_perform_write);
+
+/**
+ * __generic_file_write_iter - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = generic_file_direct_write(iocb, from);
+ /*
+ * If the write stopped short of completing, fall back to
+ * buffered writes. Some filesystems do this for writes to
+ * holes, for example. For DAX files, a buffered write will
+ * not succeed (even if it did, DAX does not handle dirty
+ * page-cache pages correctly).
+ */
+ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ return ret;
+ return direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
+ }
+
+ return generic_perform_write(iocb, from);
+}
+EXPORT_SYMBOL(__generic_file_write_iter);
+
+/**
+ * generic_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * This is a wrapper around __generic_file_write_iter() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_rwsem as needed.
+ * Return:
+ * * negative error code if no data has been written at all or
+ * vfs_fsync_range() failed for a synchronous write
+ * * number of bytes written, even for truncated writes
+ */
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __generic_file_write_iter(iocb, from);
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, rwf_t flags)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0151ced8e7a..6cfb9e46bc37 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3057,9 +3057,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t already_read);
ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);
-extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
-extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
-extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
+ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
+ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
ssize_t direct_written, ssize_t buffered_written);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 46cefd552a51..b7c2dc8076ab 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -31,8 +31,6 @@ static inline void invalidate_remote_inode(struct inode *inode)
int invalidate_inode_pages2(struct address_space *mapping);
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
-int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
-void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait);

diff --git a/mm/filemap.c b/mm/filemap.c
index db7c53cd681b..284c0296a011 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2276,17 +2276,6 @@ int filemap_invalidate_pages(struct address_space *mapping,
end >> PAGE_SHIFT);
}

-int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- return filemap_invalidate_pages(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1,
- iocb->ki_flags & IOCB_NOWAIT);
-}
-EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
-
-
/*
* Splice subpages from a folio into a pipe.
*/
@@ -3500,272 +3489,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page_gfp);

-/*
- * Warn about a page cache invalidation failure during a direct I/O write.
- */
-static void dio_warn_stale_pagecache(struct file *filp)
-{
- static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
- char pathname[128];
- char *path;
-
- errseq_set(&filp->f_mapping->wb_err, -EIO);
- if (__ratelimit(&_rs)) {
- path = file_path(filp, pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
- pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
- current->comm);
- }
-}
-
-void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- if (mapping->nrpages &&
- invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT,
- (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
- dio_warn_stale_pagecache(iocb->ki_filp);
-}
-
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- size_t write_len = iov_iter_count(from);
- ssize_t written;
-
- /*
- * If a page can not be invalidated, return 0 to fall back
- * to buffered write.
- */
- written = kiocb_invalidate_pages(iocb, write_len);
- if (written) {
- if (written == -EBUSY)
- return 0;
- return written;
- }
-
- written = mapping->a_ops->direct_IO(iocb, from);
-
- /*
- * Finally, try again to invalidate clean pages which might have been
- * cached by non-direct readahead, or faulted in by get_user_pages()
- * if the source of the write was an mmap'ed region of the file
- * we're writing. Either one is a pretty crazy thing to do,
- * so we don't support it 100%. If this invalidation
- * fails, tough, the write still worked...
- *
- * Most of the time we do not need this since dio_complete() will do
- * the invalidation for us. However there are some file systems that
- * do not end up with dio_complete() being called, so let's not break
- * them by removing it completely.
- *
- * Noticeable example is a blkdev_direct_IO().
- *
- * Skip invalidation for async writes or if mapping has no pages.
- */
- if (written > 0) {
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
-
- kiocb_invalidate_post_direct_write(iocb, written);
- pos += written;
- write_len -= written;
- if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- iocb->ki_pos = pos;
- }
- if (written != -EIOCBQUEUED)
- iov_iter_revert(from, write_len - iov_iter_count(from));
- return written;
-}
-EXPORT_SYMBOL(generic_file_direct_write);
-
-ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- struct address_space *mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- size_t chunk = mapping_max_folio_size(mapping);
- long status = 0;
- ssize_t written = 0;
-
- do {
- struct folio *folio;
- size_t offset; /* Offset into folio */
- size_t bytes; /* Bytes to write to folio */
- size_t copied; /* Bytes copied from user */
- void *fsdata = NULL;
-
- bytes = iov_iter_count(i);
-retry:
- offset = pos & (chunk - 1);
- bytes = min(chunk - offset, bytes);
- balance_dirty_pages_ratelimited(mapping);
-
- if (fatal_signal_pending(current)) {
- status = -EINTR;
- break;
- }
-
- status = a_ops->write_begin(iocb, mapping, pos, bytes,
- &folio, &fsdata);
- if (unlikely(status < 0))
- break;
-
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
-
- if (mapping_writably_mapped(mapping))
- flush_dcache_folio(folio);
-
- /*
- * Faults here on mmap()s can recurse into arbitrary
- * filesystem code. Lots of locks are held that can
- * deadlock. Use an atomic copy to avoid deadlocking
- * in page fault handling.
- */
- copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- flush_dcache_folio(folio);
-
- status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
- folio, fsdata);
- if (unlikely(status != copied)) {
- iov_iter_revert(i, copied - max(status, 0L));
- if (unlikely(status < 0))
- break;
- }
- cond_resched();
-
- if (unlikely(status == 0)) {
- /*
- * A short copy made ->write_end() reject the
- * thing entirely. Might be memory poisoning
- * halfway through, might be a race with munmap,
- * might be severe memory pressure.
- */
- if (chunk > PAGE_SIZE)
- chunk /= 2;
- if (copied) {
- bytes = copied;
- goto retry;
- }
-
- /*
- * 'folio' is now unlocked and faults on it can be
- * handled. Ensure forward progress by trying to
- * fault it in now.
- */
- if (fault_in_iov_iter_readable(i, bytes) == bytes) {
- status = -EFAULT;
- break;
- }
- } else {
- pos += status;
- written += status;
- }
- } while (iov_iter_count(i));
-
- if (!written)
- return status;
- iocb->ki_pos += written;
- return written;
-}
-EXPORT_SYMBOL(generic_perform_write);
-
-/**
- * __generic_file_write_iter - write data to a file
- * @iocb: IO state structure (file, offset, etc.)
- * @from: iov_iter with data to write
- *
- * This function does all the work needed for actually writing data to a
- * file. It does all basic checks, removes SUID from the file, updates
- * modification times and calls proper subroutines depending on whether we
- * do direct IO or a standard buffered write.
- *
- * It expects i_rwsem to be grabbed unless we work on a block device or similar
- * object which does not need locking at all.
- *
- * This function does *not* take care of syncing data in case of O_SYNC write.
- * A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_rwsem.
- *
- * Return:
- * * number of bytes written, even for truncated writes
- * * negative error code if no data has been written at all
- */
-ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
-
- ret = file_remove_privs(file);
- if (ret)
- return ret;
-
- ret = file_update_time(file);
- if (ret)
- return ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = generic_file_direct_write(iocb, from);
- /*
- * If the write stopped short of completing, fall back to
- * buffered writes. Some filesystems do this for writes to
- * holes, for example. For DAX files, a buffered write will
- * not succeed (even if it did, DAX does not handle dirty
- * page-cache pages correctly).
- */
- if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
- return ret;
- return direct_write_fallback(iocb, from, ret,
- generic_perform_write(iocb, from));
- }
-
- return generic_perform_write(iocb, from);
-}
-EXPORT_SYMBOL(__generic_file_write_iter);
-
-/**
- * generic_file_write_iter - write data to a file
- * @iocb: IO state structure
- * @from: iov_iter with data to write
- *
- * This is a wrapper around __generic_file_write_iter() to be used by most
- * filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_rwsem as needed.
- * Return:
- * * negative error code if no data has been written at all of
- * vfs_fsync_range() failed for a synchronous write
- * * number of bytes written, even for truncated writes
- */
-ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- ssize_t ret;
-
- inode_lock(inode);
- ret = generic_write_checks(iocb, from);
- if (ret > 0)
- ret = __generic_file_write_iter(iocb, from);
- inode_unlock(inode);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_write_iter);

/**
* filemap_release_folio() - Release fs-specific metadata on a folio.

--
2.39.5