[PATCH v3 20/22] ext4: wait for ordered I/O in the iomap buffered I/O path

From: Zhang Yi

Date: Tue Apr 21 2026 - 22:21:58 EST


From: Zhang Yi <yi.zhang@xxxxxxxxxx>

Wait for ordered I/O to complete before updating i_disksize. This
ensures zeroed data is flushed to disk before the i_disksize metadata is
updated, preventing stale data exposure during unaligned post-EOF append
writes.

Suggested-by: Jan Kara <jack@xxxxxxx>
Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
---
fs/ext4/ext4.h | 11 +++++++++
fs/ext4/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++-----
fs/ext4/page-io.c | 53 ++++++++++++++++++++++++++++++++++++++++
fs/ext4/super.c | 23 +++++++++++++-----
4 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 60ba488b01c5..760400395cb7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1195,6 +1195,15 @@ struct ext4_inode_info {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_inode_info *i_crypt_info;
#endif
+
+ /*
+ * Track the ordered zeroed range during post-EOF append writes,
+ * fallocate, and truncate-up operations. These fields are used only
+ * in the iomap buffered I/O path.
+ */
+ ext4_lblk_t i_ordered_lblk;
+ ext4_lblk_t i_ordered_len;
+ wait_queue_head_t i_ordered_wq;
};

/*
@@ -3877,6 +3886,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 len, __u64 *moved_len);

/* page-io.c */
+#define EXT4_IOMAP_IOEND_ORDER_IO 1UL /* This I/O is an ordered one */
+
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d55899c1ef4c..17bd4403c782 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4352,12 +4352,37 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
{
struct iomap_ioend *ioend = wpc->wb_ctx;
struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+ ext4_lblk_t start, end, order_lblk, order_len;

/* Need to convert unwritten extents when I/Os are completed. */
if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
ioend->io_bio.bi_end_io = ext4_iomap_end_bio;

+ /*
+ * Mark the I/O as ordered. Ordered I/O requires separate endio
+ * handling and must not be merged with regular I/O operations.
+ */
+ order_len = READ_ONCE(ei->i_ordered_len);
+ if (order_len) {
+ /*
+ * Pairs with smp_store_release() in ext4_block_zero_eof().
+ * Ensure we see the updated i_ordered_lblk that was written
+ * before the release store to i_ordered_len.
+ */
+ smp_rmb();
+ order_lblk = READ_ONCE(ei->i_ordered_lblk);
+ start = ioend->io_offset >> ioend->io_inode->i_blkbits;
+ end = EXT4_B_TO_LBLK(ioend->io_inode,
+ ioend->io_offset + ioend->io_size);
+
+ if (start <= order_lblk && end >= order_lblk + order_len) {
+ ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+ ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
+ ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+ }
+ }
+
return iomap_ioend_writeback_submit(wpc, error);
}

@@ -4799,12 +4824,12 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
return err;
/*
* inodes using the iomap buffered I/O path do not use the
- * data=ordered mode. We submit zeroed range here.
- *
- * TODO: The end_io process needs to wait for I/O to completes
- * before updating i_disksize.
+ * data=ordered mode. Submit the zeroed range here. The I/O
+ * completion path waits for it in ext4_iomap_wb_ordered_wait()
+ * before updating i_disksize.
*/
} else if (ext4_inode_buffered_iomap(inode)) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct folio *folio;
bool do_submit = false;

@@ -4818,16 +4843,41 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
folio_wait_writeback(folio);
WARN_ON_ONCE(folio_test_writeback(folio));

- if (likely(folio_test_dirty(folio)))
+ /*
+ * Mark the ordered range. It will be cleared upon
+ * I/O completion in ext4_iomap_end_bio().
+ */
+ if (likely(folio_test_dirty(folio)) &&
+ READ_ONCE(ei->i_ordered_len) == 0) {
+ WRITE_ONCE(ei->i_ordered_lblk,
+ from >> inode->i_blkbits);
+ /*
+ * Pairs with smp_rmb() in
+ * ext4_iomap_writeback_submit() and
+ * ext4_iomap_wb_ordered_wait(). Ensure the
+ * updated i_ordered_lblk is visible when
+ * i_ordered_len becomes non-zero.
+ */
+ smp_store_release(&ei->i_ordered_len, 1);
do_submit = true;
+ }
folio_unlock(folio);
folio_put(folio);

if (do_submit) {
err = filemap_fdatawrite_range(inode->i_mapping,
from, end - 1);
- if (err)
+ if (err) {
+ /*
+ * Pairs with wait_event() in
+ * ext4_iomap_wb_ordered_wait(). Ensure
+ * i_ordered_len = 0 is visible before
+ * waking up waiters.
+ */
+ smp_store_release(&ei->i_ordered_len, 0);
+ wake_up_all(&ei->i_ordered_wq);
return err;
+ }
}
}
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 07978e2cd9c8..9c88671836fe 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -613,6 +613,39 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
return 0;
}

+/*
+ * If the old disk size is not block size aligned and the current
+ * writeback range starts at or beyond the end of the old EOF block,
+ * wait for the zeroed data submitted in ext4_block_zero_eof() to be
+ * written out; otherwise, stale data in that block may be exposed.
+ */
+static void ext4_iomap_wb_ordered_wait(struct inode *inode,
+ loff_t pos, loff_t end)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int blocksize = i_blocksize(inode);
+ loff_t disksize = READ_ONCE(ei->i_disksize);
+ ext4_lblk_t order_lblk, order_len;
+
+ if (!(disksize & (blocksize - 1)) ||
+ pos < round_up(disksize, blocksize))
+ return;
+
+ order_len = READ_ONCE(ei->i_ordered_len);
+ if (!order_len)
+ return;
+
+ /*
+ * Pairs with smp_store_release() in ext4_block_zero_eof(). Ensure
+ * we see the i_ordered_lblk that was written before the release
+ * store that made i_ordered_len non-zero.
+ */
+ smp_rmb();
+ order_lblk = READ_ONCE(ei->i_ordered_lblk);
+ if ((pos >> inode->i_blkbits) >= order_lblk + order_len)
+ wait_event(ei->i_ordered_wq, READ_ONCE(ei->i_ordered_len) == 0);
+}
+
static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
loff_t end)
{
@@ -656,6 +689,9 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
goto out;
}

+ /* Wait for the ordered zeroed data to be written out. */
+ ext4_iomap_wb_ordered_wait(inode, pos, pos + size);
+
/* We may need to convert one extent and dirty the inode. */
credits = ext4_chunk_trans_blocks(inode,
EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
@@ -717,9 +753,26 @@ void ext4_iomap_end_bio(struct bio *bio)
struct inode *inode = ioend->io_inode;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long io_mode = (unsigned long)ioend->io_private;
unsigned long flags;
int ret;

+ /*
+ * This is an ordered I/O, clear the ordered range set in
+ * ext4_block_zero_eof() and wake up all waiters that will update
+ * the inode i_disksize.
+ */
+ if (io_mode == EXT4_IOMAP_IOEND_ORDER_IO) {
+ /*
+ * Pairs with wait_event() in ext4_iomap_wb_ordered_wait().
+ * Ensure i_ordered_len = 0 is visible before waking up
+ * waiters.
+ */
+ smp_store_release(&ei->i_ordered_len, 0);
+ wake_up_all(&ei->i_ordered_wq);
+ goto defer;
+ }
+
/* Needs to convert unwritten extents or update the i_disksize. */
if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b2da4834b6bb..2fc07739c9e8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1444,6 +1444,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ext4_fc_init_inode(&ei->vfs_inode);
spin_lock_init(&ei->i_fc_lock);
mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+ ei->i_ordered_lblk = 0;
+ ei->i_ordered_len = 0;
+ init_waitqueue_head(&ei->i_ordered_wq);
return &ei->vfs_inode;
}

@@ -1480,12 +1483,20 @@ static void ext4_destroy_inode(struct inode *inode)
dump_stack();
}

- if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
- WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
- ext4_msg(inode->i_sb, KERN_ERR,
- "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
- inode->i_ino, EXT4_I(inode),
- EXT4_I(inode)->i_reserved_data_blocks);
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS)) {
+ if (WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
+
+ if (WARN_ON_ONCE(EXT4_I(inode)->i_ordered_len))
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %llu (%p): i_ordered_lblk (%u) and i_ordered_len (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_ordered_lblk,
+ EXT4_I(inode)->i_ordered_len);
+ }
}

static void ext4_shutdown(struct super_block *sb)
--
2.52.0