[PATCH v3 21/22] ext4: update i_disksize to i_size on ordered I/O completion

From: Zhang Yi

Date: Tue Apr 21 2026 - 22:26:01 EST

From: Zhang Yi <yi.zhang@xxxxxxxxxx>

Currently, i_disksize is updated after ordered data writeback to prevent
exposing stale data in the post-EOF block. However, operations like
fallocate and truncate update i_disksize directly. If the new i_disksize
exceeds the original value, metadata may be written back before the
zeroed data is persisted. To avoid this, we defer i_disksize updates
when i_ordered_len is non-zero, only applying them after ordered I/O
completes.

But this deferral introduces a new problem: on ordered I/O completion,
i_disksize is updated only to the end of that specific I/O, discarding
any later updates (e.g., from fallocate) and causing filesystem
inconsistency. A potential fix would involve scanning for dirty or
writeback folios beyond the current position, then updating i_disksize
to the start of the first such folio or to i_size. However, folio
scanning is expensive and concurrency with operations like fallocate
makes this approach prohibitively complex.

Instead, update i_disksize directly to i_size upon ordered I/O
completion. After a crash, this may expose zeroed data if dirty data
within the range had not yet been written to disk, but it will never
expose stale data. This is limited to unaligned append writes and is
deemed acceptable.

Suggested-by: Jan Kara <jack@xxxxxxx>
Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
---
fs/ext4/ext4.h | 40 +++++++++++++++++++++++++++++++---------
fs/ext4/extents.c | 9 +++------
fs/ext4/inode.c | 3 ---
fs/ext4/page-io.c | 23 ++++++++++++++++++-----
4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 760400395cb7..59dcec47675f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3495,13 +3495,21 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

-/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
+/*
+ * Update i_disksize. Requires i_rwsem to avoid races with truncate.
+ *
+ * In the iomap buffered I/O path, a non-zero i_ordered_len indicates that
+ * an ordered I/O (zeroing the EOF partial block) is still in progress.
+ * In that case, i_disksize will be updated after the ordered data has
+ * been written out.
+ */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
!inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
- if (newsize > EXT4_I(inode)->i_disksize)
+ if (newsize > EXT4_I(inode)->i_disksize &&
+ READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0)
WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
up_write(&EXT4_I(inode)->i_data_sem);
}
@@ -3515,7 +3523,8 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
i_size_write(inode, newsize);
changed = 1;
}
- if (newsize > EXT4_I(inode)->i_disksize) {
+ if (newsize > EXT4_I(inode)->i_disksize &&
+ READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0) {
ext4_update_i_disksize(inode, newsize);
changed |= 2;
}
@@ -3523,19 +3532,32 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
}

/*
- * Set i_size and i_disksize to 'newsize'.
+ * Set i_size and i_disksize to 'newsize'. In the iomap buffered I/O path,
+ * if i_ordered_len is non-zero and newsize exceeds the current i_disksize,
+ * the actual i_disksize update is deferred until after the ordered data is
+ * written out. In that case, i_disksize will be set to i_size upon I/O
+ * completion.
*
* Both i_rwsem and i_data_sem are required here to avoid races between
- * generic append writeback and concurrent truncate that also modify
- * i_size and i_disksize.
+ * generic append writeback (or ordered I/O writeback) and concurrent
+ * operations like fallocate and truncate that also modify i_size and
+ * i_disksize.
*/
-static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
+static inline void __ext4_set_inode_size(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));
+ WARN_ON_ONCE(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

- down_write(&EXT4_I(inode)->i_data_sem);
i_size_write(inode, newsize);
- EXT4_I(inode)->i_disksize = newsize;
+ if (READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0 ||
+ newsize < EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+}
+
+static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
+{
+ down_write(&EXT4_I(inode)->i_data_sem);
+ __ext4_set_inode_size(inode, newsize);
up_write(&EXT4_I(inode)->i_data_sem);
}

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 125f628e738a..e0c36cd920bf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5531,7 +5531,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t start, new_size;
+ loff_t start;
int ret;

trace_ext4_collapse_range(inode, offset, len);
@@ -5597,9 +5597,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
goto out_handle;
}

- new_size = inode->i_size - len;
- i_size_write(inode, new_size);
- EXT4_I(inode)->i_disksize = new_size;
+ __ext4_set_inode_size(inode, inode->i_size - len);

up_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5671,8 +5669,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);

/* Expand file to avoid data loss if there is error while shifting */
- inode->i_size += len;
- EXT4_I(inode)->i_disksize += len;
+ ext4_set_inode_size(inode, inode->i_size + len);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_handle;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17bd4403c782..d983336390c7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4805,9 +4805,6 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
* truncating up or performing an append write, because there might be
* exposing stale on-disk data which may caused by concurrent post-EOF
* mmap write during folio writeback.
- *
- * TODO: In the iomap path, handle this by updating i_disksize to
- * i_size after the zeroed data has been written back.
*/
if (did_zero && zero_written && !IS_DAX(inode)) {
if (ext4_should_order_data(inode)) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9c88671836fe..589c74b9f8a3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -647,13 +647,13 @@ static void ext4_iomap_wb_ordered_wait(struct inode *inode,
}

static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
- loff_t end)
+ loff_t end, bool is_ordered)
{
- loff_t new_disksize = end;
+ loff_t new_disksize, i_size;
struct ext4_inode_info *ei = EXT4_I(inode);
int ret;

- if (new_disksize <= READ_ONCE(ei->i_disksize))
+ if (end <= READ_ONCE(ei->i_disksize) && !is_ordered)
return 0;

/*
@@ -661,7 +661,18 @@ static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
* are avoided by checking i_size under i_data_sem.
*/
down_write(&ei->i_data_sem);
- new_disksize = min(new_disksize, i_size_read(inode));
+ i_size = i_size_read(inode);
+
+ /*
+ * Update i_disksize to i_size when completing an ordered I/O that
+ * zeroes the old EOF partial block. This ensures i_disksize is
+ * correctly advanced during truncate-up on a blocksize-unaligned
+ * file, preventing it from remaining stale. A downside is that
+ * zeroed data may be exposed after crash recovery if the dirty
+ * data in this range is not yet on disk, but stale data will
+ * never be exposed.
+ */
+ new_disksize = is_ordered ? i_size : min(end, i_size);
if (new_disksize > ei->i_disksize)
ei->i_disksize = new_disksize;
up_write(&ei->i_data_sem);
@@ -678,6 +689,7 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
struct super_block *sb = inode->i_sb;
loff_t pos = ioend->io_offset;
size_t size = ioend->io_size;
+ unsigned long io_mode = (unsigned long)ioend->io_private;
handle_t *handle;
int credits;
int ret, err;
@@ -707,7 +719,8 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
goto out_journal;
}

- ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size);
+ ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size,
+ io_mode == EXT4_IOMAP_IOEND_ORDER_IO);
out_journal:
err = ext4_journal_stop(handle);
if (!ret)
--
2.52.0