[PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Zhang Yi
Date: Mon May 11 2026 - 03:37:40 EST
From: Zhang Yi <yi.zhang@xxxxxxxxxx>
Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
ext4_iomap_block_zero_range() to implement block zeroing via the iomap
infrastructure for ext4.
ext4_iomap_block_zero_range() calls iomap_zero_range() with
ext4_iomap_zero_begin() as the callback. The callback locates and zeros
out either a mapped partial block or a dirty, unwritten partial block.
Important constraints:
Zeroing out under an active journal handle can cause deadlock, because
the order of acquiring the folio lock and starting a handle is
inconsistent with the iomap writeback path.
Therefore, ext4_iomap_block_zero_range():
- Must NOT be called under an active handle.
- Cannot rely on data=ordered mode to ensure zeroed data persistence
before updating i_disksize (for the cases of post-EOF append write,
post-EOF fallocate, and truncate up). In subsequent patches, we will
address this by synchronizing commit I/O but doesn't waiting for
completion, and updating i_disksize to i_size only after the zeroed
data has been written back.
Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
---
fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 92 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6fe42d012fc..e0dae2501292 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
return 0;
}
+static int ext4_iomap_zero_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
+ unsigned int iomap_flags = 0;
+ int ret;
+
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
+
+ if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
+ return -EINVAL;
+
+ ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF. Providing
+ * this bypasses the flush iomap uses to trigger extent conversion
+ * when unwritten mappings have dirty pagecache in need of zeroing.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ loff_t start = ((loff_t)map.m_lblk) << blkbits;
+ loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
+
+ iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
+ if ((start >> blkbits) < map.m_lblk + map.m_len)
+ map.m_len = (start >> blkbits) - map.m_lblk;
+ }
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+ iomap->flags |= iomap_flags;
+
+ return 0;
+}
+
+static const struct iomap_ops ext4_iomap_zero_ops = {
+ .iomap_begin = ext4_iomap_zero_begin,
+};
+
/*
* Since we always allocate unwritten extents, there is no need for
* iomap_end to clean up allocated blocks on a short write.
@@ -4616,6 +4661,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
return err;
}
+static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from,
+ loff_t length, bool *did_zero,
+ bool *zero_written)
+{
+ int ret;
+
+ /*
+ * Zeroing out under an active handle can cause deadlock since
+ * the order of acquiring the folio lock and starting a handle is
+ * inconsistent with the iomap writeback procedure.
+ */
+ if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle())))
+ return -EINVAL;
+
+ /* The zeroing scope should not extend across a block. */
+ if (WARN_ON_ONCE((from >> inode->i_blkbits) !=
+ ((from + length - 1) >> inode->i_blkbits)))
+ return -EINVAL;
+
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) &&
+ !(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+
+ ret = iomap_zero_range(inode, from, length, did_zero,
+ &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
+ NULL);
+ if (ret)
+ return ret;
+
+ /*
+ * TODO: The iomap does not distinguish between different types of
+ * zeroing and always sets zero_written if a zeroing operation is
+ * performed, which may result in unnecessary order operations.
+ */
+ if (did_zero && zero_written)
+ *zero_written = *did_zero;
+
+ return 0;
+}
+
/*
* Zeros out a mapping of length 'length' starting from file offset
* 'from'. The range to be zero'd must be contained with in one block.
@@ -4642,6 +4728,9 @@ static int ext4_block_zero_range(struct inode *inode,
} else if (ext4_should_journal_data(inode)) {
return ext4_block_journalled_zero_range(inode, from, length,
did_zero);
+ } else if (ext4_inode_buffered_iomap(inode)) {
+ return ext4_block_iomap_zero_range(inode, from, length,
+ did_zero, zero_written);
}
return ext4_block_do_zero_range(inode, from, length, did_zero,
zero_written);
@@ -4682,6 +4771,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
* truncating up or performing an append write, because there might be
* exposing stale on-disk data which may caused by concurrent post-EOF
* mmap write during folio writeback.
+ *
+ * TODO: In the iomap path, handle this by updating i_disksize to
+ * i_size after the zeroed data has been written back.
*/
if (ext4_should_order_data(inode) &&
did_zero && zero_written && !IS_DAX(inode)) {
--
2.52.0