[PATCH v7 3/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Yun Zhou
Date: Tue Jun 16 2026 - 11:16:57 EST
While the superblock is not yet active (!SB_ACTIVE, i.e. during mount),
calling iput() on an EA inode while holding xattr_sem or a jbd2 handle
can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
creating a lock ordering issue.
Add ext4_put_ea_inode(), which safely releases EA inode references.
When SB_ACTIVE is set, it calls iput() directly (write_inode_now() cannot
be triggered). During mount (!SB_ACTIVE), it queues the inode on a per-sb
lock-free llist and schedules a worker that calls iput() in a clean
context, without holding any ext4 locks.
Convert the "Drop the previous xattr block" path in ext4_xattr_block_set()
to use ext4_xattr_inode_array_free_deferred(), which releases each EA
inode via ext4_put_ea_inode(). This path previously called
ext4_xattr_inode_array_free() (synchronous iput()) while holding xattr_sem
and a jbd2 handle.
The worker is flushed in ext4_put_super() before journal destruction to
ensure all pending EA inode cleanup completes while the journal is still
available.
Signed-off-by: Yun Zhou <yun.zhou@xxxxxxxxxxxxx>
---
fs/ext4/ext4.h | 5 ++++
fs/ext4/super.c | 6 ++++
fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/xattr.h | 2 ++
4 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..690202303269 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1706,6 +1706,11 @@ struct ext4_sb_info {
struct ext4_es_stats s_es_stats;
struct mb_cache *s_ea_block_cache;
struct mb_cache *s_ea_inode_cache;
+
+ /* Deferred iput for EA inodes to avoid lock ordering issues */
+ struct llist_head s_ea_inode_to_free;
+ struct work_struct s_ea_inode_work;
+
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Journal triggers for checksum computation */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..b777bb0a81ea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1308,6 +1308,9 @@ static void ext4_put_super(struct super_block *sb)
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
+ /* Flush deferred EA inode iputs before destroying journal */
+ flush_work(&sbi->s_ea_inode_work);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = ext4_journal_destroy(sbi, sbi->s_journal);
@@ -5535,6 +5538,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
needs_recovery = 0;
}
+ init_llist_head(&sbi->s_ea_inode_to_free);
+ INIT_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
+
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..04e7f674340d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
struct inode *inode);
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+ struct ext4_xattr_inode_array *array);
#ifdef CONFIG_LOCKDEP
void ext4_xattr_inode_set_class(struct inode *ea_inode)
@@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
ext4_xattr_release_block(handle, inode, bs->bh,
&ea_inode_array,
0 /* extra_credits */);
- ext4_xattr_inode_array_free(ea_inode_array);
+ ext4_xattr_inode_array_free_deferred(inode->i_sb,
+ ea_inode_array);
}
error = 0;
@@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
kfree(ea_inode_array);
}
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+ struct ext4_xattr_inode_array *array)
+{
+ int idx;
+
+ if (array == NULL)	/* no array to release */
+ return;
+
+ for (idx = 0; idx < array->count; ++idx)
+ ext4_put_ea_inode(sb, array->inodes[idx]);	/* iput now or deferred */
+ kfree(array);	/* frees only the array; inode refs were handed off above */
+}
+
+struct ext4_ea_iput_entry {
+ struct llist_node node;	/* link on sbi->s_ea_inode_to_free */
+ struct inode *inode;	/* EA inode the worker will iput() */
+};
+
+/*
+ * Worker for deferred EA inode iput. Detaches all entries queued on
+ * s_ea_inode_to_free and iput()s them free of xattr_sem/jbd2 handle locks.
+ */
+void ext4_ea_inode_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+ s_ea_inode_work);
+ struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
+ struct llist_node *next;
+
+ while (node) {
+ struct ext4_ea_iput_entry *entry = container_of(node,
+ struct ext4_ea_iput_entry, node);
+ next = node->next;	/* node is embedded in entry; read before kfree */
+ iput(entry->inode);
+ kfree(entry);
+ node = next;
+ }
+}
+
+/*
+ * Release a VFS reference on an EA inode after ext4_xattr_inode_dec_ref()
+ * may have set i_nlink=0. Must be used instead of iput() in any context
+ * where xattr_sem or a jbd2 handle is held, because eviction of a nlink=0
+ * inode can acquire those same locks. A NULL @inode is ignored.
+ *
+ * When SB_ACTIVE, eviction does not call write_inode_now() so direct
+ * iput() is safe. Otherwise (during mount) queue it for ext4_ea_inode_work().
+ *
+ * For EA inode references dropped without a preceding dec_ref (e.g.,
+ * lookup-only paths where nlink remains >= 1), plain iput() is safe
+ * and preferred.
+ */
+void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
+{
+ struct ext4_ea_iput_entry *entry;
+
+ if (!inode)
+ return;
+ if (sb->s_flags & SB_ACTIVE) {	/* no deferral needed */
+ iput(inode);
+ return;
+ }
+ entry = kmalloc(sizeof(*entry), GFP_NOFS | __GFP_NOFAIL);
+ entry->inode = inode;	/* reference is dropped later by the worker */
+ llist_add(&entry->node, &EXT4_SB(sb)->s_ea_inode_to_free);
+ schedule_work(&EXT4_SB(sb)->s_ea_inode_work);
+}
+
/*
* ext4_xattr_block_cache_insert()
*
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1fedf44d4fb6..52074537dce5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -190,6 +190,8 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct ext4_xattr_inode_array **array,
int extra_credits);
extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+extern void ext4_ea_inode_work(struct work_struct *work);
+extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
--
2.43.0