Re: [PATCH v10 2/5] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Jan Kara
Date: Fri Jun 26 2026 - 12:58:00 EST
On Thu 25-06-26 23:29:38, Yun Zhou wrote:
> Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle
> can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
> creating a lock ordering issue during mount (!SB_ACTIVE).
>
> Add ext4_put_ea_inode() which uses iput_if_not_last() as a fast path.
> If this is not the last reference, it is dropped immediately. If this
> is the last reference, the inode is linked onto a per-sb lock-free llist
> via i_ea_iput_node (embedded in ext4_inode_info, sharing space with the
> unused xattr_sem of EA inodes via a union) and a delayed worker
> (1 jiffie) performs the final iput() in a clean context. This avoids
> per-iput memory allocation.
>
> Convert the first call site: ext4_xattr_block_set()'s "Drop the
> previous xattr block" path, which previously called
> ext4_xattr_inode_array_free() under xattr_sem + jbd2 handle.
>
> The worker is drained in ext4_put_super() before quota shutdown using
> a loop to handle re-arming (evicting an EA inode may queue further EA
> inodes). Initialization is placed before journal loading since fast
> commit replay may trigger evictions that call ext4_put_ea_inode().
>
> Signed-off-by: Yun Zhou <yun.zhou@xxxxxxxxxxxxx>
> Suggested-by: Jan Kara <jack@xxxxxxx>
Thanks for the patch. It looks mostly good. Some comments below.
> @@ -5497,6 +5505,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> ext4_has_feature_orphan_present(sb) ||
> ext4_has_feature_journal_needs_recovery(sb));
>
> + ext4_init_ea_inode_work(sbi);
> +
> if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
> err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
> if (err)
> @@ -5508,6 +5518,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> * The first inode we look at is the journal inode. Don't try
> * root first: it may be modified in the journal!
> */
> +
Stray addition of empty line.
> if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
> err = ext4_load_and_init_journal(sb, es, ctx);
> if (err)
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 982a1f831e22..ecdad5920b14 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
> static int
> ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
> struct inode *inode);
> +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
> + struct ext4_xattr_inode_array *array);
>
> #ifdef CONFIG_LOCKDEP
> void ext4_xattr_inode_set_class(struct inode *ea_inode)
> @@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
> ext4_xattr_release_block(handle, inode, bs->bh,
> &ea_inode_array,
> 0 /* extra_credits */);
> - ext4_xattr_inode_array_free(ea_inode_array);
> + ext4_xattr_inode_array_free_deferred(inode->i_sb,
> + ea_inode_array);
> }
> error = 0;
>
> @@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
> kfree(ea_inode_array);
> }
>
> +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
> + struct ext4_xattr_inode_array *array)
> +{
> + int idx;
> +
> + if (array == NULL)
> + return;
> +
> + for (idx = 0; idx < array->count; ++idx)
> + ext4_put_ea_inode(sb, array->inodes[idx]);
> + kfree(array);
> +}
It's strange to introduce this only to delete it two patches later. I'd
just introduce the mechanism in patch 1. Convert callsites in patch 2,
replace ext4_xattr_inode_array_free() mechanism in patch 3.
> +
> +/*
> + * Worker function for deferred EA inode iput. Processes all inodes queued
> + * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
> + */
> +static void ext4_ea_inode_work(struct work_struct *work)
> +{
> + struct ext4_sb_info *sbi = container_of(to_delayed_work(work),
> + struct ext4_sb_info,
> + s_ea_inode_work);
> + struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
> + struct llist_node *next;
> +
> + while (node) {
> + struct ext4_inode_info *ei = container_of(node,
> + struct ext4_inode_info, i_ea_iput_node);
> + next = node->next;
> + iput(&ei->vfs_inode);
> + node = next;
'next' is actually not needed in this function. You can directly do:
node = node->next;
> +/*
> + * Release a VFS reference on an EA inode. Must be used instead of iput()
> + * in any context where xattr_sem or a jbd2 handle is held.
> + *
> + * If this is not the last reference, drops it immediately via
> + * iput_if_not_last() with no further action needed.
> + *
> + * If this is the last reference, the inode is linked onto a per-sb
> + * llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space
> + * with the unused xattr_sem) and a delayed worker performs the final
> + * iput() in a clean context.
I'd add here: Note that if an inode is in s_ea_inode_to_free list, the
inode reference implicitely associated with that prevents
any future iput_if_not_last() from failing and so nobody will try to add
the inode to s_ea_inode_to_free for the second time until iput() in
ext4_ea_inode_work drops that reference.
> + */
> +void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
> +{
> + if (!inode)
> + return;
> + WARN_ON_ONCE(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL));
> + if (iput_if_not_last(inode))
> + return;
> + llist_add(&EXT4_I(inode)->i_ea_iput_node,
> + &EXT4_SB(sb)->s_ea_inode_to_free);
> + /*
> + * Use a short delay to allow multiple EA inodes to accumulate,
> + * reducing workqueue wakeups when several are released together.
> + */
> + schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1);
> +}
> +
> +void ext4_init_ea_inode_work(struct ext4_sb_info *sbi)
> +{
> + init_llist_head(&sbi->s_ea_inode_to_free);
> + INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
> +}
> +
> /*
> * ext4_xattr_block_cache_insert()
> *
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 1fedf44d4fb6..9883ba5569a1 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -190,6 +190,20 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
> struct ext4_xattr_inode_array **array,
> int extra_credits);
> extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
> +extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
> +extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
> +
> +/*
> + * Drain all pending deferred EA inode iputs. Must be called before
> + * freeing resources that eviction depends on (quota, block allocator).
> + * Loops because worker iput may trigger eviction that re-queues.
> + */
Can you please explain how iput of EA inode can trigger iput of another EA
inode? So far I don't think that's possible but perhaps I'm missing
something.
> +static inline void ext4_drain_ea_inode_work(struct ext4_sb_info *sbi)
> +{
> + while (flush_delayed_work(&sbi->s_ea_inode_work) ||
> + !llist_empty(&sbi->s_ea_inode_to_free))
> + ;
> +}
>
> extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> struct ext4_inode *raw_inode, handle_t *handle);
Honza
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR