Re: [PATCH v11 5/5] ext4: prevent deadlock from duplicate EA inode references on corrupted fs
From: Jan Kara
Date: Mon Jun 29 2026 - 07:58:25 EST
On Mon 29-06-26 19:08:48, Yun Zhou wrote:
> On a corrupted filesystem, multiple xattr entries within the same block
> or ibody may reference the same EA inode. When
> ext4_xattr_inode_dec_ref_all() processes such entries, it decrements the
> EA inode's refcount to zero (clearing nlink) and releases it via
> ext4_put_ea_inode(). If the deferred iput worker begins evicting this
> inode before the loop encounters the duplicate entry, the second iget()
> blocks on I_FREEING while the worker's eviction path waits for the
> caller's jbd2 transaction to commit -- ABBA deadlock.
>
> Fix by keeping a small on-stack array of EA inode numbers whose nlink
> has dropped to zero within each dec_ref_all() invocation. Duplicate
> entries referencing such inodes are skipped before iget, eliminating the
> deadlock window entirely. The array starts at 32 entries (more than
> sufficient for any valid xattr block) and grows dynamically with
> GFP_NOFS | __GFP_NOFAIL for the pathological case.
No, this seems like rather incomplete fix to the problem AFAICT. Any
ext4_iget() of EA inode while having a transaction started can deadlock
like this, not just ext4_xattr_inode_dec_ref_all(). Thus a fix also has to
be more complete, not just local to ext4_xattr_inode_dec_ref_all(). As I
wrote, just drop this patch from now. We can look at fixing this problem
later.
Honza
>
> For legitimate EA inode dedup (same ino appearing multiple times with
> ref_count > 1), dec_ref only clears nlink when ref_count reaches zero,
> so such entries are not added to the skip array and are still properly
> decremented on subsequent encounters.
>
> Signed-off-by: Yun Zhou <yun.zhou@xxxxxxxxxxxxx>
> Tested-by: Xiao Wu <xiaowu.417@xxxxxx>
> ---
> fs/ext4/xattr.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 55 insertions(+)
>
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 01b9aff0cd25..85ad2561474b 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1152,6 +1152,25 @@ static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
> return 0;
> }
>
> +/* Track an EA inode number in the dedup array, expanding if needed. */
> +static void ea_ino_array_add(unsigned int **ea_inos, unsigned int *nr,
> + unsigned int *size, unsigned int *inline_arr,
> + unsigned int ino)
> +{
> + if (*nr == *size) {
> + unsigned int new_size = *size * 2;
> + unsigned int *p;
> +
> + p = kmalloc_array(new_size, sizeof(*p), GFP_NOFS | __GFP_NOFAIL);
> + memcpy(p, *ea_inos, *nr * sizeof(*p));
> + if (*ea_inos != inline_arr)
> + kfree(*ea_inos);
> + *ea_inos = p;
> + *size = new_size;
> + }
> + (*ea_inos)[(*nr)++] = ino;
> +}
> +
> static void
> ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> struct buffer_head *bh,
> @@ -1166,6 +1185,10 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> int err;
> int credits;
> void *end;
> + unsigned int ea_inos_inline[32];
> + unsigned int *ea_inos = ea_inos_inline;
> + unsigned int nr_ea_inos = 0;
> + unsigned int ea_inos_size = ARRAY_SIZE(ea_inos_inline);
>
> if (block_csum)
> end = (void *)bh->b_data + bh->b_size;
> @@ -1183,9 +1206,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
>
> for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
> entry = EXT4_XATTR_NEXT(entry)) {
> + unsigned int i;
> +
> if (!entry->e_value_inum)
> continue;
> ea_ino = le32_to_cpu(entry->e_value_inum);
> +
> + /*
> + * Skip EA inodes we already processed. On a corrupted fs,
> + * duplicate entries may point to the same EA inode; without
> + * this check, the second iget could deadlock on I_FREEING
> + * if the deferred worker already started evicting it.
> + */
> + for (i = 0; i < nr_ea_inos; i++)
> + if (ea_inos[i] == ea_ino)
> + break;
> + if (i < nr_ea_inos)
> + continue;
> +
> err = ext4_xattr_inode_iget(parent, ea_ino,
> le32_to_cpu(entry->e_hash),
> &ea_inode);
> @@ -1218,6 +1256,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> if (err) {
> ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
> err);
> + /* Track if nlink==0 to skip duplicates (eviction risk) */
> + if (!ea_inode->i_nlink)
> + ea_ino_array_add(&ea_inos, &nr_ea_inos,
> + &ea_inos_size, ea_inos_inline, ea_ino);
> +
> ext4_put_ea_inode(parent->i_sb, ea_inode);
> continue;
> }
> @@ -1235,10 +1278,22 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
> entry->e_value_inum = 0;
> entry->e_value_size = 0;
>
> + /*
> + * Track nlink==0 EA inodes to skip duplicates later --
> + * they risk I_FREEING deadlock on re-iget.
> + * Check nlink before put since put releases our reference.
> + */
> + if (!ea_inode->i_nlink)
> + ea_ino_array_add(&ea_inos, &nr_ea_inos,
> + &ea_inos_size, ea_inos_inline, ea_ino);
> +
> ext4_put_ea_inode(parent->i_sb, ea_inode);
> dirty = true;
> }
>
> + if (ea_inos != ea_inos_inline)
> + kfree(ea_inos);
> +
> if (dirty) {
> /*
> * Note that we are deliberately skipping csum calculation for
> --
> 2.43.0
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR