[PATCH v11 5/5] ext4: prevent deadlock from duplicate EA inode references on corrupted fs
From: Yun Zhou
Date: Mon Jun 29 2026 - 07:10:52 EST
On a corrupted filesystem, multiple xattr entries within the same block
or ibody may reference the same EA inode. When
ext4_xattr_inode_dec_ref_all() processes such entries, it decrements the
EA inode's refcount to zero (clearing nlink) and releases it via
ext4_put_ea_inode(). If the deferred iput worker begins evicting this
inode before the loop encounters the duplicate entry, the second iget()
blocks on I_FREEING while the worker's eviction path waits for the
caller's jbd2 transaction to commit -- ABBA deadlock.
Fix by keeping a small on-stack array of EA inode numbers whose nlink
has dropped to zero within each dec_ref_all() invocation. Duplicate
entries referencing such inodes are skipped before iget, eliminating the
deadlock window entirely. The array starts at 32 entries (more than
sufficient for any valid xattr block) and grows dynamically with
GFP_NOFS | __GFP_NOFAIL for the pathological case.
For legitimate EA inode dedup (same ino appearing multiple times with
ref_count > 1), dec_ref only clears nlink when ref_count reaches zero,
so such entries are not added to the skip array and are still properly
decremented on subsequent encounters.
Signed-off-by: Yun Zhou <yun.zhou@xxxxxxxxxxxxx>
Tested-by: Xiao Wu <xiaowu.417@xxxxxx>
---
fs/ext4/xattr.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 55 insertions(+)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 01b9aff0cd25..85ad2561474b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1152,6 +1152,25 @@ static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
return 0;
}
+/* Track an EA inode number in the dedup array, expanding if needed. */
+static void ea_ino_array_add(unsigned int **ea_inos, unsigned int *nr,
+ unsigned int *size, unsigned int *inline_arr,
+ unsigned int ino)
+{
+ if (*nr == *size) {
+ unsigned int new_size = *size * 2;
+ unsigned int *p;
+
+ p = kmalloc_array(new_size, sizeof(*p), GFP_NOFS | __GFP_NOFAIL);
+ memcpy(p, *ea_inos, *nr * sizeof(*p));
+ if (*ea_inos != inline_arr)
+ kfree(*ea_inos);
+ *ea_inos = p;
+ *size = new_size;
+ }
+ (*ea_inos)[(*nr)++] = ino;
+}
+
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
struct buffer_head *bh,
@@ -1166,6 +1185,10 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
int err;
int credits;
void *end;
+ unsigned int ea_inos_inline[32];
+ unsigned int *ea_inos = ea_inos_inline;
+ unsigned int nr_ea_inos = 0;
+ unsigned int ea_inos_size = ARRAY_SIZE(ea_inos_inline);
if (block_csum)
end = (void *)bh->b_data + bh->b_size;
@@ -1183,9 +1206,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
+ unsigned int i;
+
if (!entry->e_value_inum)
continue;
ea_ino = le32_to_cpu(entry->e_value_inum);
+
+ /*
+ * Skip EA inodes we already processed. On a corrupted fs,
+ * duplicate entries may point to the same EA inode; without
+ * this check, the second iget could deadlock on I_FREEING
+ * if the deferred worker already started evicting it.
+ */
+ for (i = 0; i < nr_ea_inos; i++)
+ if (ea_inos[i] == ea_ino)
+ break;
+ if (i < nr_ea_inos)
+ continue;
+
err = ext4_xattr_inode_iget(parent, ea_ino,
le32_to_cpu(entry->e_hash),
&ea_inode);
@@ -1218,6 +1256,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
if (err) {
ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
err);
+ /* Track if nlink==0 to skip duplicates (eviction risk) */
+ if (!ea_inode->i_nlink)
+ ea_ino_array_add(&ea_inos, &nr_ea_inos,
+ &ea_inos_size, ea_inos_inline, ea_ino);
+
ext4_put_ea_inode(parent->i_sb, ea_inode);
continue;
}
@@ -1235,10 +1278,22 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
entry->e_value_inum = 0;
entry->e_value_size = 0;
+ /*
+ * Track nlink==0 EA inodes to skip duplicates later --
+ * they risk I_FREEING deadlock on re-iget.
+ * Check nlink before put since put releases our reference.
+ */
+ if (!ea_inode->i_nlink)
+ ea_ino_array_add(&ea_inos, &nr_ea_inos,
+ &ea_inos_size, ea_inos_inline, ea_ino);
+
ext4_put_ea_inode(parent->i_sb, ea_inode);
dirty = true;
}
+ if (ea_inos != ea_inos_inline)
+ kfree(ea_inos);
+
if (dirty) {
/*
* Note that we are deliberately skipping csum calculation for
--
2.43.0