[RFC v8 4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting

From: Li Chen

Date: Fri May 15 2026 - 05:32:44 EST


ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. With ext4_fc_del() waiting for
EXT4_STATE_FC_COMMITTING, iput() can trigger
ext4_clear_inode()->ext4_fc_del() in the commit thread and deadlock waiting
for the fast commit to finish.

ext4_fc_del() also has to re-check EXT4_STATE_FC_COMMITTING after
waiting on EXT4_STATE_FC_FLUSHING_DATA. The commit thread clears
FLUSHING_DATA before it sets COMMITTING, so a waiter woken from the
flush wait must not delete the inode based on an old COMMITTING
check.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <chenl311@xxxxxxxxxxxxxxx>
---
Changes in v8:
- Factor out small ext4_fc_wait_inode_state()/ext4_fc_wake_inode_state()
helpers so the repeated FC state wait/wake mapping is kept in one place.
- Re-check EXT4_STATE_FC_COMMITTING after waking from
EXT4_STATE_FC_FLUSHING_DATA in ext4_fc_del(), so list deletion only
happens after both predicates pass under the same s_fc_lock critical
section.

fs/ext4/fast_commit.c | 124 +++++++++++++++++++++++++-----------------
1 file changed, 75 insertions(+), 49 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 673668860e2d..8a6981e50ffe 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -235,6 +235,37 @@ static bool ext4_fc_eligible(struct super_block *sb)
!(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE));
}

+/*
+ * Wait for an inode fast-commit state bit to clear while dropping the
+ * fast-commit lock around schedule().
+ */
+static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
+ int *alloc_ctx)
+{
+ wait_queue_head_t *wq;
+ unsigned long *wait_word = ext4_inode_state_wait_word(inode);
+ int wait_bit = ext4_inode_state_wait_bit(bit);
+
+ while (ext4_test_inode_state(inode, bit)) {
+ DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
+
+ wq = bit_waitqueue(wait_word, wait_bit);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, bit)) {
+ ext4_fc_unlock(inode->i_sb, *alloc_ctx);
+ schedule();
+ *alloc_ctx = ext4_fc_lock(inode->i_sb);
+ }
+ finish_wait(wq, &wait.wq_entry);
+ }
+}
+
+static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
+{
+ wake_up_bit(ext4_inode_state_wait_word(inode),
+ ext4_inode_state_wait_bit(bit));
+}
+
/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
@@ -243,12 +274,6 @@ void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_fc_dentry_update *fc_dentry;
- wait_queue_head_t *wq;
- unsigned long *wait_word = ext4_inode_state_wait_word(inode);
- int committing_wait_bit =
- ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
- int flushing_wait_bit =
- ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
int alloc_ctx;

if (ext4_fc_disabled(inode->i_sb))
@@ -263,32 +288,19 @@ void ext4_fc_del(struct inode *inode)

/*
* Wait for ongoing fast commit to finish. We cannot remove the inode
- * from fast commit lists while it is being committed.
+ * from fast commit lists while it is being committed. If we wake from
+ * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
+ * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
*/
- while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- DEFINE_WAIT_BIT(wait, wait_word, committing_wait_bit);
+ for (;;) {
+ ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
+ &alloc_ctx);

- wq = bit_waitqueue(wait_word, committing_wait_bit);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_unlock(inode->i_sb, alloc_ctx);
- schedule();
- alloc_ctx = ext4_fc_lock(inode->i_sb);
- }
- finish_wait(wq, &wait.wq_entry);
- }
-
- while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
- DEFINE_WAIT_BIT(wait, wait_word, flushing_wait_bit);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
+ break;

- wq = bit_waitqueue(wait_word, flushing_wait_bit);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
- ext4_fc_unlock(inode->i_sb, alloc_ctx);
- schedule();
- alloc_ctx = ext4_fc_lock(inode->i_sb);
- }
- finish_wait(wq, &wait.wq_entry);
+ ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
+ &alloc_ctx);
}

ext4_fc_free_inode_snap(inode);
@@ -1184,13 +1196,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)

alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- inodes[i] = igrab(&iter->vfs_inode);
- if (inodes[i])
- i++;
+ inodes[i++] = &iter->vfs_inode;
}

list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
struct ext4_inode_info *ei;
+ struct inode *inode;

if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
continue;
@@ -1200,12 +1211,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
/* See the comment in ext4_fc_commit_dentry_updates(). */
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
if (!list_empty(&ei->i_fc_list))
continue;

- inodes[i] = igrab(&ei->vfs_inode);
- if (inodes[i])
- i++;
+ /*
+ * Create-only inodes may only be referenced via fcd_dilist and
+ * not appear on s_fc_q[MAIN]. They may hit the last iput while
+ * we are snapshotting, but inode eviction calls ext4_fc_del(),
+ * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+ * so the inode stays pinned and the snapshot stays valid until
+ * ext4_fc_cleanup().
+ */
+ ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ inodes[i++] = inode;
}
ext4_fc_unlock(sb, alloc_ctx);

@@ -1215,10 +1234,6 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
break;
}

- for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
- if (inodes[nr_inodes])
- iput(inodes[nr_inodes]);
- }
kvfree(inodes);
return ret;
}
@@ -1234,8 +1249,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
int alloc_ctx;
- int flushing_wait_bit =
- ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);

/*
* Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1261,8 +1274,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
- wake_up_bit(ext4_inode_state_wait_word(&iter->vfs_inode),
- flushing_wait_bit);
+ ext4_fc_wake_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
}

/*
@@ -1285,8 +1298,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
jbd2_journal_lock_updates(journal);
/*
* The journal is now locked. No more handles can start and all the
- * previous handles are now drained. We now mark the inodes on the
- * commit queue as being committed.
+ * previous handles are now drained. Snapshotting happens in this
+ * window so log writing can consume only stable snapshots without
+ * doing logical-to-physical mapping.
*/
alloc_ctx = ext4_fc_lock(sb);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1482,8 +1496,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
int alloc_ctx;
- int committing_wait_bit =
- ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);

if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
@@ -1521,8 +1533,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
* barrier in prepare_to_wait() in ext4_fc_del().
*/
smp_mb();
- wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
- committing_wait_bit);
+ ext4_fc_wake_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
}

while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
@@ -1537,6 +1549,20 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
struct ext4_inode_info,
i_fc_dilist);
ext4_fc_free_inode_snap(&ei->vfs_inode);
+ spin_lock(&ei->i_fc_lock);
+ ext4_clear_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_REQUEUE);
+ ext4_clear_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ spin_unlock(&ei->i_fc_lock);
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ ext4_fc_wake_inode_state(&ei->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
}
list_del_init(&fc_dentry->fcd_dilist);

--
2.53.0