[PATCH] xfs: avoid inodegc worker flush deadlock

From: ZhengYuan Huang

Date: Sat Mar 28 2026 - 03:13:13 EST


[BUG]
WARNING: possible recursive locking detected
--------------------------------------------
kworker/0:1/10 is trying to acquire lock:
ffff88801621fd48 ((wq_completion)xfs-inodegc/ublkb1){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x99/0x1c0 kernel/workqueue.c:3936

but task is already holding lock:
ffff88801621fd48 ((wq_completion)xfs-inodegc/ublkb1){+.+.}-{0:0}, at: process_one_work+0x1188/0x1980 kernel/workqueue.c:3238

other info that might help us debug this:
Possible unsafe locking scenario:

CPU0
----
lock((wq_completion)xfs-inodegc/ublkb1);
lock((wq_completion)xfs-inodegc/ublkb1);

*** DEADLOCK ***

May be due to missing lock nesting notation

2 locks held by kworker/0:1/10:
#0: ffff88801621fd48 ((wq_completion)xfs-inodegc/ublkb1){+.+.}-{0:0}, at: process_one_work+0x1188/0x1980 kernel/workqueue.c:3238
#1: ffff888009dafce8 ((work_completion)(&(&gc->work)->work)){+.+.}-{0:0}, at: process_one_work+0x865/0x1980 kernel/workqueue.c:3239

stack backtrace:
Workqueue: xfs-inodegc/ublkb1 xfs_inodegc_worker
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:94 [inline]
dump_stack_lvl+0xbe/0x130 lib/dump_stack.c:120
dump_stack+0x15/0x20 lib/dump_stack.c:129
print_deadlock_bug+0x23f/0x320 kernel/locking/lockdep.c:3041
check_deadlock kernel/locking/lockdep.c:3093 [inline]
validate_chain kernel/locking/lockdep.c:3895 [inline]
__lock_acquire+0x1317/0x21e0 kernel/locking/lockdep.c:5237
lock_acquire kernel/locking/lockdep.c:5868 [inline]
lock_acquire+0x169/0x2f0 kernel/locking/lockdep.c:5825
touch_wq_lockdep_map+0xab/0x1c0 kernel/workqueue.c:3936
__flush_workqueue+0x117/0x1010 kernel/workqueue.c:3978
xfs_inodegc_wait_all fs/xfs/xfs_icache.c:495 [inline]
xfs_inodegc_flush+0x9a/0x390 fs/xfs/xfs_icache.c:2020
xfs_blockgc_flush_all+0x106/0x250 fs/xfs/xfs_icache.c:1614
xfs_trans_alloc+0x5e4/0xc10 fs/xfs/xfs_trans.c:268
xfs_inactive_ifree+0x329/0x3c0 fs/xfs/xfs_inode.c:1224
xfs_inactive+0x590/0xb60 fs/xfs/xfs_inode.c:1485
xfs_inodegc_inactivate fs/xfs/xfs_icache.c:1942 [inline]
xfs_inodegc_worker+0x241/0x650 fs/xfs/xfs_icache.c:1988
process_one_work+0x8e0/0x1980 kernel/workqueue.c:3263
process_scheduled_works kernel/workqueue.c:3346 [inline]
worker_thread+0x683/0xf80 kernel/workqueue.c:3427
kthread+0x3f0/0x850 kernel/kthread.c:463
ret_from_fork+0x50f/0x610 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
</TASK>

[CAUSE]

If xfs_trans_alloc() hits -ENOSPC while xfs_inodegc_worker() is
inactivating an unlinked inode, the retry path runs
xfs_blockgc_flush_all() and recurses into xfs_inodegc_flush().
xfs_inodegc_wait_all() then calls flush_workqueue() on m_inodegc_wq
from an inodegc worker, which waits for the current in-flight work
item and deadlocks.

[FIX]

Detect when xfs_inodegc_wait_all() is running from an inodegc worker
and flush every other per-cpu inodegc work item directly instead of
flushing the whole workqueue. This preserves the intent of waiting
for background inodegc reclaim while avoiding recursion on the current
worker. Also collect inodegc errors from all possible CPUs because
running workers clear their cpumask bit before processing inodes.

Fixes: d4d12c02bf5f ("xfs: collect errors from inodegc for unlinked inode recovery")
Signed-off-by: ZhengYuan Huang <gality369@xxxxxxxxx>
---
fs/xfs/xfs_icache.c | 50 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e44040206851..cdb707332b4b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -484,16 +484,64 @@ xfs_inodegc_queue_all(
 	return ret;
 }
 
+/*
+ * flush_workqueue() waits for all in-flight work items, including the current
+ * one. If xfs_trans_alloc() hits ENOSPC while an inodegc worker is freeing an
+ * unlinked inode, xfs_blockgc_flush_all() recurses into xfs_inodegc_flush().
+ * Waiting for the current worker there deadlocks because the flush cannot
+ * complete until this work function returns.
+ */
+static struct xfs_inodegc *
+xfs_inodegc_current(struct xfs_mount *mp)
+{
+	struct work_struct	*work = current_work();
+	int			cpu;
+
+	if (!work)
+		return NULL;
+
+	for_each_possible_cpu(cpu) {
+		struct xfs_inodegc	*gc = per_cpu_ptr(mp->m_inodegc, cpu);
+
+		if (work == &gc->work.work)
+			return gc;
+	}
+
+	return NULL;
+}
+
 /* Wait for all queued work and collect errors */
 static int
 xfs_inodegc_wait_all(
 	struct xfs_mount	*mp)
 {
+	struct xfs_inodegc	*current_gc = xfs_inodegc_current(mp);
 	int			cpu;
 	int			error = 0;
 
+	if (current_gc) {
+		/*
+		 * current_gc is already in flight, so waiting for the whole
+		 * workqueue would recurse on ourselves. Flush every other
+		 * per-cpu work item instead so that ENOSPC retries still wait
+		 * for the rest of the inodegc work to finish.
+		 */
+		for_each_possible_cpu(cpu) {
+			struct xfs_inodegc	*gc;
+
+			gc = per_cpu_ptr(mp->m_inodegc, cpu);
+			if (gc == current_gc)
+				continue;
+			flush_delayed_work(&gc->work);
+			if (gc->error && !error)
+				error = gc->error;
+			gc->error = 0;
+		}
+		return error;
+	}
+
 	flush_workqueue(mp->m_inodegc_wq);
-	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
+	for_each_possible_cpu(cpu) {
 		struct xfs_inodegc	*gc;
 
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
--
2.43.0