[PATCH 4.4 03/74] Btrfs: fix deadlock running delayed iputs at transaction commit time

From: Greg Kroah-Hartman
Date: Mon Mar 07 2016 - 19:24:16 EST


4.4-stable review patch. If anyone has any objections, please let me know.

------------------

From: Filipe Manana <fdmanana@xxxxxxxx>

commit c2d6cb1636d235257086f939a8194ef0bf93af6e upstream.

While running a stress test I ran into a deadlock when running the delayed
iputs at transaction time, which produced the following report and trace:

[ 886.399989] =============================================
[ 886.400871] [ INFO: possible recursive locking detected ]
[ 886.401663] 4.4.0-rc6-btrfs-next-18+ #1 Not tainted
[ 886.402384] ---------------------------------------------
[ 886.403182] fio/8277 is trying to acquire lock:
[ 886.403568] (&fs_info->delayed_iput_sem){++++..}, at: [<ffffffffa0538823>] btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.403568]
[ 886.403568] but task is already holding lock:
[ 886.403568] (&fs_info->delayed_iput_sem){++++..}, at: [<ffffffffa0538823>] btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.403568]
[ 886.403568] other info that might help us debug this:
[ 886.403568] Possible unsafe locking scenario:
[ 886.403568]
[ 886.403568] CPU0
[ 886.403568] ----
[ 886.403568] lock(&fs_info->delayed_iput_sem);
[ 886.403568] lock(&fs_info->delayed_iput_sem);
[ 886.403568]
[ 886.403568] *** DEADLOCK ***
[ 886.403568]
[ 886.403568] May be due to missing lock nesting notation
[ 886.403568]
[ 886.403568] 3 locks held by fio/8277:
[ 886.403568] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff81174c4c>] __sb_start_write+0x5f/0xb0
[ 886.403568] #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa054620d>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[ 886.403568] #2: (&fs_info->delayed_iput_sem){++++..}, at: [<ffffffffa0538823>] btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.403568]
[ 886.403568] stack backtrace:
[ 886.403568] CPU: 6 PID: 8277 Comm: fio Not tainted 4.4.0-rc6-btrfs-next-18+ #1
[ 886.403568] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 886.403568] 0000000000000000 ffff88009f80f770 ffffffff8125d4fd ffffffff82af1fc0
[ 886.403568] ffff88009f80f830 ffffffff8108e5f9 0000000200000000 ffff88009fd92290
[ 886.403568] 0000000000000000 ffffffff82af1fc0 ffffffff829cfb01 00042b216d008804
[ 886.403568] Call Trace:
[ 886.403568] [<ffffffff8125d4fd>] dump_stack+0x4e/0x79
[ 886.403568] [<ffffffff8108e5f9>] __lock_acquire+0xd42/0xf0b
[ 886.403568] [<ffffffff810c22db>] ? __module_address+0xdf/0x108
[ 886.403568] [<ffffffff8108eb77>] lock_acquire+0x10d/0x194
[ 886.403568] [<ffffffff8108eb77>] ? lock_acquire+0x10d/0x194
[ 886.403568] [<ffffffffa0538823>] ? btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.489542] [<ffffffff8148556b>] down_read+0x3e/0x4d
[ 886.489542] [<ffffffffa0538823>] ? btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.489542] [<ffffffffa0538823>] btrfs_run_delayed_iputs+0x36/0xbf [btrfs]
[ 886.489542] [<ffffffffa0533953>] btrfs_commit_transaction+0x8f5/0x96e [btrfs]
[ 886.489542] [<ffffffffa0521d7a>] flush_space+0x435/0x44a [btrfs]
[ 886.489542] [<ffffffffa052218b>] ? reserve_metadata_bytes+0x26a/0x384 [btrfs]
[ 886.489542] [<ffffffffa05221ae>] reserve_metadata_bytes+0x28d/0x384 [btrfs]
[ 886.489542] [<ffffffffa052256c>] ? btrfs_block_rsv_refill+0x58/0x96 [btrfs]
[ 886.489542] [<ffffffffa0522584>] btrfs_block_rsv_refill+0x70/0x96 [btrfs]
[ 886.489542] [<ffffffffa053d747>] btrfs_evict_inode+0x394/0x55a [btrfs]
[ 886.489542] [<ffffffff81188e31>] evict+0xa7/0x15c
[ 886.489542] [<ffffffff81189878>] iput+0x1d3/0x266
[ 886.489542] [<ffffffffa053887c>] btrfs_run_delayed_iputs+0x8f/0xbf [btrfs]
[ 886.489542] [<ffffffffa0533953>] btrfs_commit_transaction+0x8f5/0x96e [btrfs]
[ 886.489542] [<ffffffff81085096>] ? signal_pending_state+0x31/0x31
[ 886.489542] [<ffffffffa0521191>] btrfs_alloc_data_chunk_ondemand+0x1d7/0x288 [btrfs]
[ 886.489542] [<ffffffffa0521282>] btrfs_check_data_free_space+0x40/0x59 [btrfs]
[ 886.489542] [<ffffffffa05228f5>] btrfs_delalloc_reserve_space+0x1e/0x4e [btrfs]
[ 886.489542] [<ffffffffa053620a>] btrfs_direct_IO+0x10c/0x27e [btrfs]
[ 886.489542] [<ffffffff8111d9a1>] generic_file_direct_write+0xb3/0x128
[ 886.489542] [<ffffffffa05463c3>] btrfs_file_write_iter+0x229/0x408 [btrfs]
[ 886.489542] [<ffffffff8108ae38>] ? __lock_is_held+0x38/0x50
[ 886.489542] [<ffffffff8117279e>] __vfs_write+0x7c/0xa5
[ 886.489542] [<ffffffff81172cda>] vfs_write+0xa0/0xe4
[ 886.489542] [<ffffffff811734cc>] SyS_write+0x50/0x7e
[ 886.489542] [<ffffffff814872d7>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 1081.852335] INFO: task fio:8244 blocked for more than 120 seconds.
[ 1081.854348] Not tainted 4.4.0-rc6-btrfs-next-18+ #1
[ 1081.857560] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1081.863227] fio D ffff880213f9bb28 0 8244 8240 0x00000000
[ 1081.868719] ffff880213f9bb28 00ffffff810fc6b0 ffffffff0000000a ffff88023ed55240
[ 1081.872499] ffff880206b5d400 ffff880213f9c000 ffff88020a4d5318 ffff880206b5d400
[ 1081.876834] ffffffff00000001 ffff880206b5d400 ffff880213f9bb40 ffffffff81482ba4
[ 1081.880782] Call Trace:
[ 1081.881793] [<ffffffff81482ba4>] schedule+0x7f/0x97
[ 1081.883340] [<ffffffff81485eb5>] rwsem_down_write_failed+0x2d5/0x325
[ 1081.895525] [<ffffffff8108d48d>] ? trace_hardirqs_on_caller+0x16/0x1ab
[ 1081.897419] [<ffffffff81269723>] call_rwsem_down_write_failed+0x13/0x20
[ 1081.899251] [<ffffffff81269723>] ? call_rwsem_down_write_failed+0x13/0x20
[ 1081.901063] [<ffffffff81089fae>] ? __down_write_nested.isra.0+0x1f/0x21
[ 1081.902365] [<ffffffff814855bd>] down_write+0x43/0x57
[ 1081.903846] [<ffffffffa05211b0>] ? btrfs_alloc_data_chunk_ondemand+0x1f6/0x288 [btrfs]
[ 1081.906078] [<ffffffffa05211b0>] btrfs_alloc_data_chunk_ondemand+0x1f6/0x288 [btrfs]
[ 1081.908846] [<ffffffff8108d461>] ? mark_held_locks+0x56/0x6c
[ 1081.910409] [<ffffffffa0521282>] btrfs_check_data_free_space+0x40/0x59 [btrfs]
[ 1081.912482] [<ffffffffa05228f5>] btrfs_delalloc_reserve_space+0x1e/0x4e [btrfs]
[ 1081.914597] [<ffffffffa053620a>] btrfs_direct_IO+0x10c/0x27e [btrfs]
[ 1081.919037] [<ffffffff8111d9a1>] generic_file_direct_write+0xb3/0x128
[ 1081.920754] [<ffffffffa05463c3>] btrfs_file_write_iter+0x229/0x408 [btrfs]
[ 1081.922496] [<ffffffff8108ae38>] ? __lock_is_held+0x38/0x50
[ 1081.923922] [<ffffffff8117279e>] __vfs_write+0x7c/0xa5
[ 1081.925275] [<ffffffff81172cda>] vfs_write+0xa0/0xe4
[ 1081.926584] [<ffffffff811734cc>] SyS_write+0x50/0x7e
[ 1081.927968] [<ffffffff814872d7>] entry_SYSCALL_64_fastpath+0x12/0x6f
[ 1081.985293] INFO: lockdep is turned off.
[ 1081.986132] INFO: task fio:8249 blocked for more than 120 seconds.
[ 1081.987434] Not tainted 4.4.0-rc6-btrfs-next-18+ #1
[ 1081.988534] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1081.990147] fio D ffff880218febbb8 0 8249 8240 0x00000000
[ 1081.991626] ffff880218febbb8 00ffffff81486b8e ffff88020000000b ffff88023ed75240
[ 1081.993258] ffff8802120a9a00 ffff880218fec000 ffff88020a4d5318 ffff8802120a9a00
[ 1081.994850] ffffffff00000001 ffff8802120a9a00 ffff880218febbd0 ffffffff81482ba4
[ 1081.996485] Call Trace:
[ 1081.997037] [<ffffffff81482ba4>] schedule+0x7f/0x97
[ 1081.998017] [<ffffffff81485eb5>] rwsem_down_write_failed+0x2d5/0x325
[ 1081.999241] [<ffffffff810852a5>] ? finish_wait+0x6d/0x76
[ 1082.000306] [<ffffffff81269723>] call_rwsem_down_write_failed+0x13/0x20
[ 1082.001533] [<ffffffff81269723>] ? call_rwsem_down_write_failed+0x13/0x20
[ 1082.002776] [<ffffffff81089fae>] ? __down_write_nested.isra.0+0x1f/0x21
[ 1082.003995] [<ffffffff814855bd>] down_write+0x43/0x57
[ 1082.005000] [<ffffffffa05211b0>] ? btrfs_alloc_data_chunk_ondemand+0x1f6/0x288 [btrfs]
[ 1082.007403] [<ffffffffa05211b0>] btrfs_alloc_data_chunk_ondemand+0x1f6/0x288 [btrfs]
[ 1082.008988] [<ffffffffa0545064>] btrfs_fallocate+0x7c1/0xc2f [btrfs]
[ 1082.010193] [<ffffffff8108a1ba>] ? percpu_down_read+0x4e/0x77
[ 1082.011280] [<ffffffff81174c4c>] ? __sb_start_write+0x5f/0xb0
[ 1082.012265] [<ffffffff81174c4c>] ? __sb_start_write+0x5f/0xb0
[ 1082.013021] [<ffffffff811712e4>] vfs_fallocate+0x170/0x1ff
[ 1082.013738] [<ffffffff81181ebb>] ioctl_preallocate+0x89/0x9b
[ 1082.014778] [<ffffffff811822d7>] do_vfs_ioctl+0x40a/0x4ea
[ 1082.015778] [<ffffffff81176ea7>] ? SYSC_newfstat+0x25/0x2e
[ 1082.016806] [<ffffffff8118b4de>] ? __fget_light+0x4d/0x71
[ 1082.017789] [<ffffffff8118240e>] SyS_ioctl+0x57/0x79
[ 1082.018706] [<ffffffff814872d7>] entry_SYSCALL_64_fastpath+0x12/0x6f

This happens because we can recursively acquire the semaphore
fs_info->delayed_iput_sem when attempting to allocate space to satisfy
a file write request as shown in the first trace above - when committing
a transaction we acquire (down_read) the semaphore before running the
delayed iputs, and when running a delayed iput() we can end up calling
an inode's eviction handler, which in turn commits another transaction
and attempts to acquire (down_read) again the semaphore to run more
delayed iput operations.
This results in a deadlock because if a task acquires multiple times a
semaphore it should invoke down_read_nested() with a different lockdep
class for each level of recursion.

Fix this by simplifying the implementation and use a mutex instead that
is acquired by the cleaner kthread before it runs the delayed iputs
instead of always acquiring a semaphore before delayed references are
run from anywhere.

Fixes: d7c151717a1e (btrfs: Fix NO_SPACE bug caused by delayed-iput)
Signed-off-by: Filipe Manana <fdmanana@xxxxxxxx>
Signed-off-by: Chris Mason <clm@xxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
fs/btrfs/ctree.h | 2 +-
fs/btrfs/disk-io.c | 5 ++++-
fs/btrfs/extent-tree.c | 9 +++++----
fs/btrfs/inode.c | 4 ----
4 files changed, 10 insertions(+), 10 deletions(-)

--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1572,7 +1572,7 @@ struct btrfs_fs_info {

spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct rw_semaphore delayed_iput_sem;
+ struct mutex cleaner_delayed_iput_mutex;

/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1796,7 +1796,10 @@ static int cleaner_kthread(void *arg)
goto sleep;
}

+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);

@@ -2556,8 +2559,8 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
- init_rwsem(&fs_info->delayed_iput_sem);

INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4100,11 +4100,12 @@ commit_trans:
if (ret)
return ret;
/*
- * make sure that all running delayed iput are
- * done
+ * The cleaner kthread might still be doing iput
+ * operations. Wait for it to finish so that
+ * more space is released.
*/
- down_write(&root->fs_info->delayed_iput_sem);
- up_write(&root->fs_info->delayed_iput_sem);
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
goto again;
} else {
btrfs_end_transaction(trans, root);
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3142,8 +3142,6 @@ void btrfs_run_delayed_iputs(struct btrf
if (empty)
return;

- down_read(&fs_info->delayed_iput_sem);
-
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -3154,8 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrf
iput(delayed->inode);
kfree(delayed);
}
-
- up_read(&root->fs_info->delayed_iput_sem);
}

/*