[PATCH AUTOSEL 5.4 118/134] cifs: move cifsFileInfo_put logic into a work-queue

From: Sasha Levin
Date: Wed Dec 11 2019 - 10:15:10 EST


From: Ronnie Sahlberg <lsahlber@xxxxxxxxxx>

[ Upstream commit 32546a9586aa4565035bb557e191648e022b29e8 ]

This patch moves the final part of the cifsFileInfo_put() logic where we
need a write lock on lock_sem to be processed in a separate thread that
holds no other locks.
This is to prevent deadlocks like the one below:

> there are 6 processes looping to while trying to down_write
> cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from
> cifs_new_fileinfo
>
> and there are 5 other processes which are blocked, several of them
> waiting on either PG_writeback or PG_locked (which are both set), all
> for the same page of the file
>
> 2 inode_lock() (inode->i_rwsem) for the file
> 1 wait_on_page_writeback() for the page
> 1 down_read(inode->i_rwsem) for the inode of the directory
> 1 inode_lock()(inode->i_rwsem) for the inode of the directory
> 1 __lock_page
>
>
> so processes are blocked waiting on:
> page flags PG_locked and PG_writeback for one specific page
> inode->i_rwsem for the directory
> inode->i_rwsem for the file
> cifsInodeInflock_sem
>
>
>
> here are the more gory details (let me know if I need to provide
> anything more/better):
>
> [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3
> COMMAND: "reopen_file"
> #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095
> #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df
> #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7
> #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d
> #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d
> #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33
> #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6
> #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315
> * (I think legitimize_path is bogus)
>
> in path_openat
> } else {
> const char *s = path_init(nd, flags);
> while (!(error = link_path_walk(s, nd)) &&
> (error = do_last(nd, file, op)) > 0) { <<<<
>
> do_last:
> if (open_flag & O_CREAT)
> inode_lock(dir->d_inode); <<<<
> else
> so it's trying to take inode->i_rwsem for the directory
>
> DENTRY INODE SUPERBLK TYPE PATH
> ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/
> inode.i_rwsem is ffff8c691158efc0
>
> <struct rw_semaphore 0xffff8c691158efc0>:
> owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 -
> reopen_file), counter: 0x0000000000000003
> waitlist: 2
> 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926
> RWSEM_WAITING_FOR_WRITE
> 0xffff996500393e00 9802 ls UN 0 1:17:26.700
> RWSEM_WAITING_FOR_READ
>
>
> the owner of the inode.i_rwsem of the directory is:
>
> [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3
> COMMAND: "reopen_file"
> #0 [ffff99650065b828] __schedule at ffffffff9b6e6095
> #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df
> #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89
> #3 [ffff99650065b940] msleep at ffffffff9af573a9
> #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs]
> #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs]
> #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs]
> #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd
> #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs]
> #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs]
> #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs]
> #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7
> #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33
> #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6
> #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315
>
> cifs_launder_page is for page 0xffffd1e2c07d2480
>
> crash> page.index,mapping,flags 0xffffd1e2c07d2480
> index = 0x8
> mapping = 0xffff8c68f3cd0db0
> flags = 0xfffffc0008095
>
> PAGE-FLAG BIT VALUE
> PG_locked 0 0000001
> PG_uptodate 2 0000004
> PG_lru 4 0000010
> PG_waiters 7 0000080
> PG_writeback 15 0008000
>
>
> inode is ffff8c68f3cd0c40
> inode.i_rwsem is ffff8c68f3cd0ce0
> DENTRY INODE SUPERBLK TYPE PATH
> ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG
> /mnt/vm1_smb/testfile.8853
>
>
> this process holds the inode->i_rwsem for the parent directory, is
> laundering a page attached to the inode of the file it's opening, and in
> _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem
> for the file itself.
>
>
> <struct rw_semaphore 0xffff8c68f3cd0ce0>:
> owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 -
> reopen_file), counter: 0x0000000000000003
> waitlist: 1
> 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912
> RWSEM_WAITING_FOR_WRITE
>
> this is the inode.i_rwsem for the file
>
> the owner:
>
> [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2
> COMMAND: "reopen_file"
> #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095
> #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df
> #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2
> #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f
> #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf
> #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c
> #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs]
> #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4
> #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a
> #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs]
> #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd
> #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35
> #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9
> #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315
>
> the process holds the inode->i_rwsem for the file to which it's writing,
> and is trying to __lock_page for the same page as in the other processes
>
>
> the other tasks:
> [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2
> COMMAND: "reopen_file"
> #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095
> #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df
> #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89
> #3 [ffff9965007b3af0] msleep at ffffffff9af573a9
> #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs]
> #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs]
> #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a
> #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f
> #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33
> #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6
> #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315
>
> this is opening the file, and is trying to down_write cinode->lock_sem
>
>
> [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2
> COMMAND: "reopen_file"
> [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3
> COMMAND: "reopen_file"
> [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2
> COMMAND: "reopen_file"
> [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6
> COMMAND: "reopen_file"
> #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095
> #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df
> #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89
> #3 [ffff9965007c3d90] msleep at ffffffff9af573a9
> #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs]
> #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs]
> #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e
> #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614
> #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f
> #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c
>
> closing the file, and trying to down_write cifsi->lock_sem
>
>
> [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7
> COMMAND: "reopen_file"
> #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095
> #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df
> #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2
> #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6
> #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028
> #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165
> #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs]
> #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1
> #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e
> #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315
>
> in __filemap_fdatawait_range
> wait_on_page_writeback(page);
> for the same page of the file
>
>
>
> [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7
> COMMAND: "reopen_file"
> #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095
> #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df
> #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7
> #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs]
> #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd
> #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35
> #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9
> #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315
>
> inode_lock(inode);
>
>
> and one 'ls' later on, to see whether the rest of the mount is available
> (the test file is in the root, so we get blocked up on the directory
> ->i_rwsem), so the entire mount is unavailable
>
> [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4
> COMMAND: "ls"
> #0 [ffff996500393d28] __schedule at ffffffff9b6e6095
> #1 [ffff996500393db8] schedule at ffffffff9b6e64df
> #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421
> #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2
> #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56
> #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c
> #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6
> #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315
>
> in iterate_dir:
> if (shared)
> res = down_read_killable(&inode->i_rwsem); <<<<
> else
> res = down_write_killable(&inode->i_rwsem);
>

Reported-by: Frank Sorenson <sorenson@xxxxxxxxxx>
Reviewed-by: Pavel Shilovsky <pshilov@xxxxxxxxxxxxx>
Signed-off-by: Ronnie Sahlberg <lsahlber@xxxxxxxxxx>
Signed-off-by: Steve French <stfrench@xxxxxxxxxxxxx>
Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
---
fs/cifs/cifsfs.c | 13 +++++++-
fs/cifs/cifsglob.h | 5 +++-
fs/cifs/file.c | 74 ++++++++++++++++++++++++++++++----------------
3 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1a135d1b85bd8..07d8ace61f777 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp;

struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
+struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;

@@ -1554,11 +1555,18 @@ init_cifs(void)
goto out_destroy_cifsiod_wq;
}

+ fileinfo_put_wq = alloc_workqueue("cifsfileinfoput",
+ WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!fileinfo_put_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_decrypt_wq;
+ }
+
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
- goto out_destroy_decrypt_wq;
+ goto out_destroy_fileinfo_put_wq;
}

rc = cifs_fscache_register();
@@ -1624,6 +1632,8 @@ out_unreg_fscache:
cifs_fscache_unregister();
out_destroy_cifsoplockd_wq:
destroy_workqueue(cifsoplockd_wq);
+out_destroy_fileinfo_put_wq:
+ destroy_workqueue(fileinfo_put_wq);
out_destroy_decrypt_wq:
destroy_workqueue(decrypt_wq);
out_destroy_cifsiod_wq:
@@ -1653,6 +1663,7 @@ exit_cifs(void)
cifs_fscache_unregister();
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
+ destroy_workqueue(fileinfo_put_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d78bfcc191560..e0a7dc3f62b29 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1265,6 +1265,7 @@ struct cifsFileInfo {
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
+ struct work_struct put; /* work for the final part of _put */
};

struct cifs_io_parms {
@@ -1370,7 +1371,8 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
}

struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr,
+ bool offload);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);

#define CIFS_CACHE_READ_FLG 1
@@ -1907,6 +1909,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
extern struct workqueue_struct *decrypt_wq;
+extern struct workqueue_struct *fileinfo_put_wq;
extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7b0fa72bb36..a64978a4459d7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -288,6 +288,8 @@ cifs_down_write(struct rw_semaphore *sem)
msleep(10);
}

+static void cifsFileInfo_put_work(struct work_struct *work);
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -325,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->invalidHandle = false;
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
+ INIT_WORK(&cfile->put, cifsFileInfo_put_work);
mutex_init(&cfile->fh_mutex);
spin_lock_init(&cfile->file_info_lock);

@@ -375,6 +378,41 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
return cifs_file;
}

+static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
+{
+ struct inode *inode = d_inode(cifs_file->dentry);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifsLockInfo *li, *tmp;
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Delete any outstanding lock records. We'll lose them when the file
+ * is closed anyway.
+ */
+ cifs_down_write(&cifsi->lock_sem);
+ list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ }
+ list_del(&cifs_file->llist->llist);
+ kfree(cifs_file->llist);
+ up_write(&cifsi->lock_sem);
+
+ cifs_put_tlink(cifs_file->tlink);
+ dput(cifs_file->dentry);
+ cifs_sb_deactive(sb);
+ kfree(cifs_file);
+}
+
+static void cifsFileInfo_put_work(struct work_struct *work)
+{
+ struct cifsFileInfo *cifs_file = container_of(work,
+ struct cifsFileInfo, put);
+
+ cifsFileInfo_put_final(cifs_file);
+}
+
/**
* cifsFileInfo_put - release a reference of file priv data
*
@@ -382,15 +420,15 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
- _cifsFileInfo_put(cifs_file, true);
+ _cifsFileInfo_put(cifs_file, true, true);
}

/**
* _cifsFileInfo_put - release a reference of file priv data
*
* This may involve closing the filehandle @cifs_file out on the
- * server. Must be called without holding tcon->open_file_lock and
- * cifs_file->file_info_lock.
+ * server. Must be called without holding tcon->open_file_lock,
+ * cinode->open_file_lock and cifs_file->file_info_lock.
*
* If @wait_for_oplock_handler is true and we are releasing the last
* reference, wait for any running oplock break handler of the file
@@ -398,7 +436,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
* oplock break handler, you need to pass false.
*
*/
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
+ bool wait_oplock_handler, bool offload)
{
struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
@@ -406,7 +445,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct super_block *sb = inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
bool oplock_break_cancelled;
@@ -467,24 +505,10 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)

cifs_del_pending_open(&open);

- /*
- * Delete any outstanding lock records. We'll lose them when the file
- * is closed anyway.
- */
- cifs_down_write(&cifsi->lock_sem);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
- list_del(&li->llist);
- cifs_del_lock_waiters(li);
- kfree(li);
- }
- list_del(&cifs_file->llist->llist);
- kfree(cifs_file->llist);
- up_write(&cifsi->lock_sem);
-
- cifs_put_tlink(cifs_file->tlink);
- dput(cifs_file->dentry);
- cifs_sb_deactive(sb);
- kfree(cifs_file);
+ if (offload)
+ queue_work(fileinfo_put_wq, &cifs_file->put);
+ else
+ cifsFileInfo_put_final(cifs_file);
}

int cifs_open(struct inode *inode, struct file *file)
@@ -808,7 +832,7 @@ reopen_error_exit:
int cifs_close(struct inode *inode, struct file *file)
{
if (file->private_data != NULL) {
- cifsFileInfo_put(file->private_data);
+ _cifsFileInfo_put(file->private_data, true, false);
file->private_data = NULL;
}

@@ -4680,7 +4704,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
- _cifsFileInfo_put(cfile, false /* do not wait for ourself */);
+ _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}

--
2.20.1