[PATCH 33/52] fuse, dax: Take ->i_mmap_sem lock during dax page fault
From: Vivek Goyal
Date: Mon Dec 10 2018 - 12:15:14 EST
We need some kind of locking mechanism here. Normal file systems like
ext4 and xfs seems to take their own semaphore to protect agains
truncate while fault is going on.
We have additional requirement to protect against fuse dax memory range
reclaim. When a range has been selected for reclaim, we need to make sure
no other read/write/fault can try to access that memory range while
reclaim is in progress. Once reclaim is complete, lock will be released
and read/write/fault will trigger allocation of fresh dax range.
Taking inode_lock() is not an option in fault path as lockdep complains
about circular dependencies. So define a new fuse_inode->i_mmap_sem.
Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
---
fs/fuse/dir.c | 2 ++
fs/fuse/file.c | 17 +++++++++++++----
fs/fuse/fuse_i.h | 7 +++++++
fs/fuse/inode.c | 1 +
4 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7e6e421f6bb..8aa4ff82ea7a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1553,8 +1553,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
*/
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ down_write(&fi->i_mmap_sem);
truncate_pagecache(inode, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
+ up_write(&fi->i_mmap_sem);
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index eb12776f5ff6..73068289f62e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2523,13 +2523,20 @@ static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
if (write)
sb_start_pagefault(sb);
- /* TODO inode semaphore to protect faults vs truncate */
-
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ down_read(&get_fuse_inode(inode)->i_mmap_sem);
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+
if (write)
sb_end_pagefault(sb);
@@ -3476,9 +3483,11 @@ static long __fuse_file_fallocate(struct file *file, int mode,
file_update_time(file);
}
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ down_write(&fi->i_mmap_sem);
truncate_pagecache_range(inode, offset, offset + length - 1);
-
+ down_write(&fi->i_mmap_sem);
+ }
fuse_invalidate_attr(inode);
out:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e32b0059493b..280f717deb57 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -211,6 +211,13 @@ struct fuse_inode {
*/
struct rw_semaphore i_dmap_sem;
+ /**
+ * Can't take inode lock in fault path (leads to circular dependency).
+ * So take this in fuse dax fault path to make sure truncate and
+ * punch hole etc. can't make progress in parallel.
+ */
+ struct rw_semaphore i_mmap_sem;
+
/** Sorted rb tree of struct fuse_dax_mapping elements */
struct rb_root_cached dmap_tree;
unsigned long nr_dmaps;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 234b9c0c80ab..59fc5a7a18fc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,6 +85,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->state = 0;
fi->nr_dmaps = 0;
mutex_init(&fi->mutex);
+ init_rwsem(&fi->i_mmap_sem);
init_rwsem(&fi->i_dmap_sem);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
--
2.13.6