[PATCH v2 1/2] f2fs: dax: fix races between page faults and truncating pages
From: sunqiuyang
Date: Wed May 17 2017 - 23:04:41 EST
From: Qiuyang Sun <sunqiuyang@xxxxxxxxxx>
Currently in F2FS, page faults and operations that truncate the pagecahe
or data blocks, are completely unsynchronized. This can result in page
fault faulting in a page into a range that we are changing after
truncating, and thus we can end up with a page mapped to disk blocks that
will be shortly freed. Filesystem corruption will shortly follow.
This patch fixes the problem by creating new rw semaphore i_mmap_sem in
f2fs_inode_info and grab it for functions removing blocks from extent tree
and for read over page faults. The mechanism is similar to that in ext4.
Signed-off-by: Qiuyang Sun <sunqiuyang@xxxxxxxxxx>
---
Changelog v1 -> v2:
- Apply the new rw semaphore in some other necessary scenarios:
f2fs_write_failed
f2fs_filemap_fault (new function)
f2fs_vm_page_mkwrite
f2fs_setattr
(f2fs_add_inline_entries() does not need this rw semaphore, as dir is a
directory file and its pages would not be mmap'ed.)
- Lock coverage in the scenarios below are reconsidered:
punch_hole
f2fs_collapse_range
f2fs_zero_range
f2fs_insert_range
The v2 patches are at 4.12-rc1.
---
fs/f2fs/data.c | 2 ++
fs/f2fs/f2fs.h | 1 +
fs/f2fs/file.c | 48 +++++++++++++++++++++++++++++++++++++++---------
fs/f2fs/super.c | 1 +
4 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7c0f6bd..c9a3fbd 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1753,8 +1753,10 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
loff_t i_size = i_size_read(inode);
if (to > i_size) {
+ down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
truncate_blocks(inode, i_size, true);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
}
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2185c7a..8095f4f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -519,6 +519,7 @@ struct f2fs_inode_info {
struct mutex inmem_lock; /* lock for inmemory pages */
struct extent_tree *extent_tree; /* cached extent_tree entry */
struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
+ struct rw_semaphore i_mmap_sem;
};
static inline void get_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 61af721..0b0115c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,6 +33,18 @@
#include "trace.h"
#include <trace/events/f2fs.h>
+static int f2fs_filemap_fault(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ int err;
+
+ down_read(&F2FS_I(inode)->i_mmap_sem);
+ err = filemap_fault(vmf);
+ up_read(&F2FS_I(inode)->i_mmap_sem);
+
+ return err;
+}
+
static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
@@ -59,13 +71,14 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_balance_fs(sbi, dn.node_changed);
file_update_time(vmf->vma->vm_file);
+ down_read(&F2FS_I(inode)->i_mmap_sem);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
!PageUptodate(page))) {
unlock_page(page);
err = -EFAULT;
- goto out;
+ goto out_sem;
}
/*
@@ -94,6 +107,8 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+out_sem:
+ up_read(&F2FS_I(inode)->i_mmap_sem);
out:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
@@ -101,7 +116,7 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = f2fs_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
};
@@ -687,8 +702,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
return -EACCES;
if (attr->ia_size <= i_size_read(inode)) {
+ down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_setsize(inode, attr->ia_size);
err = f2fs_truncate(inode);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
if (err)
return err;
} else {
@@ -696,7 +713,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
+ down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_setsize(inode, attr->ia_size);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
/* should convert inline inode here */
if (!f2fs_may_inline_data(inode)) {
@@ -839,12 +858,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
+ down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
f2fs_lock_op(sbi);
ret = truncate_hole(inode, pg_start, pg_end);
f2fs_unlock_op(sbi);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
}
}
@@ -1083,16 +1104,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
pg_start = offset >> PAGE_SHIFT;
pg_end = (offset + len) >> PAGE_SHIFT;
+ down_write(&F2FS_I(inode)->i_mmap_sem);
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
if (ret)
- return ret;
+ goto out;
truncate_pagecache(inode, offset);
ret = f2fs_do_collapse(inode, pg_start, pg_end);
if (ret)
- return ret;
+ goto out;
/* write out all moved pages, if possible */
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -1105,6 +1127,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (!ret)
f2fs_i_size_write(inode, new_size);
+out:
+ up_write(&F2FS_I(inode)->i_mmap_sem);
return ret;
}
@@ -1169,9 +1193,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
+ down_write(&F2FS_I(inode)->i_mmap_sem);
ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
if (ret)
- return ret;
+ goto out_sem;
truncate_pagecache_range(inode, offset, offset + len - 1);
@@ -1185,7 +1210,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = fill_zero(inode, pg_start, off_start,
off_end - off_start);
if (ret)
- return ret;
+ goto out_sem;
new_size = max_t(loff_t, new_size, offset + len);
} else {
@@ -1193,7 +1218,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = fill_zero(inode, pg_start++, off_start,
PAGE_SIZE - off_start);
if (ret)
- return ret;
+ goto out_sem;
new_size = max_t(loff_t, new_size,
(loff_t)pg_start << PAGE_SHIFT);
@@ -1242,6 +1267,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
out:
if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
f2fs_i_size_write(inode, new_size);
+out_sem:
+ up_write(&F2FS_I(inode)->i_mmap_sem);
return ret;
}
@@ -1271,14 +1298,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
ret = truncate_blocks(inode, i_size_read(inode), true);
if (ret)
- return ret;
+ goto out;
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
if (ret)
- return ret;
+ goto out;
truncate_pagecache(inode, offset);
@@ -1307,6 +1335,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (!ret)
f2fs_i_size_write(inode, new_size);
+out:
+ up_write(&F2FS_I(inode)->i_mmap_sem);
return ret;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 83355ec..8472c98 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -624,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
mutex_init(&fi->inmem_lock);
init_rwsem(&fi->dio_rwsem[READ]);
init_rwsem(&fi->dio_rwsem[WRITE]);
+ init_rwsem(&fi->i_mmap_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
--
1.8.3.1