[PATCH 4/5] dax: fix PMD handling for fsync/msync

From: Ross Zwisler
Date: Wed Jan 20 2016 - 22:38:00 EST


Fix the way that DAX PMD radix tree entries are handled. With this patch
we now check whether a PMD entry already exists in the radix tree on write,
even if we are only trying to insert a PTE entry. If a PMD entry exists, we
dirty that PMD entry instead of inserting our own PTE entry.

Fix a bug in the PMD path in dax_writeback_mapping_range() where we were
previously passing a byte offset (loff_t) into radix_tree_lookup() instead
of a page index (pgoff_t).

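Account for the fact that multiple fsync/msync operations may be happening
at the same time: the tagged lookup is bounded only by its start index, so
it can return entries tagged by another caller. Don't flush entries that
are beyond end_index.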

Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
---
fs/dax.c | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 55ae394..3b03580 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -327,19 +327,27 @@ static int copy_user_bh(struct page *to, struct inode *inode,
}

#define NO_SECTOR -1
+#define PMD_INDEX(page_index) ((page_index) & (PMD_MASK >> PAGE_CACHE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
sector_t sector, bool pmd_entry, bool dirty)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
+ pgoff_t pmd_index = PMD_INDEX(index);
int type, error = 0;
void *entry;

__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

spin_lock_irq(&mapping->tree_lock);
- entry = radix_tree_lookup(page_tree, index);

+ entry = radix_tree_lookup(page_tree, pmd_index);
+ if (RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+ index = pmd_index;
+ goto dirty;
+ }
+
+ entry = radix_tree_lookup(page_tree, index);
if (entry) {
type = RADIX_DAX_TYPE(entry);
if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
@@ -460,31 +468,33 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
{
struct inode *inode = mapping->host;
struct block_device *bdev = inode->i_sb->s_bdev;
+ pgoff_t start_index, end_index, pmd_index;
pgoff_t indices[PAGEVEC_SIZE];
- pgoff_t start_page, end_page;
struct pagevec pvec;
- void *entry;
+ bool done = false;
int i, ret = 0;
+ void *entry;

if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;

+ start_index = start >> PAGE_CACHE_SHIFT;
+ end_index = end >> PAGE_CACHE_SHIFT;
+ pmd_index = PMD_INDEX(start_index);
+
rcu_read_lock();
- entry = radix_tree_lookup(&mapping->page_tree, start & PMD_MASK);
+ entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
rcu_read_unlock();

/* see if the start of our range is covered by a PMD entry */
- if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
- start &= PMD_MASK;
-
- start_page = start >> PAGE_CACHE_SHIFT;
- end_page = end >> PAGE_CACHE_SHIFT;
+ if (RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+ start_index = pmd_index;

- tag_pages_for_writeback(mapping, start_page, end_page);
+ tag_pages_for_writeback(mapping, start_index, end_index);

pagevec_init(&pvec, 0);
- while (1) {
- pvec.nr = find_get_entries_tag(mapping, start_page,
+ while (!done) {
+ pvec.nr = find_get_entries_tag(mapping, start_index,
PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
pvec.pages, indices);

@@ -492,6 +502,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
break;

for (i = 0; i < pvec.nr; i++) {
+ if (indices[i] > end_index) {
+ done = true;
+ break;
+ }
+
ret = dax_writeback_one(bdev, mapping, indices[i],
pvec.pages[i]);
if (ret < 0)
--
2.5.0