[PATCH 3/3] dax: Handle write faults more efficiently
From: Matthew Wilcox
Date: Mon Jan 25 2016 - 12:25:50 EST
From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
When we handle a write-fault on a DAX mapping, we currently insert a
read-only mapping and then take the page fault again to convert it to
a writable mapping. This is necessary for the case where we cover a
hole with a read-only zero page, but when we have a data block already
allocated, it is inefficient.
Use the recently added vmf_insert_pfn_prot() to insert a writable mapping,
even though the default VM flags say to use a read-only mapping.
Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
---
fs/dax.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 53 insertions(+), 20 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 206650f..3f6138d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -519,9 +519,44 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+/*
+ * The default page protections for DAX VMAs are set to "copy" so that
+ * we get notifications when zero pages are written to. This function
+ * is called when we're inserting a mapping to a data page. If this is
+ * a write fault, we've already done all the necessary accounting and
+ * it's pointless to insert this translation entry read-only. Convert
+ * the pgprot to be writable.
+ *
+ * While this is not the most elegant code, the compiler can see that (on
+ * any sane architecture) all four arms of the conditional are the same.
+ */
+static pgprot_t dax_pgprot(struct vm_area_struct *vma, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ if (!write)
+ return pgprot;
+ if ((vma->vm_flags & (VM_READ|VM_EXEC)) == (VM_READ|VM_EXEC))
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P111) ^
+ pgprot_val(__S111));
+ else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_READ)
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P110) ^
+ pgprot_val(__S110));
+ else if ((vma->vm_flags & (VM_READ|VM_EXEC)) == VM_EXEC)
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P011) ^
+ pgprot_val(__S011));
+ else
+ return __pgprot(pgprot_val(pgprot) ^
+ pgprot_val(__P010) ^
+ pgprot_val(__S010));
+}
+
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long vaddr = (unsigned long)vmf->virtual_address;
struct address_space *mapping = inode->i_mapping;
struct block_device *bdev = bh->b_bdev;
@@ -530,7 +565,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
.size = bh->b_size,
};
pgoff_t size;
- int error;
+ int result;
i_mmap_lock_read(mapping);
@@ -542,15 +577,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
* allocated past the end of the file.
*/
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
+ if (unlikely(vmf->pgoff >= size))
+ goto sigbus;
- if (dax_map_atomic(bdev, &dax) < 0) {
- error = PTR_ERR(dax.addr);
- goto out;
- }
+ if (dax_map_atomic(bdev, &dax) < 0)
+ goto sigbus;
if (buffer_unwritten(bh) || buffer_new(bh)) {
clear_pmem(dax.addr, PAGE_SIZE);
@@ -558,17 +589,19 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
}
dax_unmap_atomic(bdev, &dax);
- error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
- vmf->flags & FAULT_FLAG_WRITE);
- if (error)
- goto out;
+ if (dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, write))
+ goto sigbus;
- error = vm_insert_mixed(vma, vaddr, dax.pfn);
+ result = vmf_insert_pfn_prot(vma, vaddr, dax.pfn,
+ dax_pgprot(vma, write));
out:
i_mmap_unlock_read(mapping);
+ return result;
- return error;
+ sigbus:
+ result = VM_FAULT_SIGBUS;
+ goto out;
}
/**
@@ -599,7 +632,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
unsigned blkbits = inode->i_blkbits;
sector_t block;
pgoff_t size;
- int error;
+ int result, error;
int major = 0;
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -701,19 +734,19 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* indicate what the callback should do via the uptodate variable, same
* as for normal BH based IO completions.
*/
- error = dax_insert_mapping(inode, &bh, vma, vmf);
+ result = dax_insert_mapping(inode, &bh, vma, vmf);
if (buffer_unwritten(&bh)) {
if (complete_unwritten)
- complete_unwritten(&bh, !error);
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
else
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ return result | major;
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if ((error < 0) && (error != -EBUSY))
+ if (error < 0)
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
--
2.7.0.rc3