Re: [PATCH v7 02/22] Allow page fault handlers to perform the COW

From: Jan Kara
Date: Tue Apr 08 2014 - 12:35:18 EST


On Sun 23-03-14 15:08:28, Matthew Wilcox wrote:
> Currently COW of an XIP file is done by first bringing in a read-only
> mapping, then retrying the fault and copying the page. It is much more
> efficient to tell the fault handler that a COW is being attempted (by
> passing in the pre-allocated page in the vm_fault structure), and allow
> the handler to perform the COW operation itself.
>
> Where the filemap code protects against truncation of the file until
> the PTE has been installed with the page lock, the XIP code use the
> i_mmap_mutex instead. We must therefore unlock the i_mmap_mutex after
> inserting the PTE.
Eww, leaking of locking details about DAX into generic fault code is
really ugly. It seems to me that once you pass the cow_page into the fault
handler (which looks OK to me), you can just directly install it in PTE
via vm_insert_page() and you don't have to rely on do_cow_fault() for that.
Thus you can return VM_FAULT_NOPAGE and be done with it? Basically cow
faults will then work the same way as other faults for DAX... Or am I
missing something?

Honza

> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@xxxxxxxxx>
> ---
> include/linux/mm.h | 2 ++
> mm/memory.c | 45 +++++++++++++++++++++++++++++++++------------
> 2 files changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c1b7414..513b78a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -205,6 +205,7 @@ struct vm_fault {
> pgoff_t pgoff; /* Logical page offset based on vma */
> void __user *virtual_address; /* Faulting virtual address */
>
> + struct page *cow_page; /* Handler may choose to COW */
> struct page *page; /* ->fault handlers should return a
> * page here, unless VM_FAULT_NOPAGE
> * is set (which is also implied by
> @@ -1010,6 +1011,7 @@ static inline int page_mapped(struct page *page)
> #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
> #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
>
> +#define VM_FAULT_COWED 0x0080 /* ->fault COWed the page instead */
> #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
> #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
> #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
> diff --git a/mm/memory.c b/mm/memory.c
> index 07b4287..2a2ecac 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2602,6 +2602,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
> vmf.pgoff = page->index;
> vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
> vmf.page = page;
> + vmf.cow_page = NULL;
>
> ret = vma->vm_ops->page_mkwrite(vma, &vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> @@ -3288,7 +3289,8 @@ oom:
> }
>
> static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> - pgoff_t pgoff, unsigned int flags, struct page **page)
> + pgoff_t pgoff, unsigned int flags,
> + struct page *cow_page, struct page **page)
> {
> struct vm_fault vmf;
> int ret;
> @@ -3297,10 +3299,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> vmf.pgoff = pgoff;
> vmf.flags = flags;
> vmf.page = NULL;
> + vmf.cow_page = cow_page;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
> + if (unlikely(ret & VM_FAULT_COWED))
> + goto out;
>
> if (unlikely(PageHWPoison(vmf.page))) {
> if (ret & VM_FAULT_LOCKED)
> @@ -3314,6 +3319,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
> else
> VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
>
> + out:
> *page = vmf.page;
> return ret;
> }
> @@ -3351,7 +3357,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> pte_t *pte;
> int ret;
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
>
> @@ -3368,6 +3374,12 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> return ret;
> }
>
> +/*
> + * If the fault handler performs the COW, it does not return a page,
> + * so cannot use the page's lock to protect against a concurrent truncate
> + * operation. Instead it returns with the i_mmap_mutex held, which must
> + * be released after the PTE has been inserted.
> + */
> static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmd,
> pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
> @@ -3389,25 +3401,34 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> return VM_FAULT_OOM;
> }
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> goto uncharge_out;
>
> - copy_user_highpage(new_page, fault_page, address, vma);
> + if (!(ret & VM_FAULT_COWED))
> + copy_user_highpage(new_page, fault_page, address, vma);
> __SetPageUptodate(new_page);
>
> pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> - if (unlikely(!pte_same(*pte, orig_pte))) {
> - pte_unmap_unlock(pte, ptl);
> + if (unlikely(!pte_same(*pte, orig_pte)))
> + goto unlock_out;
> + do_set_pte(vma, address, new_page, pte, true, true);
> + pte_unmap_unlock(pte, ptl);
> + if (ret & VM_FAULT_COWED) {
> + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> + } else {
> unlock_page(fault_page);
> page_cache_release(fault_page);
> - goto uncharge_out;
> }
> - do_set_pte(vma, address, new_page, pte, true, true);
> - pte_unmap_unlock(pte, ptl);
> - unlock_page(fault_page);
> - page_cache_release(fault_page);
> return ret;
> +unlock_out:
> + pte_unmap_unlock(pte, ptl);
> + if (ret & VM_FAULT_COWED) {
> + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
> + } else {
> + unlock_page(fault_page);
> + page_cache_release(fault_page);
> + }
> uncharge_out:
> mem_cgroup_uncharge_page(new_page);
> page_cache_release(new_page);
> @@ -3424,7 +3445,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> int dirtied = 0;
> int ret, tmp;
>
> - ret = __do_fault(vma, address, pgoff, flags, &fault_page);
> + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> return ret;
>
> --
> 1.9.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/