Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
From: Matthew Wilcox
Date: Tue Sep 24 2019 - 13:48:15 EST
On Tue, Sep 24, 2019 at 01:15:18PM -0400, Johannes Weiner wrote:
> +static int fault_dirty_shared_page(struct vm_fault *vmf)
vm_fault_t, shirley?
> @@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
> mapping = page_rmapping(page);
> unlock_page(page);
>
> + if (!page_mkwrite)
> + file_update_time(vma->vm_file);
> +
> + /*
> + * Throttle page dirtying rate down to writeback speed.
> + *
> + * mapping may be NULL here because some device drivers do not
> + * set page.mapping but still dirty their pages
> + *
> + * Drop the mmap_sem before waiting on IO, if we can. The file
> + * is pinning the mapping, as per above.
> + */
> if ((dirtied || page_mkwrite) && mapping) {
> - /*
> - * Some device drivers do not set page.mapping
> - * but still dirty their pages
> - */
> + struct file *fpin = NULL;
> +
> + if ((vmf->flags &
> + (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
> + FAULT_FLAG_ALLOW_RETRY) {
> + fpin = get_file(vma->vm_file);
> + up_read(&vma->vm_mm->mmap_sem);
> + ret = VM_FAULT_RETRY;
> + }
> +
> balance_dirty_pages_ratelimited(mapping);
> +
> + if (fpin)
> + fput(fpin);
> }
>
> - if (!page_mkwrite)
> - file_update_time(vma->vm_file);
> + return ret;
> }
I'm not a fan of moving file_update_time() to _before_ the
balance_dirty_pages call. Also, this is now the third place that needs
maybe_unlock_mmap_for_io, see
https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
How about:
+ struct file *fpin = NULL;
if ((dirtied || page_mkwrite) && mapping) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
balance_dirty_pages_ratelimited(mapping);
}
+
+ if (fpin) {
+ file_update_time(fpin);
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
if (!page_mkwrite)
file_update_time(vma->vm_file);
+ return 0;
}
> /*
> @@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> __releases(vmf->ptl)
> {
> struct vm_area_struct *vma = vmf->vma;
> + int ret = VM_FAULT_WRITE;
vm_fault_t again.
> @@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> static vm_fault_t do_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> - struct mm_struct *vm_mm = vma->vm_mm;
> vm_fault_t ret;
>
> /*
> @@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
>
> /* preallocated pagetable is unused: free it */
> if (vmf->prealloc_pte) {
> - pte_free(vm_mm, vmf->prealloc_pte);
> + /*
> + * XXX: Accessing vma->vm_mm now is not safe. The page
> + * fault handler may have dropped the mmap_sem a long
> + * time ago. Only s390 derefs that parameter.
> + */
> + pte_free(vma->vm_mm, vmf->prealloc_pte);
I'm confused. This code looks like it was safe before (as it was caching
vma->vm_mm in a local variable), and now you've made it unsafe ... ?