Re: [PATCH 1/2] mm: Allow architectures to request 'old' entries when prefaulting

From: Kirill A. Shutemov
Date: Wed Dec 16 2020 - 12:08:00 EST


On Mon, Dec 14, 2020 at 09:54:06AM -0800, Linus Torvalds wrote:
> Could we have a "filemap_map_pmd()" that does it all, and then the
> filemap_map_pages() logic would be more along the lines of
>
> if (filemap_map_pmd(vmf, xas)) {
> rcu_read_unlock();
> return;
> }
>
> ... then handle pte's ...
>
> Hmm?

If this looks fine, I'll submit a proper patch.
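To make the control flow easier to see before diving into the diff: the reworked
filemap_map_pages() ends up looking roughly like this (a condensed sketch; the
per-page cache checks, error labels and mmap_miss accounting are omitted, the
real code is in the diff below):

	rcu_read_lock();
	head = xas_find(&xas, end_pgoff);	/* plus xas_retry() handling */
	if (!head) {
		rcu_read_unlock();
		return;
	}

	/* Map a THP if we can, or populate the page table from prealloc_pte */
	if (filemap_map_pmd(vmf, head, &xas)) {
		rcu_read_unlock();
		return;
	}

	/* Map and lock the page table once for the whole range */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	for (; head; head = xas_next_entry(&xas, end_pgoff)) {
		/* ... per-page checks; page is the subpage of head for this index,
		 * vmf->address and vmf->pte advance with xas.xa_index ... */
		if (pte_none(*vmf->pte))
			do_set_pte(vmf, page);
		/* no need to invalidate: a not-present page won't be cached */
		update_mmu_cache(vma, vmf->address, vmf->pte);
		unlock_page(head);
	}
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	rcu_read_unlock();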

diff --git a/include/linux/mm.h b/include/linux/mm.h
index db6ae4d3fb4e..2825153ad0d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -534,8 +534,8 @@ struct vm_fault {
* is not NULL, otherwise pmd.
*/
pgtable_t prealloc_pte; /* Pre-allocated pte page table.
- * vm_ops->map_pages() calls
- * alloc_set_pte() from atomic context.
+ * vm_ops->map_pages() sets up a page
+ * table from atomic context.
* do_fault_around() pre-allocates
* page table to avoid allocation from
* atomic context.
@@ -972,7 +972,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}

-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page);
+
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e237004d498d..869c1921ceda 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1259,6 +1259,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#endif
}

+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+ return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
#ifndef CONFIG_NUMA_BALANCING
/*
* Technically a PTE can be PROTNONE even when not doing NUMA balancing but
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b2067b3c328..a7e21cff4ede 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <asm/pgalloc.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
@@ -2831,10 +2832,74 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);

+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page,
+ struct xa_state *xas)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+
+ /* Huge page is mapped? No need to proceed. */
+ if (pmd_trans_huge(*vmf->pmd))
+ return true;
+
+ if (xa_is_value(page))
+ goto nohuge;
+
+ if (!pmd_none(*vmf->pmd))
+ goto nohuge;
+
+ if (!PageTransHuge(page) || PageLocked(page))
+ goto nohuge;
+
+ if (!page_cache_get_speculative(page))
+ goto nohuge;
+
+ if (page != xas_reload(xas))
+ goto unref;
+
+ if (!PageTransHuge(page))
+ goto unref;
+
+ if (!PageUptodate(page) || PageReadahead(page) || PageHWPoison(page))
+ goto unref;
+
+ if (!trylock_page(page))
+ goto unref;
+
+ if (page->mapping != mapping || !PageUptodate(page))
+ goto unlock;
+
+ if (xas->xa_index >= DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE))
+ goto unlock;
+
+ do_set_pmd(vmf, page);
+ unlock_page(page);
+ return true;
+unlock:
+ unlock_page(page);
+unref:
+ put_page(page);
+nohuge:
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (likely(pmd_none(*vmf->pmd))) {
+ mm_inc_nr_ptes(vma->vm_mm);
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ vmf->prealloc_pte = NULL;
+ }
+ spin_unlock(vmf->ptl);
+
+ /* See comment in handle_pte_fault() */
+ if (pmd_devmap_trans_unstable(vmf->pmd))
+ return true;
+
+ return false;
+}
+
void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
- struct file *file = vmf->vma->vm_file;
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
@@ -2843,20 +2908,36 @@ void filemap_map_pages(struct vm_fault *vmf,
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);

rcu_read_lock();
- xas_for_each(&xas, head, end_pgoff) {
+ head = xas_find(&xas, end_pgoff);
+ while (head && xas_retry(&xas, head))
+ head = xas_next_entry(&xas, end_pgoff);
+ if (!head) {
+ rcu_read_unlock();
+ return;
+ }
+
+
+ if (filemap_map_pmd(vmf, head, &xas)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+
+ for (; head; head = xas_next_entry(&xas, end_pgoff)) {
if (xas_retry(&xas, head))
continue;
if (xa_is_value(head))
- goto next;
-
+ continue;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
if (PageLocked(head))
- goto next;
+ continue;
if (!page_cache_get_speculative(head))
- goto next;
+ continue;

/* Has the page moved or been split? */
if (unlikely(head != xas_reload(&xas)))
@@ -2884,19 +2965,18 @@ void filemap_map_pages(struct vm_fault *vmf,
if (vmf->pte)
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, page))
- goto unlock;
+ if (pte_none(*vmf->pte))
+ do_set_pte(vmf, page);
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock_page(head);
- goto next;
+ continue;
unlock:
unlock_page(head);
skip:
put_page(head);
-next:
- /* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*vmf->pmd))
- break;
}
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
diff --git a/mm/memory.c b/mm/memory.c
index c48f8df6e502..96d62774096a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3490,7 +3490,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;

- /* See the comment in pte_alloc_one_map() */
+ /* See comment in handle_pte_fault() */
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;

@@ -3630,66 +3630,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
}

-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
- return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
-
- if (!pmd_none(*vmf->pmd))
- goto map_pte;
- if (vmf->prealloc_pte) {
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_none(*vmf->pmd))) {
- spin_unlock(vmf->ptl);
- goto map_pte;
- }
-
- mm_inc_nr_ptes(vma->vm_mm);
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
- spin_unlock(vmf->ptl);
- vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
- return VM_FAULT_OOM;
- }
-map_pte:
- /*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
- * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
- * under us and then back to pmd_none, as a result of MADV_DONTNEED
- * running immediately after a huge pmd fault in a different thread of
- * this mm, in turn leading to a misleading pmd_trans_huge() retval.
- * All we have to ensure is that it is a regular pmd that we can walk
- * with pte_offset_map() and we can do that through an atomic read in
- * C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
- /*
- * At this point we know that our vmf->pmd points to a page of ptes
- * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
- * for the duration of the fault. If a racing MADV_DONTNEED runs and
- * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
- * be valid and we will re-check to make sure the vmf->pte isn't
- * pte_none() under vmf->ptl protection when we return to
- * alloc_set_pte().
- */
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
- return 0;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
@@ -3704,7 +3644,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}

-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3769,45 +3709,11 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
}
#endif

-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
- vm_fault_t ret;
-
- if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
- if (ret != VM_FAULT_FALLBACK)
- return ret;
- }
-
- if (!vmf->pte) {
- ret = pte_alloc_one_map(vmf);
- if (ret)
- return ret;
- }
-
- /* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte))) {
- update_mmu_tlb(vma, vmf->address, vmf->pte);
- return VM_FAULT_NOPAGE;
- }

flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
@@ -3824,14 +3730,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
page_add_file_rmap(page, false);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, vmf->address, vmf->pte);
-
- return 0;
}

-
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
@@ -3849,12 +3749,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct page *page;
- vm_fault_t ret = 0;
+ vm_fault_t ret;

/* Did we COW the page? */
- if ((vmf->flags & FAULT_FLAG_WRITE) &&
- !(vmf->vma->vm_flags & VM_SHARED))
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
@@ -3863,13 +3763,35 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* check even for read faults because we might have lost our CoWed
* page
*/
- if (!(vmf->vma->vm_flags & VM_SHARED))
- ret = check_stable_address_space(vmf->vma->vm_mm);
- if (!ret)
- ret = alloc_set_pte(vmf, page);
- if (vmf->pte)
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return ret;
+ if (!(vma->vm_flags & VM_SHARED)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ return ret;
+ }
+
+ if (pmd_none(*vmf->pmd)) {
+ if (PageTransCompound(page)) {
+ ret = do_set_pmd(vmf, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+
+ if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+ return VM_FAULT_OOM;
+ }
+
+ /* See comment in handle_pte_fault() */
+ if (pmd_devmap_trans_unstable(vmf->pmd))
+ return 0;
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ /* Re-check under ptl */
+ if (likely(pte_none(*vmf->pte)))
+ do_set_pte(vmf, page);
+
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
}

static unsigned long fault_around_bytes __read_mostly =
@@ -3980,7 +3902,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
vmf->address = address;
vmf->pte = NULL;
@@ -4340,7 +4261,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
*/
vmf->pte = NULL;
} else {
- /* See comment in pte_alloc_one_map() */
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+ * of pmd_trans_huge() to ensure the pmd didn't become
+ * pmd_trans_huge under us and then back to pmd_none, as a
+ * result of MADV_DONTNEED running immediately after a huge pmd
+ * fault in a different thread of this mm, in turn leading to a
+ * misleading pmd_trans_huge() retval. All we have to ensure is
+ * that it is a regular pmd that we can walk with
+ * pte_offset_map() and we can do that through an atomic read
+ * in C, which is what pmd_trans_unstable() provides.
+ */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
--
Kirill A. Shutemov