Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem

From: Hillf Danton
Date: Thu Jun 16 2016 - 02:53:18 EST


>
> From: Ebru Akagunduz <ebru.akagunduz@xxxxxxxxx>
>
> Currently khugepaged makes swapin readahead under down_write. This patch
> makes swapin readahead happen under down_read instead of down_write.
>
> The patch was tested with a test program that allocates 800MB of memory,
> writes to it, and then sleeps. The system was forced to swap out all.
> Afterwards, the test program touches the area by writing, it skips a page
> in each 20 pages of the area.
>
> Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@xxxxxxxxx
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@xxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
> Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
> Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Vlastimil Babka <vbabka@xxxxxxx>
> Cc: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxx>
> Cc: Minchan Kim <minchan.kim@xxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> ---
> mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 63 insertions(+), 29 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2bc57c45d2f..96dfe3f09bf6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> }
>
> /*
> + * If mmap_sem temporarily dropped, revalidate vma
> + * before taking mmap_sem.

See below

> + * Return 0 if it succeeds, otherwise return a non-zero
> + * value (scan code).
> + */
> +
> +static int hugepage_vma_revalidate(struct mm_struct *mm,
> + struct vm_area_struct *vma,
> + unsigned long address)
> +{
> + unsigned long hstart, hend;
> +
> + if (unlikely(khugepaged_test_exit(mm)))
> + return SCAN_ANY_PROCESS;
> +
> + vma = find_vma(mm, address);
> + if (!vma)
> + return SCAN_VMA_NULL;
> +
> + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> + hend = vma->vm_end & HPAGE_PMD_MASK;
> + if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> + return SCAN_ADDRESS_RANGE;
> + if (!hugepage_vma_check(vma))
> + return SCAN_VMA_CHECK;
> + return 0;
> +}
> +
> +/*
> * Bring missing pages in from swap, to complete THP collapse.
> * Only done if khugepaged_scan_pmd believes it is worthwhile.
> *
> @@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> * but with mmap_sem held to protect against vma changes.
> */
>
> -static void __collapse_huge_page_swapin(struct mm_struct *mm,
> +static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmd)
> {
> @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> continue;
> swapped_in++;
> ret = do_swap_page(mm, vma, _address, pte, pmd,
> - FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> + FAULT_FLAG_ALLOW_RETRY,

Please add an explanation of this flag change (dropping FAULT_FLAG_RETRY_NOWAIT) to the changelog.

> pteval);
> + /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> + if (ret & VM_FAULT_RETRY) {
> + down_read(&mm->mmap_sem);
> + /* vma is no longer available, don't continue to swapin */
> + if (hugepage_vma_revalidate(mm, vma, address))
> + return false;

Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.

> + }
> if (ret & VM_FAULT_ERROR) {
> trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> - return;
> + return false;
> }
> /* pte is unmapped now, we need to map it */
> pte = pte_offset_map(pmd, _address);
> @@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> pte--;
> pte_unmap(pte);
> trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
> + return true;
> }
>
> static void collapse_huge_page(struct mm_struct *mm,
> @@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
> struct page *new_page;
> spinlock_t *pmd_ptl, *pte_ptl;
> int isolated = 0, result = 0;
> - unsigned long hstart, hend;
> struct mem_cgroup *memcg;
> unsigned long mmun_start; /* For mmu_notifiers */
> unsigned long mmun_end; /* For mmu_notifiers */
> @@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
> goto out_nolock;
> }
>
> - /*
> - * Prevent all access to pagetables with the exception of
> - * gup_fast later hanlded by the ptep_clear_flush and the VM
> - * handled by the anon_vma lock + PG_lock.
> - */
> - down_write(&mm->mmap_sem);
> - if (unlikely(khugepaged_test_exit(mm))) {
> - result = SCAN_ANY_PROCESS;
> + down_read(&mm->mmap_sem);
> + result = hugepage_vma_revalidate(mm, vma, address);
> + if (result)
> goto out;
> - }
>
> - vma = find_vma(mm, address);
> - if (!vma) {
> - result = SCAN_VMA_NULL;
> - goto out;
> - }
> - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> - hend = vma->vm_end & HPAGE_PMD_MASK;
> - if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> - result = SCAN_ADDRESS_RANGE;
> - goto out;
> - }
> - if (!hugepage_vma_check(vma)) {
> - result = SCAN_VMA_CHECK;
> - goto out;
> - }
> pmd = mm_find_pmd(mm, address);
> if (!pmd) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
> - __collapse_huge_page_swapin(mm, vma, address, pmd);
> + /*
> + * __collapse_huge_page_swapin always returns with mmap_sem
> + * locked. If it fails, release mmap_sem and jump directly
> + * label out. Continuing to collapse causes inconsistency.
> + */
> + if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> + up_read(&mm->mmap_sem);
> + goto out;

Jump out with mmap_sem released,

> + }
> +
> + up_read(&mm->mmap_sem);
> + /*
> + * Prevent all access to pagetables with the exception of
> + * gup_fast later handled by the ptep_clear_flush and the VM
> + * handled by the anon_vma lock + PG_lock.
> + */
> + down_write(&mm->mmap_sem);
> + result = hugepage_vma_revalidate(mm, vma, address);
> + if (result)
> + goto out;

but jump out again with mmap_sem held.

Are these inconsistencies cleaned up in subsequent patches?

>
> anon_vma_lock_write(vma->anon_vma);
>
> --
> 2.8.1