Re: [PATCH v2 4/5] mm: swap: entirely map large folios found in swapcache

From: Huang, Ying
Date: Mon Apr 15 2024 - 04:40:07 EST


Barry Song <21cnbao@xxxxxxxxx> writes:

> From: Chuanhua Han <hanchuanhua@xxxxxxxx>
>
> When a large folio is found in the swapcache, the current implementation
> requires calling do_swap_page() nr_pages times, resulting in nr_pages
> page faults. This patch opts to map the entire large folio at once to
> minimize page faults. Additionally, redundant checks and early exits
> for ARM64 MTE restoring are removed.
>
> Signed-off-by: Chuanhua Han <hanchuanhua@xxxxxxxx>
> Co-developed-by: Barry Song <v-songbaohua@xxxxxxxx>
> Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
> ---
> mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 52 insertions(+), 12 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index c4a52e8d740a..9818dc1893c8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> pte_t pte;
> vm_fault_t ret = 0;
> void *shadow = NULL;
> + int nr_pages = 1;
> + unsigned long start_address = vmf->address;
> + pte_t *start_pte = vmf->pte;

IMHO, it would be better to rename the two local variables above to
"address" and "ptep". This is just my personal opinion; feel free to
ignore this comment.

> + bool any_swap_shared = false;
>
> if (!pte_unmap_same(vmf))
> goto out;
> @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> */
> vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> &vmf->ptl);

We should move the PTE check here. That is,

if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;

This will simplify the handling of large folios.

> +
> + /* We hit large folios in swapcache */

The comment seems unnecessary because the code already makes that clear.

> + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> + int nr = folio_nr_pages(folio);
> + int idx = folio_page_idx(folio, page);
> + unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> + unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> + pte_t *folio_ptep;
> + pte_t folio_pte;
> +
> + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> + goto check_pte;
> + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> + goto check_pte;
> +
> + folio_ptep = vmf->pte - idx;
> + folio_pte = ptep_get(folio_ptep);

It would be better to construct the PTE from the faulting PTE by
generalizing pte_next_swp_offset() (maybe as pte_move_swp_offset()).
Then we can find inconsistent PTEs more quickly.

> + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> + goto check_pte;
> +
> + start_address = folio_start;
> + start_pte = folio_ptep;
> + nr_pages = nr;
> + entry = folio->swap;
> + page = &folio->page;
> + }
> +
> +check_pte:
> if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> goto out_nomap;
>
> @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> */
> exclusive = false;
> }
> +
> + /* Reuse the whole large folio iff all entries are exclusive */
> + if (nr_pages > 1 && any_swap_shared)
> + exclusive = false;
> }
>
> /*
> @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> * We're already holding a reference on the page but haven't mapped it
> * yet.
> */
> - swap_free(entry);
> + swap_free_nr(entry, nr_pages);
> if (should_try_to_free_swap(folio, vma, vmf->flags))
> folio_free_swap(folio);
>
> - inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> - dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> + folio_ref_add(folio, nr_pages - 1);
> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> +
> pte = mk_pte(page, vma->vm_page_prot);
>
> /*
> @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> * exclusivity.
> */
> if (!folio_test_ksm(folio) &&
> - (exclusive || folio_ref_count(folio) == 1)) {
> + (exclusive || (folio_ref_count(folio) == nr_pages &&
> + folio_nr_pages(folio) == nr_pages))) {
> if (vmf->flags & FAULT_FLAG_WRITE) {
> pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> vmf->flags &= ~FAULT_FLAG_WRITE;
> }
> rmap_flags |= RMAP_EXCLUSIVE;
> }
> - flush_icache_page(vma, page);
> + flush_icache_pages(vma, page, nr_pages);
> if (pte_swp_soft_dirty(vmf->orig_pte))
> pte = pte_mksoft_dirty(pte);
> if (pte_swp_uffd_wp(vmf->orig_pte))
> pte = pte_mkuffd_wp(pte);
> - vmf->orig_pte = pte;
>
> /* ksm created a completely new copy */
> if (unlikely(folio != swapcache && swapcache)) {
> - folio_add_new_anon_rmap(folio, vma, vmf->address);
> + folio_add_new_anon_rmap(folio, vma, start_address);
> folio_add_lru_vma(folio, vma);
> } else {
> - folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> - rmap_flags);
> + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> + rmap_flags);
> }
>
> VM_BUG_ON(!folio_test_anon(folio) ||
> (pte_write(pte) && !PageAnonExclusive(page)));
> - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> + vmf->orig_pte = ptep_get(vmf->pte);
> + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);

Do we need to call arch_do_swap_page() for each subpage? IIUC, the
corresponding arch_unmap_one() will be called for each subpage.

> folio_unlock(folio);
> if (folio != swapcache && swapcache) {
> @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> }
>
> /* No need to invalidate - it was non-present before */
> - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
> unlock:
> if (vmf->pte)
> pte_unmap_unlock(vmf->pte, vmf->ptl);

--
Best Regards,
Huang, Ying