Re: [RFC 13/20] mm/tlb: introduce tlb_start_ptes() and tlb_end_ptes()

From: Damian Tometzki
Date: Sun Jan 31 2021 - 06:20:42 EST


On Sat, 30. Jan 16:11, Nadav Amit wrote:
> From: Nadav Amit <namit@xxxxxxxxxx>
>
> Introduce tlb_start_ptes() and tlb_end_ptes() which would be called
> before and after PTEs are updated and TLB flushes are deferred. This
> will be later be used for fine granualrity deferred TLB flushing
> detection.
>
> In the meanwhile, move flush_tlb_batched_pending() into
> tlb_start_ptes(). It was not called from mapping_dirty_helpers by
> wp_pte() and clean_record_pte(), which might be a bug.
>
> No additional functional change is intended.
>
> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Andy Lutomirski <luto@xxxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Will Deacon <will@xxxxxxxxxx>
> Cc: Yu Zhao <yuzhao@xxxxxxxxxx>
> Cc: Nick Piggin <npiggin@xxxxxxxxx>
> Cc: x86@xxxxxxxxxx
> ---
> fs/proc/task_mmu.c | 2 ++
> include/asm-generic/tlb.h | 18 ++++++++++++++++++
> mm/madvise.c | 6 ++++--
> mm/mapping_dirty_helpers.c | 15 +++++++++++++--
> mm/memory.c | 2 ++
> mm/mprotect.c | 3 ++-
> 6 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 4cd048ffa0f6..d0cce961fa5c 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1168,6 +1168,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
> return 0;
>
> pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> + tlb_start_ptes(&cp->tlb);
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> ptent = *pte;
>
> @@ -1190,6 +1191,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
> tlb_flush_pte_range(&cp->tlb, addr, PAGE_SIZE);
> ClearPageReferenced(page);
> }
> + tlb_end_ptes(&cp->tlb);
> pte_unmap_unlock(pte - 1, ptl);
> cond_resched();
> return 0;
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 041be2ef4426..10690763090a 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -58,6 +58,11 @@
> * Defaults to flushing at tlb_end_vma() to reset the range; helps when
> * there's large holes between the VMAs.
> *
> + * - tlb_start_ptes() / tlb_end_ptes; makr the start / end of PTEs change.
Hello Nadav,

short nid makr/mark

Damian


> + *
> + * Does internal accounting to allow fine(r) granularity checks for
> + * pte_accessible() on certain configuration.
> + *
> * - tlb_remove_table()
> *
> * tlb_remove_table() is the basic primitive to free page-table directories
> @@ -373,6 +378,10 @@ static inline void tlb_flush(struct mmu_gather *tlb)
> flush_tlb_range(tlb->vma, tlb->start, tlb->end);
> }
> }
> +#endif
> +
> +#if __is_defined(tlb_flush) || \
> + IS_ENABLED(CONFIG_ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING)
>
> static inline void
> tlb_update_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
> @@ -523,6 +532,15 @@ static inline void mark_mm_tlb_gen_done(struct mm_struct *mm, u64 gen)
>
> #endif /* CONFIG_ARCH_HAS_TLB_GENERATIONS */
>
> +#define tlb_start_ptes(tlb) \
> + do { \
> + struct mmu_gather *_tlb = (tlb); \
> + \
> + flush_tlb_batched_pending(_tlb->mm); \
> + } while (0)
> +
> +static inline void tlb_end_ptes(struct mmu_gather *tlb) { }
> +
> /*
> * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
> * and set corresponding cleared_*.
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 0938fd3ad228..932c1c2eb9a3 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -392,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> #endif
> tlb_change_page_size(tlb, PAGE_SIZE);
> orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> - flush_tlb_batched_pending(mm);
> + tlb_start_ptes(tlb);
> arch_enter_lazy_mmu_mode();
> for (; addr < end; pte++, addr += PAGE_SIZE) {
> ptent = *pte;
> @@ -468,6 +468,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> }
>
> arch_leave_lazy_mmu_mode();
> + tlb_end_ptes(tlb);
> pte_unmap_unlock(orig_pte, ptl);
> if (pageout)
> reclaim_pages(&page_list);
> @@ -588,7 +589,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>
> tlb_change_page_size(tlb, PAGE_SIZE);
> orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> - flush_tlb_batched_pending(mm);
> + tlb_start_ptes(tlb);
> arch_enter_lazy_mmu_mode();
> for (; addr != end; pte++, addr += PAGE_SIZE) {
> ptent = *pte;
> @@ -692,6 +693,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> add_mm_counter(mm, MM_SWAPENTS, nr_swap);
> }
> arch_leave_lazy_mmu_mode();
> + tlb_end_ptes(tlb);
> pte_unmap_unlock(orig_pte, ptl);
> cond_resched();
> next:
> diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
> index 2ce6cf431026..063419ade304 100644
> --- a/mm/mapping_dirty_helpers.c
> +++ b/mm/mapping_dirty_helpers.c
> @@ -6,6 +6,8 @@
> #include <asm/cacheflush.h>
> #include <asm/tlb.h>
>
> +#include "internal.h"
> +
> /**
> * struct wp_walk - Private struct for pagetable walk callbacks
> * @range: Range for mmu notifiers
> @@ -36,7 +38,10 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
> pte_t ptent = *pte;
>
> if (pte_write(ptent)) {
> - pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
> + pte_t old_pte;
> +
> + tlb_start_ptes(&wpwalk->tlb);
> + old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
>
> ptent = pte_wrprotect(old_pte);
> ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
> @@ -44,6 +49,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
>
> if (pte_may_need_flush(old_pte, ptent))
> tlb_flush_pte_range(&wpwalk->tlb, addr, PAGE_SIZE);
> + tlb_end_ptes(&wpwalk->tlb);
> }
>
> return 0;
> @@ -94,13 +100,18 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
> if (pte_dirty(ptent)) {
> pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
> walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
> - pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
> + pte_t old_pte;
> +
> + tlb_start_ptes(&wpwalk->tlb);
> +
> + old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
>
> ptent = pte_mkclean(old_pte);
> ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
>
> wpwalk->total++;
> tlb_flush_pte_range(&wpwalk->tlb, addr, PAGE_SIZE);
> + tlb_end_ptes(&wpwalk->tlb);
>
> __set_bit(pgoff, cwalk->bitmap);
> cwalk->start = min(cwalk->start, pgoff);
> diff --git a/mm/memory.c b/mm/memory.c
> index 9e8576a83147..929a93c50d9a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1221,6 +1221,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> init_rss_vec(rss);
> start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> pte = start_pte;
> + tlb_start_ptes(tlb);
> flush_tlb_batched_pending(mm);
> arch_enter_lazy_mmu_mode();
> do {
> @@ -1314,6 +1315,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> add_mm_rss_vec(mm, rss);
> arch_leave_lazy_mmu_mode();
>
> + tlb_end_ptes(tlb);
> /* Do the actual TLB flush before dropping ptl */
> if (force_flush)
> tlb_flush_mmu_tlbonly(tlb);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index b7473d2c9a1f..1258bbe42ee1 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -70,7 +70,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
> atomic_read(&vma->vm_mm->mm_users) == 1)
> target_node = numa_node_id();
>
> - flush_tlb_batched_pending(vma->vm_mm);
> + tlb_start_ptes(tlb);
> arch_enter_lazy_mmu_mode();
> do {
> oldpte = *pte;
> @@ -182,6 +182,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
> }
> } while (pte++, addr += PAGE_SIZE, addr != end);
> arch_leave_lazy_mmu_mode();
> + tlb_end_ptes(tlb);
> pte_unmap_unlock(pte - 1, ptl);
>
> return pages;
> --
> 2.25.1
>
>