Re: [PATCH 02/17] mm: mmu_gather rework

From: Mel Gorman
Date: Thu Mar 10 2011 - 10:51:11 EST


On Thu, Feb 17, 2011 at 05:23:29PM +0100, Peter Zijlstra wrote:
> Remove the first obstackle towards a fully preemptible mmu_gather.
>
> The current scheme assumes mmu_gather is always done with preemption
> disabled and uses per-cpu storage for the page batches. Change this to
> try and allocate a page for batching and in case of failure, use a
> small on-stack array to make some progress.
>
> Preemptible mmu_gather is desired in general and usable once
> i_mmap_lock becomes a mutex. Doing it before the mutex conversion
> saves us from having to rework the code by moving the mmu_gather
> bits inside the pte_lock.
>
> Also avoid flushing the tlb batches from under the pte lock,
> this is useful even without the i_mmap_lock conversion as it
> significantly reduces pte lock hold times.
>
> Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
> Cc: David Miller <davem@xxxxxxxxxxxxx>
> Cc: Martin Schwidefsky <schwidefsky@xxxxxxxxxx>
> Cc: Russell King <rmk@xxxxxxxxxxxxxxxx>
> Cc: Paul Mundt <lethal@xxxxxxxxxxxx>
> Cc: Jeff Dike <jdike@xxxxxxxxxxx>
> Cc: Tony Luck <tony.luck@xxxxxxxxx>
> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Acked-by: Hugh Dickins <hughd@xxxxxxxxxx>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> ---
> fs/exec.c | 10 ++---
> include/asm-generic/tlb.h | 77 ++++++++++++++++++++++++++++++++--------------
> include/linux/mm.h | 2 -
> mm/memory.c | 42 ++++++++++---------------
> mm/mmap.c | 18 +++++-----
> 5 files changed, 87 insertions(+), 62 deletions(-)
>
> Index: linux-2.6/fs/exec.c
> ===================================================================
> --- linux-2.6.orig/fs/exec.c
> +++ linux-2.6/fs/exec.c
> @@ -550,7 +550,7 @@ static int shift_arg_pages(struct vm_are
> unsigned long length = old_end - old_start;
> unsigned long new_start = old_start - shift;
> unsigned long new_end = old_end - shift;
> - struct mmu_gather *tlb;
> + struct mmu_gather tlb;
>
> BUG_ON(new_start > new_end);
>
> @@ -576,12 +576,12 @@ static int shift_arg_pages(struct vm_are
> return -ENOMEM;
>
> lru_add_drain();
> - tlb = tlb_gather_mmu(mm, 0);
> + tlb_gather_mmu(&tlb, mm, 0);
> if (new_end > old_start) {
> /*
> * when the old and new regions overlap clear from new_end.
> */
> - free_pgd_range(tlb, new_end, old_end, new_end,
> + free_pgd_range(&tlb, new_end, old_end, new_end,
> vma->vm_next ? vma->vm_next->vm_start : 0);
> } else {
> /*
> @@ -590,10 +590,10 @@ static int shift_arg_pages(struct vm_are
> * have constraints on va-space that make this illegal (IA64) -
> * for the others its just a little faster.
> */
> - free_pgd_range(tlb, old_start, old_end, new_end,
> + free_pgd_range(&tlb, old_start, old_end, new_end,
> vma->vm_next ? vma->vm_next->vm_start : 0);
> }
> - tlb_finish_mmu(tlb, new_end, old_end);
> + tlb_finish_mmu(&tlb, new_end, old_end);
>
> /*
> * Shrink the vma to just the new range. Always succeeds.
> Index: linux-2.6/include/asm-generic/tlb.h
> ===================================================================
> --- linux-2.6.orig/include/asm-generic/tlb.h
> +++ linux-2.6/include/asm-generic/tlb.h
> @@ -5,6 +5,8 @@
> * Copyright 2001 Red Hat, Inc.
> * Based on code from mm/memory.c Copyright Linus Torvalds and others.
> *
> + * Copyright 2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> @@ -22,51 +24,69 @@
> * and page free order so much..
> */
> #ifdef CONFIG_SMP
> - #ifdef ARCH_FREE_PTR_NR
> - #define FREE_PTR_NR ARCH_FREE_PTR_NR
> - #else
> - #define FREE_PTE_NR 506
> - #endif
> #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
> #else
> - #define FREE_PTE_NR 1
> #define tlb_fast_mode(tlb) 1
> #endif
>
> +/*
> + * If we can't allocate a page to make a big patch of page pointers
> + * to work on, then just handle a few from the on-stack structure.
> + */
> +#define MMU_GATHER_BUNDLE 8
> +
> /* struct mmu_gather is an opaque type used by the mm code for passing around
> * any data needed by arch specific code for tlb_remove_page.
> */
> struct mmu_gather {
> struct mm_struct *mm;
> unsigned int nr; /* set to ~0U means fast mode */
> + unsigned int max; /* nr < max */
> unsigned int need_flush;/* Really unmapped some ptes? */
> unsigned int fullmm; /* non-zero means full mm flush */
> - struct page * pages[FREE_PTE_NR];
> +#ifdef HAVE_ARCH_MMU_GATHER
> + struct arch_mmu_gather arch;
> +#endif
> + struct page **pages;
> + struct page *local[MMU_GATHER_BUNDLE];
> };
>
> -/* Users of the generic TLB shootdown code must declare this storage space. */
> -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
> +static inline void __tlb_alloc_page(struct mmu_gather *tlb)
> +{
> + unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
> +
> + if (addr) {
> + tlb->pages = (void *)addr;
> + tlb->max = PAGE_SIZE / sizeof(struct page *);
> + }
> +}
>
> /* tlb_gather_mmu
> * Return a pointer to an initialized struct mmu_gather.
> */
> -static inline struct mmu_gather *
> -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
> +static inline void
> +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush)
> {

checkpatch will bitch about line length.

> - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
> -
> tlb->mm = mm;
>
> - /* Use fast mode if only one CPU is online */
> - tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
> + tlb->max = ARRAY_SIZE(tlb->local);
> + tlb->pages = tlb->local;
> +
> + if (num_online_cpus() > 1) {
> + tlb->nr = 0;
> + __tlb_alloc_page(tlb);
> + } else /* Use fast mode if only one CPU is online */
> + tlb->nr = ~0U;
>
> tlb->fullmm = full_mm_flush;
>
> - return tlb;
> +#ifdef HAVE_ARCH_MMU_GATHER
> + tlb->arch = ARCH_MMU_GATHER_INIT;
> +#endif
> }
>
> static inline void
> -tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
> +tlb_flush_mmu(struct mmu_gather *tlb)

Removing start/end here is a harmless, but unrelated cleanup. Is it
worth keeping start/end on the rough off-chance the information is ever
used to limit what portion of the TLB is flushed?

> {
> if (!tlb->need_flush)
> return;
> @@ -75,6 +95,8 @@ tlb_flush_mmu(struct mmu_gather *tlb, un
> if (!tlb_fast_mode(tlb)) {
> free_pages_and_swap_cache(tlb->pages, tlb->nr);
> tlb->nr = 0;
> + if (tlb->pages == tlb->local)
> + __tlb_alloc_page(tlb);
> }

That needs a comment. Something like

/*
* If we are using the local on-stack array of pages for MMU gather,
* try allocation again as we have recently freed pages
*/

> }
>
> @@ -85,12 +107,13 @@ tlb_flush_mmu(struct mmu_gather *tlb, un
> static inline void
> tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
> {
> - tlb_flush_mmu(tlb, start, end);
> + tlb_flush_mmu(tlb);
>
> /* keep the page table cache within bounds */
> check_pgt_cache();
>
> - put_cpu_var(mmu_gathers);
> + if (tlb->pages != tlb->local)
> + free_pages((unsigned long)tlb->pages, 0);
> }
>
> /* tlb_remove_page
> @@ -98,16 +121,24 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
> * handling the additional races in SMP caused by other CPUs caching valid
> * mappings in their TLBs.
> */
> -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
> +static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
> {

What does this return value mean?

Looking at the function, its obvious that 1 is returned when pages[] is full
and needs to be freed, TLB flushed, etc. However, callers refer the return
value as "need_flush" where as this function sets tlb->need_flush but the
two values have different meaning: retval need_flush means the array is full
and must be emptied where as tlb->need_flush just says there are some pages
that need to be freed.

It's a nit-pick but how about having it return the number of array slots
that are still available like what pagevec_add does? It would allow you
to get rid of the slighty-different need_flush variable in mm/memory.c

> tlb->need_flush = 1;
> if (tlb_fast_mode(tlb)) {
> free_page_and_swap_cache(page);
> - return;
> + return 0;
> }
> tlb->pages[tlb->nr++] = page;
> - if (tlb->nr >= FREE_PTE_NR)
> - tlb_flush_mmu(tlb, 0, 0);
> + if (tlb->nr >= tlb->max)
> + return 1;
> +

Use == and VM_BUG_ON(tlb->nr > tlb->max) ?

> + return 0;
> +}
> +
> +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
> +{
> + if (__tlb_remove_page(tlb, page))
> + tlb_flush_mmu(tlb);
> }
>
> /**
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -889,7 +889,7 @@ int zap_vma_ptes(struct vm_area_struct *
> unsigned long size);
> unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
> unsigned long size, struct zap_details *);
> -unsigned long unmap_vmas(struct mmu_gather **tlb,
> +unsigned long unmap_vmas(struct mmu_gather *tlb,
> struct vm_area_struct *start_vma, unsigned long start_addr,
> unsigned long end_addr, unsigned long *nr_accounted,
> struct zap_details *);
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c
> +++ linux-2.6/mm/memory.c
> @@ -912,12 +912,13 @@ static unsigned long zap_pte_range(struc
> long *zap_work, struct zap_details *details)
> {
> struct mm_struct *mm = tlb->mm;
> + int need_flush = 0;
> pte_t *pte;
> spinlock_t *ptl;
> int rss[NR_MM_COUNTERS];
>
> init_rss_vec(rss);
> -
> +again:
> pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> arch_enter_lazy_mmu_mode();
> do {
> @@ -974,7 +975,7 @@ static unsigned long zap_pte_range(struc
> page_remove_rmap(page);
> if (unlikely(page_mapcount(page) < 0))
> print_bad_pte(vma, addr, ptent, page);
> - tlb_remove_page(tlb, page);
> + need_flush = __tlb_remove_page(tlb, page);
> continue;

So, if __tlb_remove_page() returns 1 (should be bool for true/false) the
caller is expected to call tlb_flush_mmu(). We call continue and as a
side-effect break out of the loop unlocking various bits and pieces and
restarted.

It'd be a hell of a lot clearer to just say

if (__tlb_remove_page(tlb, page))
break;

and not check !need_flush on each iteration.

> }
> /*
> @@ -995,12 +996,20 @@ static unsigned long zap_pte_range(struc
> print_bad_pte(vma, addr, ptent, NULL);
> }
> pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
> + } while (pte++, addr += PAGE_SIZE,
> + (addr != end && *zap_work > 0 && !need_flush));
>
> add_mm_rss_vec(mm, rss);
> arch_leave_lazy_mmu_mode();
> pte_unmap_unlock(pte - 1, ptl);
>
> + if (need_flush) {
> + need_flush = 0;
> + tlb_flush_mmu(tlb);
> + if (addr != end)
> + goto again;
> + }

So, I think the reasoning here is to update counters and release locks
regularly while tearing down pagetables. If this is true, it could do with
a comment explaining that's the intention. You can also obviate the need
for the local need_flush here with just if (tlb->need_flush), right?

> +
> return addr;
> }
>
> @@ -1121,17 +1130,14 @@ static unsigned long unmap_page_range(st
> * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
> * drops the lock and schedules.
> */
> -unsigned long unmap_vmas(struct mmu_gather **tlbp,
> +unsigned long unmap_vmas(struct mmu_gather *tlb,
> struct vm_area_struct *vma, unsigned long start_addr,
> unsigned long end_addr, unsigned long *nr_accounted,
> struct zap_details *details)
> {
> long zap_work = ZAP_BLOCK_SIZE;
> - unsigned long tlb_start = 0; /* For tlb_finish_mmu */
> - int tlb_start_valid = 0;
> unsigned long start = start_addr;
> spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
> - int fullmm = (*tlbp)->fullmm;
> struct mm_struct *mm = vma->vm_mm;
>
> mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
> @@ -1152,11 +1158,6 @@ unsigned long unmap_vmas(struct mmu_gath
> untrack_pfn_vma(vma, 0, 0);
>
> while (start != end) {
> - if (!tlb_start_valid) {
> - tlb_start = start;
> - tlb_start_valid = 1;
> - }
> -
> if (unlikely(is_vm_hugetlb_page(vma))) {
> /*
> * It is undesirable to test vma->vm_file as it
> @@ -1177,7 +1178,7 @@ unsigned long unmap_vmas(struct mmu_gath
>
> start = end;
> } else
> - start = unmap_page_range(*tlbp, vma,
> + start = unmap_page_range(tlb, vma,
> start, end, &zap_work, details);
>
> if (zap_work > 0) {
> @@ -1185,19 +1186,13 @@ unsigned long unmap_vmas(struct mmu_gath
> break;
> }
>
> - tlb_finish_mmu(*tlbp, tlb_start, start);
> -
> if (need_resched() ||
> (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
> - if (i_mmap_lock) {
> - *tlbp = NULL;
> + if (i_mmap_lock)
> goto out;
> - }
> cond_resched();
> }
>
> - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
> - tlb_start_valid = 0;
> zap_work = ZAP_BLOCK_SIZE;
> }
> }
> @@ -1217,16 +1212,15 @@ unsigned long zap_page_range(struct vm_a
> unsigned long size, struct zap_details *details)
> {
> struct mm_struct *mm = vma->vm_mm;
> - struct mmu_gather *tlb;
> + struct mmu_gather tlb;
> unsigned long end = address + size;
> unsigned long nr_accounted = 0;
>
> lru_add_drain();
> - tlb = tlb_gather_mmu(mm, 0);
> + tlb_gather_mmu(&tlb, mm, 0);
> update_hiwater_rss(mm);
> end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> - if (tlb)
> - tlb_finish_mmu(tlb, address, end);
> + tlb_finish_mmu(&tlb, address, end);
> return end;
> }
>
> Index: linux-2.6/mm/mmap.c
> ===================================================================
> --- linux-2.6.orig/mm/mmap.c
> +++ linux-2.6/mm/mmap.c
> @@ -1913,17 +1913,17 @@ static void unmap_region(struct mm_struc
> unsigned long start, unsigned long end)
> {
> struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
> - struct mmu_gather *tlb;
> + struct mmu_gather tlb;
> unsigned long nr_accounted = 0;
>
> lru_add_drain();
> - tlb = tlb_gather_mmu(mm, 0);
> + tlb_gather_mmu(&tlb, mm, 0);
> update_hiwater_rss(mm);
> unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
> vm_unacct_memory(nr_accounted);
> - free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
> - next? next->vm_start: 0);
> - tlb_finish_mmu(tlb, start, end);
> + free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> + next ? next->vm_start : 0);
> + tlb_finish_mmu(&tlb, start, end);
> }
>
> /*
> @@ -2265,7 +2265,7 @@ EXPORT_SYMBOL(do_brk);
> /* Release all mmaps. */
> void exit_mmap(struct mm_struct *mm)
> {
> - struct mmu_gather *tlb;
> + struct mmu_gather tlb;
> struct vm_area_struct *vma;
> unsigned long nr_accounted = 0;
> unsigned long end;
> @@ -2290,14 +2290,14 @@ void exit_mmap(struct mm_struct *mm)
>
> lru_add_drain();
> flush_cache_mm(mm);
> - tlb = tlb_gather_mmu(mm, 1);
> + tlb_gather_mmu(&tlb, mm, 1);
> /* update_hiwater_rss(mm) here? but nobody should be looking */
> /* Use -1 here to ensure all VMAs in the mm are unmapped */
> end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
> vm_unacct_memory(nr_accounted);
>
> - free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
> - tlb_finish_mmu(tlb, 0, end);
> + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
> + tlb_finish_mmu(&tlb, 0, end);
>
> /*
> * Walk the list again, actually closing and freeing it,
>

Functionally I didn't see any problems. Comments are more about form
than function. Whether you apply them or not

Acked-by: Mel Gorman <mel@xxxxxxxxx>


--
Mel Gorman
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/