[PATCH 25/25] mm, arch: Ensure we never tlb_flush_mmu() from atomic context

From: Peter Zijlstra
Date: Tue Jan 25 2011 - 13:04:45 EST


Hugh noted that we could still end up flushing the batch from atomic
context, because we call tlb_remove_page() while holding the pte_lock.

This can still generate immense latencies, more so now than ever
before because of the larger batches. Break tlb_remove_page() into two
functions: one that queues the page, and one that flushes the queue.

For now, keep the tlb_remove_page() interface with the old semantics,
but add a might_sleep() to it so that callers from atomic context are
detected.
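
To illustrate the resulting calling convention, here is a minimal
userspace sketch (not kernel code; queue_page(), flush_queue() and the
rest are invented names standing in for __tlb_remove_page() and
tlb_flush_mmu()):

#include <pthread.h>
#include <stdio.h>

#define BATCH_MAX 8

struct gather {
	void *pages[BATCH_MAX];
	int nr;
};

/* Queue one page; returns 1 when the batch is full and must be flushed. */
static int queue_page(struct gather *g, void *page)
{
	g->pages[g->nr++] = page;
	return g->nr == BATCH_MAX;
}

/* Drain the batch; in the kernel this is the part that may sleep. */
static void flush_queue(struct gather *g)
{
	int i;

	for (i = 0; i < g->nr; i++)
		printf("freeing %p\n", g->pages[i]);
	g->nr = 0;
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void zap_range(struct gather *g, void **pages, int n)
{
	int i = 0, need_flush;

again:
	need_flush = 0;
	pthread_mutex_lock(&lock);	/* stands in for the pte_lock */
	while (i < n && !need_flush)
		need_flush = queue_page(g, pages[i++]);
	pthread_mutex_unlock(&lock);	/* drop the lock first ... */

	if (need_flush) {
		flush_queue(g);		/* ... then do the blocking flush */
		if (i < n)
			goto again;
	}
}

int main(void)
{
	struct gather g = { .nr = 0 };
	void *pages[20];
	int i;

	for (i = 0; i < 20; i++)
		pages[i] = &pages[i];

	zap_range(&g, pages, 20);
	flush_queue(&g);		/* final drain, like tlb_finish_mmu() */
	return 0;
}

(Build with cc -pthread.) The zap_pte_range() change below has exactly
this shape: queue pages under the pte_lock, break out of the loop once
the batch is full, and only call tlb_flush_mmu() after
pte_unmap_unlock().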

XXX: this should probably be folded back into the mmu_gather preempt
patches for the various architectures.

Reported-by: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
arch/arm/include/asm/tlb.h | 17 ++++++++++++++++-
arch/ia64/include/asm/tlb.h | 22 ++++++++++++++++++----
arch/s390/include/asm/tlb.h | 18 ++++++++++++------
arch/sh/include/asm/tlb.h | 17 ++++++++++++++++-
arch/um/include/asm/tlb.h | 15 +++++++++++----
include/asm-generic/tlb.h | 22 +++++++++++++++-------
mm/memory.c | 14 +++++++++++---
7 files changed, 99 insertions(+), 26 deletions(-)

Index: linux-2.6/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.orig/include/asm-generic/tlb.h
+++ linux-2.6/include/asm-generic/tlb.h
@@ -146,7 +146,7 @@ tlb_gather_mmu(struct mmu_gather *tlb, s
}

static inline void
-tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+tlb_flush_mmu(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;

@@ -176,7 +176,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
{
struct mmu_gather_batch *batch, *next;

- tlb_flush_mmu(tlb, start, end);
+ tlb_flush_mmu(tlb);

/* keep the page table cache within bounds */
check_pgt_cache();
@@ -193,7 +193,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
* handling the additional races in SMP caused by other CPUs caching valid
* mappings in their TLBs.
*/
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
struct mmu_gather_batch *batch;

@@ -201,17 +201,25 @@ static inline void tlb_remove_page(struc

if (tlb_fast_mode(tlb)) {
free_page_and_swap_cache(page);
- return;
+ return 0;
}

batch = tlb->active;
+ batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
- tlb_flush_mmu(tlb, 0, 0);
- batch = tlb->active;
+ return 1;
}

- batch->pages[batch->nr++] = page;
+ return 0;
+}
+
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ might_sleep();
+
+ if (__tlb_remove_page(tlb, page))
+ tlb_flush_mmu(tlb);
}

/**
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -990,11 +990,12 @@ static unsigned long zap_pte_range(struc
{
struct mm_struct *mm = tlb->mm;
int rss[NR_MM_COUNTERS];
+ int need_flush = 0;
spinlock_t *ptl;
pte_t *pte;

init_rss_vec(rss);
-
+again:
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
do {
@@ -1048,7 +1049,7 @@ static unsigned long zap_pte_range(struc
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- tlb_remove_page(tlb, page);
+ need_flush = __tlb_remove_page(tlb, page);
continue;
}
/*
@@ -1069,12 +1070,19 @@ static unsigned long zap_pte_range(struc
print_bad_pte(vma, addr, ptent, NULL);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && !need_flush));

add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);

+ if (need_flush) {
+ need_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (addr != end)
+ goto again;
+ }
+
return addr;
}

Index: linux-2.6/arch/arm/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/arm/include/asm/tlb.h
+++ linux-2.6/arch/arm/include/asm/tlb.h
@@ -93,7 +93,22 @@ tlb_end_vma(struct mmu_gather *tlb, stru
flush_tlb_range(vma, tlb->range_start, tlb->range_end);
}

-#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ free_page_and_swap_cache(page);
+ return 0;
+}
+
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ might_sleep();
+ __tlb_remove_page(tlb, page);
+}
+
+static inline void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+}
+
#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep)
#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)

Index: linux-2.6/arch/ia64/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/ia64/include/asm/tlb.h
+++ linux-2.6/arch/ia64/include/asm/tlb.h
@@ -204,14 +204,13 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
* must be delayed until after the TLB has been flushed (see comments at the beginning of
* this file).
*/
-static inline void
-tlb_remove_page (struct mmu_gather *tlb, struct page *page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
tlb->need_flush = 1;

if (tlb_fast_mode(tlb)) {
free_page_and_swap_cache(page);
- return;
+ return 0;
}

if (!tlb->nr && tlb->pages == tlb->local)
@@ -219,7 +218,22 @@ tlb_remove_page (struct mmu_gather *tlb,

tlb->pages[tlb->nr++] = page;
if (tlb->nr >= tlb->max)
- ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
+ return 1;
+
+ return 0;
+}
+
+static inline void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr);
+}
+
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ might_sleep();
+
+ if (__tlb_remove_page(tlb, page))
+ tlb_flush_mmu(tlb);
}

/*
Index: linux-2.6/arch/s390/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/s390/include/asm/tlb.h
+++ linux-2.6/arch/s390/include/asm/tlb.h
@@ -64,8 +64,7 @@ static inline void tlb_gather_mmu(struct
tlb->nr_pxds = tlb->max;
}

-static inline void tlb_flush_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+static inline void tlb_flush_mmu(struct mmu_gather *tlb)
{
if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pxds < tlb->max))
__tlb_flush_mm(tlb->mm);
@@ -78,7 +77,7 @@ static inline void tlb_flush_mmu(struct
static inline void tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
{
- tlb_flush_mmu(tlb, start, end);
+ tlb_flush_mmu(tlb);

rcu_table_freelist_finish();

@@ -94,8 +93,15 @@ static inline void tlb_finish_mmu(struct
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
* has already been freed, so just do free_page_and_swap_cache.
*/
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ free_page_and_swap_cache(page);
+ return 0;
+}
+
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
+ might_sleep();
free_page_and_swap_cache(page);
}

@@ -109,7 +115,7 @@ static inline void pte_free_tlb(struct m
if (!tlb->fullmm) {
tlb->array[tlb->nr_ptes++] = pte;
if (tlb->nr_ptes >= tlb->nr_pxds)
- tlb_flush_mmu(tlb, 0, 0);
+ tlb_flush_mmu(tlb);
} else
page_table_free(tlb->mm, (unsigned long *) pte);
}
@@ -130,7 +136,7 @@ static inline void pmd_free_tlb(struct m
if (!tlb->fullmm) {
tlb->array[--tlb->nr_pxds] = pmd;
if (tlb->nr_ptes >= tlb->nr_pxds)
- tlb_flush_mmu(tlb, 0, 0);
+ tlb_flush_mmu(tlb);
} else
crst_table_free(tlb->mm, (unsigned long *) pmd);
#endif
@@ -152,7 +158,7 @@ static inline void pud_free_tlb(struct m
if (!tlb->fullmm) {
tlb->array[--tlb->nr_pxds] = pud;
if (tlb->nr_ptes >= tlb->nr_pxds)
- tlb_flush_mmu(tlb, 0, 0);
+ tlb_flush_mmu(tlb);
} else
crst_table_free(tlb->mm, (unsigned long *) pud);
#endif
Index: linux-2.6/arch/sh/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/sh/include/asm/tlb.h
+++ linux-2.6/arch/sh/include/asm/tlb.h
@@ -83,7 +83,22 @@ tlb_end_vma(struct mmu_gather *tlb, stru
}
}

-#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page)
+static inline void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+}
+
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ free_page_and_swap_cache(page);
+ return 0;
+}
+
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ might_sleep();
+ __tlb_remove_page(tlb, page);
+}
+
#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep)
#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)
#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)
Index: linux-2.6/arch/um/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/um/include/asm/tlb.h
+++ linux-2.6/arch/um/include/asm/tlb.h
@@ -57,7 +57,7 @@ extern void flush_tlb_mm_range(struct mm
unsigned long end);

static inline void
-tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+tlb_flush_mmu(struct mmu_gather *tlb)
{
if (!tlb->need_flush)
return;
@@ -73,7 +73,7 @@ tlb_flush_mmu(struct mmu_gather *tlb, un
static inline void
tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
- tlb_flush_mmu(tlb, start, end);
+ tlb_flush_mmu(tlb);

/* keep the page table cache within bounds */
check_pgt_cache();
@@ -84,11 +84,18 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
* while handling the additional races in SMP caused by other CPUs
* caching valid mappings in their TLBs.
*/
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
tlb->need_flush = 1;
free_page_and_swap_cache(page);
- return;
+ return 0;
+}
+
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ might_sleep();
+
+ __tlb_remove_page(tlb, page);
}

/**

