Re: [PATCH RFC] vm_unmap_aliases: allow callers to inhibit TLB flush
From: Nick Piggin
Date: Mon Feb 23 2009 - 04:14:44 EST
On Monday 23 February 2009 18:30:14 Jeremy Fitzhardinge wrote:
> Nick Piggin wrote:
> > On Friday 20 February 2009 06:11:32 Jeremy Fitzhardinge wrote:
> >> Nick Piggin wrote:
> >>> Then what is the point of the vm_unmap_aliases? If you are doing it
> >>> for security it won't work because other CPUs might still be able
> >>> to write through dangling TLBs. If you are not doing it for
> >>> security then it does not need to be done at all.
> >>
> >> Xen will make sure any dangling tlb entries are flushed before handing
> >> the page out to anyone else.
> >>
> >>> Unless it is something strange that Xen does with the page table
> >>> structure and you just need to get rid of those?
> >>
> >> Yeah. A pte pointing at a page holds a reference on it, saying that it
> >> belongs to the domain. You can't return it to Xen until the refcount is
> >> 0.
> >
> > OK. Then I will remember to find some time to get the interrupt
> > safe patches working. I wonder why you can't just return it to
> > Xen when (or have Xen hold it somewhere until) the refcount
> > reaches 0?
>
> It would still need to allocate a page in the meantime, which could fail
> because the domain has hit its hard memory limit (which will be the
> common case, because a domain generally starts with its full complement
> of memory). The nice thing about the exchange is that there's no
> accounting to take into account.
OK, well I don't really understand the details but I trust you if
you say it's hard :)
> >>> Or... what if we just allow a compile and/or boot time flag to direct
> >>> that it does not want lazy vmap unmapping and it will just revert to
> >>> synchronous unmapping? If Xen needs lots of flushing anyway it might
> >>> not be a win anyway.
> >>
> >> That may be worth considering.
> >
> > ... in the meantime, shall we just do this for Xen? It is probably
> > safer and may end up with no worse performance on Xen anyway. If
> > we get more vmap users and it becomes important, you could look at
> > more sophisticated ways of doing this. Eg. a page could be flagged
> > if it potentially has lazy vmaps.
>
> OK. Do you want to do the patch, or shall I?
Here's a start for you. I think it gets rid of all the dead code and
data without introducing any actual conditional compilation...
---
mm/vmalloc.c | 66 ++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 48 insertions(+), 18 deletions(-)
Index: linux-2.6/mm/vmalloc.c
===================================================================
--- linux-2.6.orig/mm/vmalloc.c
+++ linux-2.6/mm/vmalloc.c
@@ -29,6 +29,11 @@
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
+#ifdef CONFIG_VMAP_NO_LAZY_FLUSH
+#define VMAP_LAZY_FLUSHES 0
+#else
+#define VMAP_LAZY_FLUSHES 1
+#endif
/*** Page table manipulation functions ***/
@@ -376,7 +381,7 @@ retry:
found:
if (addr + size > vend) {
spin_unlock(&vmap_area_lock);
- if (!purged) {
+ if (VMAP_LAZY_FLUSHES && !purged) {
purge_vmap_area_lazy();
purged = 1;
goto retry;
@@ -413,7 +418,10 @@ static void __free_vmap_area(struct vmap
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
- call_rcu(&va->rcu_head, rcu_free_va);
+ if (VMAP_LAZY_FLUSHES)
+ call_rcu(&va->rcu_head, rcu_free_va);
+ else
+ kfree(va);
}
/*
@@ -450,8 +458,10 @@ static void vmap_debug_free_range(unsign
* faster).
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
+ if (VMAP_LAZY_FLUSHES) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
#endif
}
@@ -571,10 +581,16 @@ static void purge_vmap_area_lazy(void)
*/
static void free_unmap_vmap_area_noflush(struct vmap_area *va)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ if (VMAP_LAZY_FLUSHES) {
+ va->flags |= VM_LAZY_FREE;
+ atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+ if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
+ } else {
+ vunmap_page_range(va->va_start, va->va_end);
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+ }
}
/*
@@ -610,6 +626,15 @@ static void free_unmap_vmap_area_addr(un
/*** Per cpu kva allocator ***/
/*
+ * This does lazy flushing as well, so don't call it if the arch doesn't want
+ * lazy vmap kva flushes... The scalability aspect should be less important
+ * in that case anyway seeing as kernel tlb flushing tends not to be scalable.
+ * It would be possible to make this work without lazy tlb flushing if it
+ * was really a big deal.
+ */
+
+
+/*
* vmap space is limited especially on 32 bit architectures. Ensure there is
* room for at least 16 percpu vmap blocks per CPU.
*/
@@ -877,6 +902,9 @@ void vm_unmap_aliases(void)
int cpu;
int flush = 0;
+ if (!VMAP_LAZY_FLUSHES)
+ return;
+
if (unlikely(!vmap_initialized))
return;
@@ -937,7 +965,7 @@ void vm_unmap_ram(const void *mem, unsig
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (VMAP_LAZY_FLUSHES && likely(count <= VMAP_MAX_ALLOC))
vb_free(mem, size);
else
free_unmap_vmap_area_addr(addr);
@@ -959,7 +987,7 @@ void *vm_map_ram(struct page **pages, un
unsigned long addr;
void *mem;
- if (likely(count <= VMAP_MAX_ALLOC)) {
+ if (VMAP_LAZY_FLUSHES && likely(count <= VMAP_MAX_ALLOC)) {
mem = vb_alloc(size, GFP_KERNEL);
if (IS_ERR(mem))
return NULL;
@@ -988,14 +1016,16 @@ void __init vmalloc_init(void)
struct vm_struct *tmp;
int i;
- for_each_possible_cpu(i) {
- struct vmap_block_queue *vbq;
-
- vbq = &per_cpu(vmap_block_queue, i);
- spin_lock_init(&vbq->lock);
- INIT_LIST_HEAD(&vbq->free);
- INIT_LIST_HEAD(&vbq->dirty);
- vbq->nr_dirty = 0;
+ if (VMAP_LAZY_FLUSHES) {
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ INIT_LIST_HEAD(&vbq->dirty);
+ vbq->nr_dirty = 0;
+ }
}
/* Import existing vmlist entries. */
¢éì®&Þ~º&¶¬+-±éÝ¥w®Ë±Êâmébìdz¹Þ)í
æèw*jg¬±¨¶Ýj/êäz¹Þà2Þ¨èÚ&¢)ß«a¶Úþø®G«éh®æj:+v¨wèÙ>W±êÞiÛaxPjØm¶ÿÃ-»+ùd_