RFC: Early freeing of page tables - 2.2.5 patch

Jakub Jelinek (jj@sunsite.ms.mff.cuni.cz)
Wed, 21 Apr 1999 18:34:37 +0200 (CEST)


Hi!

As I hope 2.3 time is getting close, I've started to code something that is
clearly 2.3 material: instead of the free_pgtables hack, always free page
tables as you go.
Because we have the page table cache, those page tables will be resurrected
from there if needed, so usually no page clearing nor allocation/freeing will
actually happen.
I see two advantages to this:
a) it should speed up exit(), as clear_page_tables is gone.
clear_page_tables is very expensive: it has to walk all the page tables
still hanging in the tree, which means reading them all back into core,
and afterwards your caches are history.
b) it frees up memory sooner when someone allocates some memory and then
munmaps it.
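
To illustrate the cache behaviour mentioned above, here is a small userspace
sketch (not kernel code; pt_alloc, pt_free and pt_cache are names made up for
this example): a freed table goes onto a free list with all entries already
cleared by the unmap path, so the next allocation can hand the same page back
without any clearing.

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_TABLE 1024            /* a 4 KB table of 4-byte entries */

static unsigned long *pt_cache;        /* single-linked quicklist of free tables */
static int pt_cache_size;

static unsigned long *pt_alloc(void)
{
    unsigned long *t = pt_cache;
    if (t) {                           /* fast path: reuse a cached table */
        pt_cache = (unsigned long *)t[0];
        t[0] = 0;                      /* restore the zeroed state of entry 0 */
        pt_cache_size--;
        return t;
    }
    /* slow path: a fresh table has to be allocated and cleared */
    return calloc(PTRS_PER_TABLE, sizeof(unsigned long));
}

static void pt_free(unsigned long *t)
{
    /* the unmap path already cleared every entry, so nothing to clear here;
       entry 0 is reused as the free-list link */
    t[0] = (unsigned long)pt_cache;
    pt_cache = t;
    pt_cache_size++;
}

int main(void)
{
    unsigned long *a = pt_alloc();     /* slow path: fresh zeroed table */
    pt_free(a);                        /* "early free" on munmap/exit */
    unsigned long *b = pt_alloc();     /* fast path: the same page comes back */
    printf("reused: %s, still cached: %d\n", a == b ? "yes" : "no", pt_cache_size);
    free(b);
    return 0;
}

It prints "reused: yes", which is the whole point: in the common case the
early free costs only a list push and the next page table fault a list pop.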

The patch tries to keep an invariant: no page tables exist outside of the
current vm areas (chained from mm->mmap), unless only part of a page table
falls outside and the rest overlaps with some vm area.
The freeing logic has been moved into zap_page_range, which now takes three
more arguments: the end address of the highest vm area below the zapped
region, the start address of the lowest vm area above the zapped region, and
flags.
It special-cases the IMHO quite common case where do_mmap first unmaps some
range and then maps some other range on top of it; in that case, unless an
error happens, the page tables are not freed.
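
As a rough userspace model of the freeing test (the real checks live in
zap_page_range in the patch below and are applied per pmd/pgd slot inside the
loop; can_free_pgd_slots is an invented name and the PGDIR_* values are
i386-style, purely for illustration):

#include <stdio.h>

#define PGDIR_SHIFT 22                       /* i386-style: one pgd slot covers 4 MB */
#define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
#define PGDIR_MASK  (~(PGDIR_SIZE - 1))

/* May the page tables covering the zapped range [address, end) be torn
 * down, given the end of the nearest vm area below (prevend) and the start
 * of the nearest vm area above (nextstart)?  This mirrors the
 * ZAP_RANGE_CLEARPTE boundary checks in the patch. */
static int can_free_pgd_slots(unsigned long address, unsigned long end,
                              unsigned long prevend, unsigned long nextstart)
{
    return ((address - 1) & PGDIR_MASK) >= prevend &&
           ((end + PGDIR_SIZE - 1) & PGDIR_MASK) <= nextstart;
}

int main(void)
{
    /* unmap 7 MB at 65 MB; the neighbouring vm areas are far away,
       so the covering page tables may be freed: prints 1 */
    printf("%d\n", can_free_pgd_slots(0x4100000, 0x4800000,
                                      0x1000000, 0x8000000));
    /* same range, but the previous vm area ends inside the same 4 MB
       slot the zapped range starts in: the tables must stay, prints 0 */
    printf("%d\n", can_free_pgd_slots(0x4100000, 0x4800000,
                                      0x4001000, 0x8000000));
    return 0;
}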

What do you think about this?
I haven't done proper measurements yet; I've only quickly tested
lat_proc.static so far, and it went down from 280 to 230 usec. I'll do more
benchmarking later.
I'm interested in whether this works on your architecture and, if so, how
the lmbench numbers look before and after the patch.

The patch is against 2.2.5 and won't apply cleanly to 2.2.6; I'll update it
to 2.2.6 soon.

--- linux-2.2.5/arch/mips/kernel/sysirix.c.jj Fri Apr 9 16:12:59 1999
+++ linux-2.2.5/arch/mips/kernel/sysirix.c Wed Apr 21 10:30:30 1999
@@ -551,7 +551,7 @@ asmlinkage int irix_brk(unsigned long br
*/
if (brk <= current->mm->brk) {
mm->brk = brk;
- do_munmap(newbrk, oldbrk-newbrk);
+ do_munmap(newbrk, oldbrk-newbrk, DO_MUNMAP_CLEARPTE);
ret = 0;
goto out;
}
--- linux-2.2.5/arch/sparc/kernel/sys_sunos.c.jj Mon Oct 19 12:38:31 1998
+++ linux-2.2.5/arch/sparc/kernel/sys_sunos.c Wed Apr 21 10:30:57 1999
@@ -173,7 +173,7 @@ asmlinkage int sunos_brk(unsigned long b
*/
if (brk <= current->mm->brk) {
current->mm->brk = brk;
- do_munmap(newbrk, oldbrk-newbrk);
+ do_munmap(newbrk, oldbrk-newbrk, DO_MUNMAP_CLEARPTE);
goto out;
}
/*
--- linux-2.2.5/arch/sparc64/kernel/sys_sunos32.c.jj Wed Nov 4 09:19:24 1998
+++ linux-2.2.5/arch/sparc64/kernel/sys_sunos32.c Wed Apr 21 10:31:21 1999
@@ -147,7 +147,7 @@ asmlinkage int sunos_brk(u32 baddr)
/* Always allow shrinking brk. */
if (brk <= current->mm->brk) {
current->mm->brk = brk;
- do_munmap(newbrk, oldbrk-newbrk);
+ do_munmap(newbrk, oldbrk-newbrk, DO_MUNMAP_CLEARPTE);
goto out;
}
/* Check against rlimit and stack.. */
--- linux-2.2.5/drivers/char/mem.c.jj Fri Mar 12 08:33:54 1999
+++ linux-2.2.5/drivers/char/mem.c Mon Apr 19 20:01:29 1999
@@ -312,7 +312,7 @@ static inline size_t read_zero_pagealign
count = size;

flush_cache_range(mm, addr, addr + count);
- zap_page_range(mm, addr, count);
+ zap_page_range(mm, addr, count, addr, addr + count, 0);
zeromap_page_range(addr, count, PAGE_COPY);
flush_tlb_range(mm, addr, addr + count);

--- linux-2.2.5/include/asm-alpha/pgtable.h.jj Thu Jan 7 20:25:50 1999
+++ linux-2.2.5/include/asm-alpha/pgtable.h Mon Apr 19 14:03:30 1999
@@ -187,6 +187,13 @@ extern void flush_tlb_range(struct mm_st

#endif /* __SMP__ */

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ /* Although Alpha uses VPTE caches, this can be a nop, as Alpha does
+ not have finegrained tlb flushing, so it will flush VPTE stuff
+ during next flush_tlb_range. */
+}
+
/* Certain architectures need to do special things when PTEs
* within a page table are directly modified. Thus, the following
* hook is made available.
--- linux-2.2.5/include/asm-arm/pgtable.h.jj Thu Jan 7 20:25:53 1999
+++ linux-2.2.5/include/asm-arm/pgtable.h Mon Apr 19 14:01:41 1999
@@ -9,6 +9,10 @@

extern int do_check_pgt_cache(int, int);

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+}
+
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
#define PageSkip(page) (0)
#define kern_addr_valid(addr) (1)
--- linux-2.2.5/include/asm-i386/pgtable.h.jj Thu Jan 7 20:26:30 1999
+++ linux-2.2.5/include/asm-i386/pgtable.h Mon Apr 19 14:00:46 1999
@@ -74,6 +74,12 @@ static inline void flush_tlb_range(struc
__flush_tlb();
}

+extern inline void flush_tlb_pgtables(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ /* ia32 has hw page tables. */
+}
+
#else

/*
--- linux-2.2.5/include/asm-m68k/pgtable.h.jj Tue Feb 2 11:48:02 1999
+++ linux-2.2.5/include/asm-m68k/pgtable.h Mon Apr 19 14:00:02 1999
@@ -235,6 +235,10 @@ extern inline void flush_tlb_kernel_page
__asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr));
}

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+}
+
/* Certain architectures need to do special things when pte's
* within a page table are directly modified. Thus, the following
* hook is made available.
--- linux-2.2.5/include/asm-mips/pgtable.h.jj Thu Jan 7 20:33:50 1999
+++ linux-2.2.5/include/asm-mips/pgtable.h Mon Apr 19 13:49:55 1999
@@ -47,6 +47,10 @@ extern void (*flush_tlb_range)(struct mm
unsigned long end);
extern void (*flush_tlb_page)(struct vm_area_struct *vma, unsigned long page);

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+}
+
/*
* - add_wired_entry() add a fixed TLB entry, and move wired register
*/
--- linux-2.2.5/include/asm-ppc/pgtable.h.jj Fri Apr 9 16:24:32 1999
+++ linux-2.2.5/include/asm-ppc/pgtable.h Mon Apr 19 13:49:21 1999
@@ -20,6 +20,11 @@ extern void local_flush_tlb_range(struct
#define flush_tlb_page local_flush_tlb_page
#define flush_tlb_range local_flush_tlb_range

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ /* PPC has hw page tables. */
+}
+
/*
* No cache flushing is required when address mappings are
* changed, because the caches on PowerPCs are physically
--- linux-2.2.5/include/asm-sparc/pgtable.h.jj Thu Jan 7 20:27:43 1999
+++ linux-2.2.5/include/asm-sparc/pgtable.h Mon Apr 19 13:40:31 1999
@@ -472,6 +472,11 @@ BTFIXUPDEF_CALL(void, flush_tlb_page, st
#define flush_tlb_range(mm,start,end) BTFIXUP_CALL(flush_tlb_range)(mm,start,end)
#define flush_tlb_page(vma,addr) BTFIXUP_CALL(flush_tlb_page)(vma,addr)

+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ /* Sparc32 has hw page tables. */
+}
+
BTFIXUPDEF_CALL(void, flush_page_to_ram, unsigned long)
BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)

--- linux-2.2.5/include/asm-sparc64/pgtable.h.jj Fri Apr 9 16:47:06 1999
+++ linux-2.2.5/include/asm-sparc64/pgtable.h Wed Apr 21 10:47:51 1999
@@ -248,6 +248,17 @@ extern __inline__ void flush_tlb_page(st

#endif

+/* This will change for Cheetah and later chips. */
+#define VPTE_BASE 0xfffffffe00000000
+
+extern __inline__ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ flush_tlb_range(mm,
+ VPTE_BASE + (((long)start) >> (PAGE_SHIFT - 3)),
+ VPTE_BASE + (((long)end) >> (PAGE_SHIFT - 3)));
+}
+
#define mk_pte(page, pgprot) (__pte(__pa(page) | pgprot_val(pgprot)))
#define mk_pte_phys(physpage, pgprot) (__pte((physpage) | pgprot_val(pgprot)))
#define pte_modify(_pte, newprot) \
--- linux-2.2.5/include/linux/mm.h.jj Fri Apr 9 16:46:52 1999
+++ linux-2.2.5/include/linux/mm.h Wed Apr 21 10:29:26 1999
@@ -285,7 +285,10 @@ extern void free_page_tables(struct mm_s
extern void clear_page_tables(struct mm_struct *, unsigned long, int);
extern int new_page_tables(struct task_struct * tsk);

-extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
+#define ZAP_RANGE_CLEARPTE 1
+#define ZAP_RANGE_FLUSHTLB 2
+extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size,
+ unsigned long prevend, unsigned long nextstart, int flags);
extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
@@ -313,7 +316,8 @@ extern unsigned long get_unmapped_area(u

extern unsigned long do_mmap(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-extern int do_munmap(unsigned long, size_t);
+#define DO_MUNMAP_CLEARPTE (ZAP_RANGE_CLEARPTE|ZAP_RANGE_FLUSHTLB)
+extern int do_munmap(unsigned long, size_t, int);

/* filemap.c */
extern void remove_inode_page(struct page *);
@@ -370,6 +374,7 @@ static inline int expand_stack(struct vm

/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev);

/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
--- linux-2.2.5/ipc/shm.c.jj Sat Jan 2 23:01:36 1999
+++ linux-2.2.5/ipc/shm.c Wed Apr 21 10:32:07 1999
@@ -400,7 +400,7 @@ static int shm_map (struct vm_area_struc
int error;

/* clear old mappings */
- do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start);
+ do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start, DO_MUNMAP_CLEARPTE);

/* add new mapping */
tmp = shmd->vm_end - shmd->vm_start;
@@ -607,7 +607,7 @@ asmlinkage int sys_shmdt (char *shmaddr)
shmdnext = shmd->vm_next;
if (shmd->vm_ops == &shm_vm_ops
&& shmd->vm_start - shmd->vm_offset == (ulong) shmaddr)
- do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start);
+ do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start, DO_MUNMAP_CLEARPTE);
}
unlock_kernel();
up(&current->mm->mmap_sem);
--- linux-2.2.5/mm/memory.c.jj Tue Feb 2 11:49:17 1999
+++ linux-2.2.5/mm/memory.c Wed Apr 21 18:19:58 1999
@@ -71,45 +71,6 @@ void oom(struct task_struct * task)
force_sig(SIGKILL, task);
}

-/*
- * Note: this doesn't free the actual pages themselves. That
- * has been handled earlier when unmapping all the memory regions.
- */
-static inline void free_one_pmd(pmd_t * dir)
-{
- pte_t * pte;
-
- if (pmd_none(*dir))
- return;
- if (pmd_bad(*dir)) {
- printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
- pmd_clear(dir);
- return;
- }
- pte = pte_offset(dir, 0);
- pmd_clear(dir);
- pte_free(pte);
-}
-
-static inline void free_one_pgd(pgd_t * dir)
-{
- int j;
- pmd_t * pmd;
-
- if (pgd_none(*dir))
- return;
- if (pgd_bad(*dir)) {
- printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
- pgd_clear(dir);
- return;
- }
- pmd = pmd_offset(dir, 0);
- pgd_clear(dir);
- for (j = 0; j < PTRS_PER_PMD ; j++)
- free_one_pmd(pmd+j);
- pmd_free(pmd);
-}
-
/* Low and high watermarks for page table cache.
The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
*/
@@ -121,27 +82,6 @@ int check_pgt_cache(void)
return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}

-
-/*
- * This function clears all user-level page tables of a process - this
- * is needed by execve(), so that old pages aren't in the way.
- */
-void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
-{
- pgd_t * page_dir = mm->pgd;
-
- if (page_dir && page_dir != swapper_pg_dir) {
- page_dir += first;
- do {
- free_one_pgd(page_dir);
- page_dir++;
- } while (--nr);
-
- /* keep the page table cache within bounds */
- check_pgt_cache();
- }
-}
-
/*
* This function just free's the page directory - the
* pages tables themselves have been freed earlier by
@@ -322,81 +262,114 @@ static inline void forget_pte(pte_t page
}
}

-static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
+/*
+ * remove user pages in a given range.
+ *
+ * 21Apr99 Merged into one routine from several inline routines to reduce
+ * variable count and make things faster.
+ * Implement early pgtable freeing. -jj
+ */
+void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size,
+ unsigned long prevend, unsigned long nextstart, int flags)
{
+ pgd_t * pgd;
+ pmd_t * pmd;
pte_t * pte;
- int freed;
+ unsigned long end = address + size;
+ int freed = 0;

- if (pmd_none(*pmd))
- return 0;
- if (pmd_bad(*pmd)) {
- printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
- pmd_clear(pmd);
- return 0;
- }
- pte = pte_offset(pmd, address);
- address &= ~PMD_MASK;
- if (address + size > PMD_SIZE)
- size = PMD_SIZE - address;
- size >>= PAGE_SHIFT;
- freed = 0;
+ pgd = pgd_offset(mm, address)-1;
+
for (;;) {
- pte_t page;
- if (!size)
- break;
- page = *pte;
- pte++;
- size--;
- if (pte_none(page))
+ pgd++;
+
+ /* zap_pmd_range */
+ if (pgd_none(*pgd))
+ goto skip_zap_pmd_range;
+ if (pgd_bad(*pgd)) {
+ printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
+ pgd_clear(pgd);
+skip_zap_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ if (address >= end)
+ goto out2;
continue;
- pte_clear(pte-1);
- freed += free_pte(page);
- }
- return freed;
-}
+ }

-static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
-{
- pmd_t * pmd;
- unsigned long end;
- int freed;
+ pmd = pmd_offset(pgd, address);
+
+ do {
+ /* zap_pte_range */
+ if (pmd_none(*pmd))
+ goto skip_zap_pte_range;
+ if (pmd_bad(*pmd)) {
+ printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
+ pmd_clear(pmd);
+skip_zap_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+ if (address >= end) {
+ if ((flags & ZAP_RANGE_CLEARPTE) &&
+ ((address - 1) & PGDIR_MASK) >= prevend &&
+ ((end + PGDIR_SIZE - 1) & PGDIR_MASK) <= nextstart) {
+ pgd_clear(pgd);
+ pmd_free((pmd_t *)((unsigned long)pmd & ~PMD_TABLE_MASK));
+ }
+ goto out2;
+ }
+ goto cont_zap_pmd_range;
+ }

- if (pgd_none(*dir))
- return 0;
- if (pgd_bad(*dir)) {
- printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
- pgd_clear(dir);
- return 0;
- }
- pmd = pmd_offset(dir, address);
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- freed = 0;
- do {
- freed += zap_pte_range(pmd, address, end - address);
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
- return freed;
-}
+ pte = pte_offset(pmd, address);

-/*
- * remove user pages in a given range.
- */
-void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
-{
- pgd_t * dir;
- unsigned long end = address + size;
- int freed = 0;
+ do {
+ pte_t page = *pte;

- dir = pgd_offset(mm, address);
- while (address < end) {
- freed += zap_pmd_range(dir, address, end - address);
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- dir++;
+ if (pte_none(page))
+ goto cont_zap_pte_range;
+ pte_clear(pte);
+ freed += free_pte(page);
+
+cont_zap_pte_range: address += PAGE_SIZE;
+ if (address >= end)
+ goto out;
+ pte++;
+ } while ((unsigned long)pte & PTE_TABLE_MASK);
+ if (address - PMD_SIZE >= prevend &&
+ (flags & ZAP_RANGE_CLEARPTE)) {
+ pmd_clear(pmd);
+ pte_free((pte_t *)((unsigned long)pte - (PTRS_PER_PTE * sizeof(pte_t))));
+ }
+
+cont_zap_pmd_range: pmd++;
+ } while ((unsigned long)pmd & PMD_TABLE_MASK);
+ if (address - PGDIR_SIZE >= prevend &&
+ (flags & ZAP_RANGE_CLEARPTE)) {
+ pgd_clear(pgd);
+ pmd_free((pmd_t *)((unsigned long)pmd - (PTRS_PER_PMD * sizeof(pmd_t))));
+ }
}
+
+out: if (flags & ZAP_RANGE_CLEARPTE) {
+ if (((address - 1) & PMD_MASK) >= prevend &&
+ ((end + PMD_SIZE - 1) & PMD_MASK) <= nextstart) {
+ pmd_clear(pmd);
+ pte_free((pte_t *)((unsigned long)pte & ~PTE_TABLE_MASK));
+ }
+ if (((address - 1) & PGDIR_MASK) >= prevend &&
+ ((end + PGDIR_SIZE - 1) & PGDIR_MASK) <= nextstart) {
+ pgd_clear(pgd);
+ pmd_free((pmd_t *)((unsigned long)pmd & ~PMD_TABLE_MASK));
+ }
+ }
+out2: if (flags & ZAP_RANGE_FLUSHTLB) {
+ address = (end - size) & PMD_MASK;
+ if (address < prevend)
+ address += PMD_SIZE;
+ end = (end + PMD_SIZE - 1) & PMD_MASK;
+ if (end > nextstart)
+ end -= PMD_SIZE;
+ if (address < end)
+ flush_tlb_pgtables(mm, address, end);
+ }
+
/*
* Update rss for the mm_struct (not necessarily current->mm)
*/
@@ -750,7 +723,8 @@ void vmtruncate(struct inode * inode, un
/* mapping wholly truncated? */
if (mpnt->vm_offset >= offset) {
flush_cache_range(mm, start, end);
- zap_page_range(mm, start, len);
+ zap_page_range(mm, start, len,
+ start, start + len, 0);
flush_tlb_range(mm, start, end);
continue;
}
@@ -766,7 +740,8 @@ void vmtruncate(struct inode * inode, un
start = (start + ~PAGE_MASK) & PAGE_MASK;
}
flush_cache_range(mm, start, end);
- zap_page_range(mm, start, len);
+ zap_page_range(mm, start, len,
+ start, start + len, 0);
flush_tlb_range(mm, start, end);
} while ((mpnt = mpnt->vm_next_share) != NULL);
}
--- linux-2.2.5/mm/mmap.c.jj Fri Mar 12 08:38:38 1999
+++ linux-2.2.5/mm/mmap.c Wed Apr 21 18:20:21 1999
@@ -115,7 +115,7 @@ asmlinkage unsigned long sys_brk(unsigne

/* Always allow shrinking brk. */
if (brk <= mm->brk) {
- if (!do_munmap(newbrk, oldbrk-newbrk))
+ if (!do_munmap(newbrk, oldbrk-newbrk, DO_MUNMAP_CLEARPTE))
goto set_brk;
goto out;
}
@@ -175,7 +175,6 @@ unsigned long do_mmap(struct file * file
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
int error;
-
if ((len = PAGE_ALIGN(len)) == 0)
return addr;

@@ -284,26 +283,26 @@ unsigned long do_mmap(struct file * file

/* Clear old maps */
error = -ENOMEM;
- if (do_munmap(addr, len))
+ if (do_munmap(addr, len, 0))
goto free_vma;

/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
- goto free_vma;
+ goto unmap_and_free_vma;

/* Private writable mapping? Check memory availability.. */
if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
!(flags & MAP_NORESERVE) &&
!vm_enough_memory(len >> PAGE_SHIFT))
- goto free_vma;
+ goto unmap_and_free_vma;

if (file) {
int correct_wcount = 0;
if (vma->vm_flags & VM_DENYWRITE) {
if (file->f_dentry->d_inode->i_writecount > 0) {
error = -ETXTBSY;
- goto free_vma;
+ goto unmap_and_free_vma;
}
/* f_op->mmap might possibly sleep
* (generic_file_mmap doesn't, but other code
@@ -328,7 +327,7 @@ unsigned long do_mmap(struct file * file
* after the call. Save the values we need now ...
*/
flags = vma->vm_flags;
- addr = vma->vm_start; /* can addr have changed?? */
+ addr = vma->vm_start; /* can addr have changed?? Yes. -jj */
insert_vm_struct(mm, vma);
merge_segments(mm, vma->vm_start, vma->vm_end);

@@ -340,10 +339,30 @@ unsigned long do_mmap(struct file * file
return addr;

unmap_and_free_vma:
- /* Undo any partial mapping done by a device driver. */
- flush_cache_range(mm, vma->vm_start, vma->vm_end);
- zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
- flush_tlb_range(mm, vma->vm_start, vma->vm_end);
+ /* Undo any partial mapping done by a device driver.
+ * If we are past successfull do_munmap, then we have to
+ * free the page tables associated with it, as for the
+ * common case of remapping a new area over an old
+ * area we don't clear the page tables. -jj */
+ {
+ struct vm_area_struct * prev, * next;
+ unsigned long prevend, nextstart;
+ unsigned long start, end;
+
+ start = vma->vm_start;
+ if (addr < start) start = addr;
+ end = vma->vm_end;
+ if (addr + len < end) end = addr + len;
+
+ next = find_vma_prev(mm, vma->vm_start, &prev);
+ prevend = prev ? prev->vm_end : 0;
+ nextstart = next ? next->vm_start : USER_PTRS_PER_PGD * PGDIR_SIZE;
+
+ flush_cache_range(mm, start, end);
+ zap_page_range(mm, start, end - start, prevend, nextstart,
+ ZAP_RANGE_CLEARPTE|ZAP_RANGE_FLUSHTLB);
+ flush_tlb_range(mm, start, end);
+ }
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
@@ -467,130 +486,25 @@ struct vm_area_struct * find_vma_prev(st
return NULL;
}

-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function. This may be used as part of a more specific routine.
- * This function works out what part of an area is affected and
- * adjusts the mapping information. Since the actual page
- * manipulation is done in do_mmap(), none need be done here,
- * though it would probably be more appropriate.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list, so it needs to be
- * reinserted if necessary.
- *
- * The 4 main cases are:
- * Unmapping the whole area
- * Unmapping from the start of the segment to a point in it
- * Unmapping from an intermediate point to the end
- * Unmapping between to intermediate points, making a hole.
- *
- * Case 4 involves the creation of 2 new areas, for each side of
- * the hole. If possible, we reuse the existing area rather than
- * allocate a new one, and the return indicates whether the old
- * area was reused.
- */
-static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
- size_t len, struct vm_area_struct **extra)
-{
- struct vm_area_struct *mpnt;
- unsigned long end = addr + len;
-
- area->vm_mm->total_vm -= len >> PAGE_SHIFT;
- if (area->vm_flags & VM_LOCKED)
- area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
-
- /* Unmapping the whole area. */
- if (addr == area->vm_start && end == area->vm_end) {
- if (area->vm_ops && area->vm_ops->close)
- area->vm_ops->close(area);
- if (area->vm_file)
- fput(area->vm_file);
- return 0;
- }
-
- /* Work out to one of the ends. */
- if (end == area->vm_end)
- area->vm_end = addr;
- else if (addr == area->vm_start) {
- area->vm_offset += (end - area->vm_start);
- area->vm_start = end;
- } else {
- /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
- /* Add end mapping -- leave beginning for below */
- mpnt = *extra;
- *extra = NULL;
-
- mpnt->vm_mm = area->vm_mm;
- mpnt->vm_start = end;
- mpnt->vm_end = area->vm_end;
- mpnt->vm_page_prot = area->vm_page_prot;
- mpnt->vm_flags = area->vm_flags;
- mpnt->vm_ops = area->vm_ops;
- mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
- mpnt->vm_file = area->vm_file;
- mpnt->vm_pte = area->vm_pte;
- if (mpnt->vm_file)
- mpnt->vm_file->f_count++;
- if (mpnt->vm_ops && mpnt->vm_ops->open)
- mpnt->vm_ops->open(mpnt);
- area->vm_end = addr; /* Truncate area */
- insert_vm_struct(current->mm, mpnt);
- }
-
- insert_vm_struct(current->mm, area);
- return 1;
-}
-
-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
- unsigned long start, unsigned long end)
-{
- unsigned long first = start & PGDIR_MASK;
- unsigned long last = (end + PGDIR_SIZE - 1) & PGDIR_MASK;
-
- if (!prev) {
- prev = mm->mmap;
- if (!prev)
- goto no_mmaps;
- if (prev->vm_end > start) {
- if (last > prev->vm_end)
- last = prev->vm_end;
- goto no_mmaps;
- }
- }
- for (;;) {
- struct vm_area_struct *next = prev->vm_next;
-
- if (next) {
- if (next->vm_start < start) {
- prev = next;
- continue;
- }
- if (last > next->vm_start)
- last = next->vm_start;
- }
- if (prev->vm_end > first)
- first = prev->vm_end + PGDIR_SIZE - 1;
- break;
- }
-no_mmaps:
- first = first >> PGDIR_SHIFT;
- last = last >> PGDIR_SHIFT;
- if (last > first)
- clear_page_tables(mm, first, last-first);
+static void unmap_hole_fixup(struct vm_area_struct *mpnt,
+ struct vm_area_struct *extra,
+ unsigned long addr, unsigned long end)
+{
+ extra->vm_mm = mpnt->vm_mm;
+ extra->vm_start = end;
+ extra->vm_end = mpnt->vm_end;
+ extra->vm_page_prot = mpnt->vm_page_prot;
+ extra->vm_flags = mpnt->vm_flags;
+ extra->vm_ops = mpnt->vm_ops;
+ extra->vm_offset = mpnt->vm_offset + (end - mpnt->vm_start);
+ extra->vm_file = mpnt->vm_file;
+ extra->vm_pte = mpnt->vm_pte;
+ if (extra->vm_file)
+ extra->vm_file->f_count++;
+ if (extra->vm_ops && extra->vm_ops->open)
+ extra->vm_ops->open(extra);
+ mpnt->vm_end = addr; /* Truncate area */
+ insert_vm_struct(current->mm, extra);
}

/* Munmap is split into 2 main parts -- this part which finds
@@ -598,10 +512,11 @@ no_mmaps:
* work. This now handles partial unmappings.
* Jeremy Fitzhardine <jeremy@sw.oz.au>
*/
-int do_munmap(unsigned long addr, size_t len)
+int do_munmap(unsigned long addr, size_t len, int flags)
{
struct mm_struct * mm;
struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;
+ unsigned long prevend, nextstart;

if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
return -EINVAL;
@@ -637,6 +552,12 @@ int do_munmap(unsigned long addr, size_t
return -ENOMEM;

npp = (prev ? &prev->vm_next : &mm->mmap);
+
+ if (mpnt->vm_start < addr)
+ prevend = addr;
+ else
+ prevend = prev ? prev->vm_end : 0;
+
free = NULL;
for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
*npp = mpnt->vm_next;
@@ -645,6 +566,8 @@ int do_munmap(unsigned long addr, size_t
if (mm->mmap_avl)
avl_remove(mpnt, &mm->mmap_avl);
}
+
+ nextstart = mpnt ? mpnt->vm_start : USER_PTRS_PER_PGD * PGDIR_SIZE;

/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
@@ -653,6 +576,7 @@ int do_munmap(unsigned long addr, size_t
*/
while ((mpnt = free) != NULL) {
unsigned long st, end, size;
+ unsigned long nextst;

free = free->vm_next;

@@ -660,30 +584,67 @@ int do_munmap(unsigned long addr, size_t
end = addr+len;
end = end > mpnt->vm_end ? mpnt->vm_end : end;
size = end - st;
+ nextst = (free != NULL || end != mpnt->vm_end) ?
+ end : nextstart;

if (mpnt->vm_ops && mpnt->vm_ops->unmap)
mpnt->vm_ops->unmap(mpnt, st, size);

remove_shared_vm_struct(mpnt);
mm->map_count--;
-
+
flush_cache_range(mm, st, end);
- zap_page_range(mm, st, size);
+ zap_page_range(mm, st, size, prevend, nextst, flags);
flush_tlb_range(mm, st, end);

/*
* Fix the mapping, and free the old area if it wasn't reused.
+ * The 4 main cases are:
+ * Unmapping the whole area
+ * Unmapping from the start of the segment to a point in it
+ * Unmapping from an intermediate point to the end
+ * Unmapping between to intermediate points, making a hole.
+ *
+ * Case 4 involves the creation of 2 new areas, for each side of
+ * the hole. If possible, we reuse the existing area rather than
+ * allocate a new one.
+ *
+ * We handle the common cases in this routine and let the
+ * case 4 be handled out of line.
*/
- if (!unmap_fixup(mpnt, st, size, &extra))
+
+ mpnt->vm_mm->total_vm -= size >> PAGE_SHIFT;
+ if (mpnt->vm_flags & VM_LOCKED)
+ mpnt->vm_mm->locked_vm -= size >> PAGE_SHIFT;
+
+ /* Unmapping the whole area. */
+ if (st == mpnt->vm_start && end == mpnt->vm_end) {
+ if (mpnt->vm_ops && mpnt->vm_ops->close)
+ mpnt->vm_ops->close(mpnt);
+ if (mpnt->vm_file)
+ fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
+ } else {
+ /* Work out to one of the ends. */
+ if (end == mpnt->vm_end)
+ mpnt->vm_end = st;
+ else if (st == mpnt->vm_start) {
+ mpnt->vm_offset += (end - mpnt->vm_start);
+ mpnt->vm_start = end;
+ } else {
+ /* Unmapping a hole: mpnt->vm_start < st <= end < mpnt->vm_end */
+ /* Add end mapping -- leave beginning for below */
+ unmap_hole_fixup(mpnt, extra, st, end);
+ extra = NULL;
+ }
+ insert_vm_struct(current->mm, mpnt);
+ }
}

/* Release the extra vma struct if it wasn't used */
if (extra)
kmem_cache_free(vm_area_cachep, extra);

- free_pgtables(mm, prev, addr, addr+len);
-
mm->mmap_cache = NULL; /* Kill the cache. */
return 0;
}
@@ -694,7 +655,7 @@ asmlinkage int sys_munmap(unsigned long

down(&current->mm->mmap_sem);
lock_kernel();
- ret = do_munmap(addr, len);
+ ret = do_munmap(addr, len, DO_MUNMAP_CLEARPTE);
unlock_kernel();
up(&current->mm->mmap_sem);
return ret;
@@ -734,7 +695,9 @@ void exit_mmap(struct mm_struct * mm)
}
mm->map_count--;
remove_shared_vm_struct(mpnt);
- zap_page_range(mm, start, size);
+ zap_page_range(mm, start, size, 0, next ? next->vm_start :
+ USER_PTRS_PER_PGD * PGDIR_SIZE,
+ ZAP_RANGE_CLEARPTE);
if (mpnt->vm_file)
fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
@@ -744,8 +707,6 @@ void exit_mmap(struct mm_struct * mm)
/* This is just debugging */
if (mm->map_count)
printk("exit_mmap: map count is %d\n", mm->map_count);
-
- clear_page_tables(mm, 0, USER_PTRS_PER_PGD);
}

/* Insert vm structure into process list sorted by address
--- linux-2.2.5/mm/mremap.c.jj Tue Dec 1 00:44:57 1998
+++ linux-2.2.5/mm/mremap.c Wed Apr 21 10:44:42 1999
@@ -88,6 +88,8 @@ static int move_page_tables(struct mm_st
unsigned long new_addr, unsigned long old_addr, unsigned long len)
{
unsigned long offset = len;
+ unsigned long prevend, nextstart;
+ struct vm_area_struct * vma, * prev;

flush_cache_range(mm, old_addr, old_addr + len);
flush_tlb_range(mm, old_addr, old_addr + len);
@@ -112,15 +114,20 @@ static int move_page_tables(struct mm_st
* the old page tables)
*/
oops_we_failed:
+ vma = find_vma_prev(mm, new_addr, &prev);
+ prevend = prev ? prev->vm_end : 0;
+ nextstart = vma ? vma->vm_start : USER_PTRS_PER_PGD * PGDIR_SIZE;
+
flush_cache_range(mm, new_addr, new_addr + len);
while ((offset += PAGE_SIZE) < len)
move_one_page(mm, new_addr + offset, old_addr + offset);
- zap_page_range(mm, new_addr, new_addr + len);
+ zap_page_range(mm, new_addr, new_addr + len, prevend, nextstart,
+ ZAP_RANGE_CLEARPTE|ZAP_RANGE_FLUSHTLB);
flush_tlb_range(mm, new_addr, new_addr + len);
return -1;
}

-static inline unsigned long move_vma(struct vm_area_struct * vma,
+static inline unsigned long move_vma(struct vm_area_struct * vma,
unsigned long addr, unsigned long old_len, unsigned long new_len)
{
struct vm_area_struct * new_vma;
@@ -140,7 +147,7 @@ static inline unsigned long move_vma(str
new_vma->vm_ops->open(new_vma);
insert_vm_struct(current->mm, new_vma);
merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
- do_munmap(addr, old_len);
+ do_munmap(addr, old_len, DO_MUNMAP_CLEARPTE);
current->mm->total_vm += new_len >> PAGE_SHIFT;
if (new_vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += new_len >> PAGE_SHIFT;
@@ -178,7 +185,7 @@ asmlinkage unsigned long sys_mremap(unsi
*/
ret = addr;
if (old_len >= new_len) {
- do_munmap(addr+new_len, old_len - new_len);
+ do_munmap(addr+new_len, old_len - new_len, DO_MUNMAP_CLEARPTE);
goto out;
}

Cheers,
Jakub
___________________________________________________________________
Jakub Jelinek | jj@sunsite.mff.cuni.cz | http://sunsite.mff.cuni.cz
Administrator of SunSITE Czech Republic, MFF, Charles University
___________________________________________________________________
UltraLinux | http://ultra.linux.cz/ | http://ultra.penguin.cz/
Linux version 2.2.5 on a sparc64 machine (3958.37 BogoMips)
___________________________________________________________________
