This patch eliminates about 35% of the raw rmap setup/teardown overhead by
adopting a new locking interface that allows the add_rmaps to be batched in
copy_page_range. This is work in progress; I expect to show a further 35%
overhead reduction shortly, by batching the remove_rmaps as well. Further
gains will come more slowly, but I hope that an immediate 70% reduction in
overhead gets us into the doesn't-suck-too-much range, and we can move on
to other benchmarks.
This patch is against 2.4.19-pre7+rmap13b. I'll forward-port to 2.5 in due
course; in the meantime, this should let you verify my results.
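For orientation before wading into the diff, here is a minimal userspace
sketch of the new locking interface. It is an illustration only, not the
patch: pthread mutexes stand in for the kernel spinlocks and the made-up
struct upage stands in for struct page, while num_rmap_locks, rmap_locknum,
lock_rmap and set_page_index mirror the names in the patch. The idea is that
page->index hashes to one of 256 rmap locks, so a run of consecutive pages in
a mapping shares a lock and copy_page_range can hold a single lock across a
whole batch of add_rmaps; lock_rmap has to recheck page->index after taking
the lock, because set_page_index may move the page to a different lock while
we are waiting.
---------------------------
/* Userspace sketch of the hashed rmap locking interface (not kernel code). */
#include <pthread.h>
#include <stdio.h>

#define num_rmap_locks (1 << 8)

static pthread_mutex_t rmap_locks[num_rmap_locks];

struct upage {
	unsigned long index;	/* offset into the mapping, as in struct page */
};

static unsigned long rmap_locknum(unsigned long index)
{
	/* >>4: runs of 16 consecutive pages share a lock, which helps batching */
	return (index >> 4) & (num_rmap_locks - 1);
}

static pthread_mutex_t *lock_rmap(struct upage *page)
{
	for (;;) {
		unsigned long index = page->index;
		pthread_mutex_t *lock = &rmap_locks[rmap_locknum(index)];

		pthread_mutex_lock(lock);
		if (index == page->index)
			return lock;		/* index stable, we hold the right lock */
		pthread_mutex_unlock(lock);	/* raced with set_page_index, retry */
	}
}

static void set_page_index(struct upage *page, unsigned long index)
{
	pthread_mutex_t *lock = lock_rmap(page);
	page->index = index;
	pthread_mutex_unlock(lock);
}

int main(void)
{
	struct upage page = { .index = 42 };
	pthread_mutex_t *lock;
	int i;

	for (i = 0; i < num_rmap_locks; i++)
		pthread_mutex_init(&rmap_locks[i], NULL);

	lock = lock_rmap(&page);	/* rmap add/remove would happen here */
	pthread_mutex_unlock(lock);

	set_page_index(&page, 4242);
	printf("page now hashes to rmap lock %lu\n", rmap_locknum(page.index));
	return 0;
}
---------------------------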
Here is the script I used, essentially the same as the one you originally
posted, but all in one piece:
---------------------------
#!/bin/bash
doit()
{
	( cat $1 | wc -l )
}
doitlots()
{
	count=0
	while (( count<500 ))
	do
		doit $1 >/dev/null
		count=$(expr $count + 1)
	done
	echo done
}
echo hello >foobar
rm -f foocount
echo >foocount
count=0
while (( count<10 ))
do
	doitlots foobar >>foocount &
	let count++
done
count=0
while (( count<10 ))
do
	count=$(cat foocount | wc -l)
done
---------------------------
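(For scale: each doit() call forks a subshell plus a cat and a wc, and each
loop pass adds an expr and a command substitution on top, so the ten
background workers of 500 iterations apiece create tens of thousands of
short-lived processes. That is why the rmap setup and teardown cost shows up
so prominently in this workload.)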
--- 2.4.19-pre7.clean/include/linux/mm.h Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/include/linux/mm.h Fri Aug 2 17:45:04 2002
@@ -131,8 +131,10 @@
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
};
-/* forward declaration; pte_chain is meant to be internal to rmap.c */
-struct pte_chain;
+struct pte_chain {
+ struct pte_chain * next;
+ pte_t * ptep;
+};
/*
* Each physical page in the system has a struct page associated with
@@ -324,29 +326,40 @@
#define PageLaunder(page) test_bit(PG_launder, &(page)->flags)
#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags)
-/*
- * inlines for acquisition and release of PG_chainlock
- */
-static inline void pte_chain_lock(struct page *page)
+#define num_rmap_locks (1 << 8)
+
+extern spinlock_t rmap_locks[num_rmap_locks];
+
+void init_rmap_locks(void);
+
+static inline unsigned long rmap_locknum(unsigned long index)
{
- /*
- * Assuming the lock is uncontended, this never enters
- * the body of the outer loop. If it is contended, then
- * within the inner loop a non-atomic test is used to
- * busywait with less bus contention for a good time to
- * attempt to acquire the lock bit.
- */
- while (test_and_set_bit(PG_chainlock, &page->flags)) {
- while (test_bit(PG_chainlock, &page->flags))
- cpu_relax();
- }
+ return (index >> 4) & (num_rmap_locks - 1);
}
-static inline void pte_chain_unlock(struct page *page)
+static inline spinlock_t *lock_rmap(struct page *page)
{
-	clear_bit(PG_chainlock, &page->flags);
+	while (1) {
+		unsigned long index = page->index;
+		spinlock_t *lock = rmap_locks + rmap_locknum(index);
+		spin_lock(lock);
+		if (index == page->index)
+			return lock;
+		spin_unlock(lock);	/* raced with set_page_index, retry */
+	}
}
+static inline void set_page_index(struct page *page, unsigned long index)
+{
+ spinlock_t *lock = lock_rmap(page);
+ page->index = index;
+ spin_unlock(lock);
+}
+
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+void pte_chain_push(zone_t *zone, struct pte_chain *pte_chain);
+void add_rmap_nolock(struct page* page, pte_t *ptep, struct pte_chain *pte_chain);
+
/*
* The zone field is never updated after free_area_init_core()
* sets it, so none of the operations on it need to be atomic.
@@ -519,7 +532,7 @@
extern int shmem_zero_setup(struct vm_area_struct *);
extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
-extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
+extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma, unsigned *locknum);
extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
--- 2.4.19-pre7.clean/kernel/fork.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/kernel/fork.c Fri Aug 2 16:25:22 2002
@@ -132,6 +132,7 @@
{
struct vm_area_struct * mpnt, *tmp, **pprev;
int retval;
+ unsigned rmap_locknum = -1;
flush_cache_mm(current->mm);
mm->locked_vm = 0;
@@ -191,7 +192,7 @@
*pprev = tmp;
pprev = &tmp->vm_next;
mm->map_count++;
- retval = copy_page_range(mm, current->mm, tmp);
+ retval = copy_page_range(mm, current->mm, tmp, &rmap_locknum);
spin_unlock(&mm->page_table_lock);
if (tmp->vm_ops && tmp->vm_ops->open)
--- 2.4.19-pre7.clean/mm/bootmem.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/bootmem.c Fri Aug 2 16:25:22 2002
@@ -61,6 +61,8 @@
*/
memset(bdata->node_bootmem_map, 0xff, mapsize);
+	init_rmap_locks();	/* is there a better place for this? */
+
return mapsize;
}
--- 2.4.19-pre7.clean/mm/filemap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/filemap.c Fri Aug 2 16:25:22 2002
@@ -635,7 +635,7 @@
if (!PageLocked(page))
BUG();
- page->index = index;
+ set_page_index(page, index);
page_cache_get(page);
spin_lock(&pagecache_lock);
add_page_to_inode_queue(mapping, page);
@@ -658,7 +658,7 @@
flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
page->flags = flags | (1 << PG_locked);
page_cache_get(page);
- page->index = offset;
+ set_page_index(page, offset);
add_page_to_inode_queue(mapping, page);
add_page_to_hash_queue(page, hash);
}
--- 2.4.19-pre7.clean/mm/memory.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/memory.c Fri Aug 2 17:48:29 2002
@@ -176,13 +176,17 @@
* dst->page_table_lock is held on entry and exit,
* but may be dropped within pmd_alloc() and pte_alloc().
*/
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, unsigned *unused_locknum)
{
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ zone_t *pte_chain_zone = zone_table[ZONE_NORMAL];
+ struct pte_chain *local_pte_chain = NULL, *pte_chain;
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
@@ -212,6 +216,8 @@
do {
pte_t * src_pte, * dst_pte;
+ unsigned last_locknum = -1;
+ spinlock_t *rmap_lock = NULL;
/* copy_pte_range */
@@ -247,6 +253,28 @@
goto cont_copy_pte_range_noset;
}
ptepage = pte_page(pte);
+
+ if (!local_pte_chain) {
+ unsigned more = 16;
+ if (last_locknum != -1) {
+ spin_unlock(rmap_lock);
+ last_locknum = -1;
+ }
+ while (more--) {
+ struct pte_chain *new = pte_chain_alloc(pte_chain_zone);
+ new->next = local_pte_chain;
+ local_pte_chain = new;
+ }
+ }
+
+ if (last_locknum != rmap_locknum(ptepage->index)) {
+			if (last_locknum != -1)
+				spin_unlock(rmap_lock);
+ rmap_lock = lock_rmap(ptepage);
+ last_locknum = rmap_locknum(ptepage->index);
+ }
if ((!VALID_PAGE(ptepage)) ||
PageReserved(ptepage))
goto cont_copy_pte_range;
@@ -265,15 +293,24 @@
dst->rss++;
cont_copy_pte_range: set_pte(dst_pte, pte);
- page_add_rmap(ptepage, dst_pte);
+ pte_chain = local_pte_chain;
+ local_pte_chain = local_pte_chain->next;
+ add_rmap_nolock(ptepage, dst_pte, pte_chain);
+
cont_copy_pte_range_noset: address += PAGE_SIZE;
- if (address >= end)
+ if (address >= end) {
+ if (last_locknum != -1)
+ spin_unlock(rmap_lock);
goto out_unlock;
+ }
src_pte++;
dst_pte++;
} while ((unsigned long)src_pte & PTE_TABLE_MASK);
spin_unlock(&src->page_table_lock);
-
+
+ if (last_locknum != -1)
+ spin_unlock(rmap_lock);
+
cont_copy_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
@@ -281,6 +318,13 @@
out_unlock:
spin_unlock(&src->page_table_lock);
out:
+ spin_lock(&pte_chain_zone->pte_chain_freelist_lock);
+ while (local_pte_chain) {
+ struct pte_chain *next = local_pte_chain->next;
+ pte_chain_push(pte_chain_zone, local_pte_chain);
+ local_pte_chain = next;
+ }
+ spin_unlock(&pte_chain_zone->pte_chain_freelist_lock);
return 0;
nomem:
return -ENOMEM;
@@ -1518,3 +1562,4 @@
}
return page;
}
+
--- 2.4.19-pre7.clean/mm/page_alloc.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/page_alloc.c Fri Aug 2 17:49:36 2002
@@ -213,6 +213,7 @@
if (curr != head) {
unsigned int index;
+ static unsigned foo_page_allocs;
page = memlist_entry(curr, struct page, list);
if (BAD_RANGE(zone,page))
@@ -227,6 +228,7 @@
spin_unlock_irqrestore(&zone->lock, flags);
set_page_count(page, 1);
+ page->index = foo_page_allocs++ >> PAGE_CACHE_SHIFT;
if (BAD_RANGE(zone,page))
BUG();
DEBUG_LRU_PAGE(page);
--- 2.4.19-pre7.clean/mm/rmap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/rmap.c Fri Aug 2 17:33:51 2002
@@ -43,16 +43,20 @@
* in systems with long-lived applications the relative overhead of
* exit() will be lower since the applications are long-lived.
*/
-struct pte_chain {
- struct pte_chain * next;
- pte_t * ptep;
-};
-static inline struct pte_chain * pte_chain_alloc(zone_t *);
+spinlock_t rmap_locks[num_rmap_locks];
+
static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
struct page *, zone_t *);
static void alloc_new_pte_chains(zone_t *);
+void init_rmap_locks(void)
+{
+ int i = 0;
+ while (i < num_rmap_locks)
+ spin_lock_init(rmap_locks + i++);
+}
+
/**
* page_referenced - test if the page was referenced
* @page: the page to test
@@ -86,9 +90,10 @@
* Add a new pte reverse mapping to a page.
* The caller needs to hold the mm->page_table_lock.
*/
-void page_add_rmap(struct page * page, pte_t * ptep)
+void page_add_rmap(struct page *page, pte_t *ptep)
{
- struct pte_chain * pte_chain;
+ struct pte_chain *pte_chain;
+ spinlock_t *rmap_lock;
#ifdef DEBUG_RMAP
if (!page || !ptep)
@@ -103,7 +108,7 @@
return;
#ifdef DEBUG_RMAP
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
{
struct pte_chain * pc;
for (pc = page->pte_chain; pc; pc = pc->next) {
@@ -111,19 +116,48 @@
BUG();
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
#endif
pte_chain = pte_chain_alloc(page_zone(page));
-
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
/* Hook up the pte_chain to the page. */
pte_chain->ptep = ptep;
pte_chain->next = page->pte_chain;
page->pte_chain = pte_chain;
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
+}
+
+void add_rmap_nolock(struct page* page, pte_t *ptep, struct pte_chain *pte_chain)
+{
+#ifdef DEBUG_RMAP
+ if (!page || !ptep)
+ BUG();
+ if (!pte_present(*ptep))
+ BUG();
+	if (!ptep_to_mm(ptep))
+		BUG();
+#endif
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+#ifdef DEBUG_RMAP
+ {
+ struct pte_chain * pc;
+ for (pc = page->pte_chain; pc; pc = pc->next) {
+ if (pc->ptep == ptep)
+ BUG();
+ }
+ }
+#endif
+
+ /* Hook up the pte_chain to the page. */
+ pte_chain->ptep = ptep;
+ pte_chain->next = page->pte_chain;
+ page->pte_chain = pte_chain;
}
/**
@@ -140,6 +174,7 @@
{
struct pte_chain * pc, * prev_pc = NULL;
zone_t *zone;
+ spinlock_t *rmap_lock;
if (!page || !ptep)
BUG();
@@ -148,7 +183,7 @@
zone = page_zone(page);
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
if (pc->ptep == ptep) {
pte_chain_free(pc, prev_pc, page, zone);
@@ -166,7 +201,7 @@
#endif
out:
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
return;
}
@@ -335,8 +370,7 @@
** functions.
**/
-static inline void pte_chain_push(zone_t * zone,
- struct pte_chain * pte_chain)
+void pte_chain_push(zone_t *zone, struct pte_chain *pte_chain)
{
pte_chain->ptep = NULL;
pte_chain->next = zone->pte_chain_freelist;
@@ -388,7 +422,7 @@
* pte_chain structures as required.
* Caller needs to hold the page's pte_chain_lock.
*/
-static inline struct pte_chain * pte_chain_alloc(zone_t * zone)
+struct pte_chain *pte_chain_alloc(zone_t *zone)
{
struct pte_chain * pte_chain;
--- 2.4.19-pre7.clean/mm/swap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/swap.c Fri Aug 2 17:35:47 2002
@@ -78,6 +78,8 @@
*/
void drop_page(struct page * page)
{
+ spinlock_t *rmap_lock;
+
if (!TryLockPage(page)) {
if (page->mapping && page->buffers) {
page_cache_get(page);
@@ -90,7 +92,7 @@
}
/* Make sure the page really is reclaimable. */
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (!page->mapping || PageDirty(page) || page->pte_chain ||
page->buffers || page_count(page) > 1)
deactivate_page_nolock(page);
@@ -106,7 +108,7 @@
add_page_to_inactive_clean_list(page);
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
}
/*
--- 2.4.19-pre7.clean/mm/vmscan.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/vmscan.c Fri Aug 2 17:33:46 2002
@@ -81,6 +81,7 @@
struct list_head * page_lru;
swp_entry_t entry = {0};
int maxscan;
+ spinlock_t *rmap_lock;
/*
* We need to hold the pagecache_lock around all tests to make sure
@@ -109,13 +110,13 @@
}
/* Page cannot be reclaimed ? Move to inactive_dirty list. */
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (unlikely(page->pte_chain || page->buffers ||
PageReferenced(page) || PageDirty(page) ||
page_count(page) > 1 || TryLockPage(page))) {
del_page_from_inactive_clean_list(page);
add_page_to_inactive_dirty_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
continue;
}
@@ -140,7 +141,7 @@
printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
list_del(page_lru);
zone->inactive_clean_pages--;
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
}
spin_unlock(&pagecache_lock);
@@ -150,7 +151,7 @@
found_page:
__lru_cache_del(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
spin_unlock(&pagecache_lock);
spin_unlock(&pagemap_lru_lock);
if (entry.val)
@@ -213,6 +214,7 @@
{
int maxscan, cleaned_pages, target;
struct list_head * entry;
+ spinlock_t *rmap_lock;
target = free_plenty(zone);
cleaned_pages = 0;
@@ -268,17 +270,17 @@
* The page is in active use or really unfreeable. Move to
* the active list and adjust the page age if needed.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page_referenced(page) && page_mapping_inuse(page) &&
!page_over_rsslimit(page)) {
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
page->age = max((int)page->age, PAGE_AGE_START);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
continue;
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
/*
* Anonymous process memory without backing store. Try to
@@ -286,10 +288,10 @@
*
* XXX: implement swap clustering ?
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page->pte_chain && !page->mapping && !page->buffers) {
page_cache_get(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
spin_unlock(&pagemap_lru_lock);
if (!add_to_swap(page)) {
activate_page(page);
@@ -300,7 +302,7 @@
}
page_cache_release(page);
spin_lock(&pagemap_lru_lock);
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
}
/*
@@ -313,14 +315,14 @@
case SWAP_FAIL:
goto page_active;
case SWAP_AGAIN:
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
continue;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
if (PageDirty(page) && page->mapping) {
/*
@@ -407,12 +409,12 @@
* This test is not safe from races, but only the one
* in reclaim_page() needs to be.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (page->mapping && !PageDirty(page) && !page->pte_chain &&
page_count(page) == 1) {
del_page_from_inactive_dirty_list(page);
add_page_to_inactive_clean_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
cleaned_pages++;
} else {
@@ -424,7 +426,7 @@
page_active:
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
}
}
@@ -476,6 +478,7 @@
struct list_head * page_lru;
int nr_deactivated = 0;
struct page * page;
+ spinlock_t *rmap_lock;
/* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
@@ -505,9 +508,9 @@
* From here until the end of the current iteration
* both PG_locked and the pte_chain_lock are held.
*/
- pte_chain_lock(page);
+ rmap_lock = lock_rmap(page);
if (!page_mapping_inuse(page)) {
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
drop_page(page);
continue;
@@ -533,12 +536,12 @@
} else {
deactivate_page_nolock(page);
if (++nr_deactivated > target) {
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
goto done;
}
}
- pte_chain_unlock(page);
+ spin_unlock(rmap_lock);
UnlockPage(page);
/* Low latency reschedule point */
-