Re: page fault scalability patch final: i386 tested, x86_64 support added

From: Christoph Lameter
Date: Fri Aug 27 2004 - 22:37:15 EST


So is the move to an atomic counter for rss acceptable?

The patch follows against 2.6.9-rc1-mm1 (the earlier post was against
2.6.9-rc1).
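
The core of the change: mm->rss is no longer protected by the
page_table_lock but is maintained as an atomic counter instead. A minimal
sketch of the before/after pattern (field and helper names as used in the
patch below):

    /* Before: the counter is only safe under the page_table_lock */
    spin_lock(&mm->page_table_lock);
    mm->rss++;
    spin_unlock(&mm->page_table_lock);

    /* After: lockless updates through the atomic_t accessors */
    atomic_inc(&mm->mm_rss);                         /* one page mapped */
    atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss); /* a huge page */
    pages = atomic_read(&mm->mm_rss);                /* e.g. for /proc */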

Also, here are some numbers (64GB allocations) on a 512-CPU system with
500GB of memory. The patch roughly doubles performance at high CPU
counts:

Gb Rep Threads User System Wall flt/cpu/s fault/wsec
64 3 1 2.940s 308.449s 311.024s 40408.876 40427.880
64 3 2 3.146s 347.536s 181.031s 35881.179 69397.783
64 3 4 3.104s 356.892s 119.032s 34952.800 105447.468
64 3 8 3.009s 334.970s 63.070s 37229.683 197513.752
64 3 16 3.150s 419.547s 44.016s 29768.126 284913.189
64 3 32 4.471s 713.090s 40.088s 17535.647 307728.613
64 3 64 9.967s 1777.146s 45.027s 7040.911 277906.805
64 3 128 23.077s 5925.452s 63.064s 2115.298 197709.241
64 3 256 27.838s 3185.475s 34.005s 3915.867 369527.746
64 3 512 25.287s 1500.589s 16.078s 8246.349 749845.256

Without the patch:

Gb Rep Threads User System Wall flt/cpu/s fault/wsec
64 3 1 2.906s 306.330s 309.027s 40690.257 40684.877
64 3 2 3.162s 406.475s 211.037s 30717.157 59528.843
64 3 4 2.988s 468.129s 143.018s 26708.653 87878.947
64 3 8 3.096s 643.947s 95.081s 19446.756 131318.325
64 3 16 3.635s 1211.389s 96.014s 10356.093 130875.623
64 3 32 20.920s 1277.679s 98.037s 9689.596 127908.850
64 3 64 66.025s 1327.081s 104.032s 9032.264 120615.748
64 3 128 285.900s 1302.198s 114.060s 7923.252 109790.867
64 3 256 307.222s 672.339s 69.094s 12845.447 179906.645
64 3 512 255.730s 318.023s 40.042s 21930.841 311235.794

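The fault handlers likewise no longer take the page_table_lock: a new pte
is installed with an atomic compare-and-exchange, and we back out if
another CPU faulted in the same pte first. A rough sketch of the pattern
(simplified from the handle_pte_fault changes below; error paths omitted):

    pte_t entry = *pte;        /* snapshot taken without the lock */
    pte_t new_entry = pte_mkyoung(entry);

    if (write_access)
        new_entry = pte_mkdirty(new_entry);
    /* Install only if the pte still matches our snapshot */
    if (ptep_cmpxchg(pte, entry, new_entry))
        update_mmu_cache(vma, address, new_entry);
    /* On failure another CPU raced us and there is nothing to do */

The same idea covers page table allocation through the new
pmd_test_and_populate()/pgd_test_and_populate() primitives.
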
Signed-off-by: Christoph Lameter <clameter@xxxxxxx>

Index: linux-2.6.9-rc1/kernel/fork.c
===================================================================
--- linux-2.6.9-rc1.orig/kernel/fork.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/kernel/fork.c 2004-08-27 19:10:34.000000000 -0700
@@ -302,7 +302,7 @@
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->map_count = 0;
- mm->rss = 0;
+ atomic_set(&mm->mm_rss, 0);
cpus_clear(mm->cpu_vm_mask);
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
Index: linux-2.6.9-rc1/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc1.orig/include/linux/sched.h 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/include/linux/sched.h 2004-08-27 19:12:01.000000000 -0700
@@ -213,9 +213,10 @@
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ atomic_t mm_rss; /* Number of pages used by this mm struct */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
- spinlock_t page_table_lock; /* Protects task page tables and mm->rss */
+ spinlock_t page_table_lock; /* Protects task page tables */

struct list_head mmlist; /* List of all active mm's. These are globally strung
* together off init_mm.mmlist, and are protected
@@ -225,7 +226,7 @@
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long rlimit_rss, rss, total_vm, locked_vm, shared_vm;
+ unsigned long rlimit_rss, total_vm, locked_vm, shared_vm;
unsigned long exec_vm, stack_vm, reserved_vm, def_flags;

unsigned long saved_auxv[40]; /* for /proc/PID/auxv */
Index: linux-2.6.9-rc1/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/proc/task_mmu.c 2004-08-27 19:09:54.000000000 -0700
+++ linux-2.6.9-rc1/fs/proc/task_mmu.c 2004-08-27 19:31:28.000000000 -0700
@@ -21,7 +21,7 @@
"VmLib:\t%8lu kB\n",
(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
- mm->rss << (PAGE_SHIFT-10),
+ (unsigned long) atomic_read(&mm->mm_rss) << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib);
return buffer;
@@ -38,7 +38,7 @@
*shared = mm->shared_vm;
*text = (mm->end_code - mm->start_code) >> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm - *text;
- *resident = mm->rss;
+ *resident = atomic_read(&mm->mm_rss);
return mm->total_vm;
}

Index: linux-2.6.9-rc1/mm/mmap.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/mmap.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/mm/mmap.c 2004-08-27 19:10:34.000000000 -0700
@@ -1843,7 +1843,7 @@
vma = mm->mmap;
mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
- mm->rss = 0;
+ atomic_set(&mm->mm_rss, 0);
mm->total_vm = 0;
mm->locked_vm = 0;

Index: linux-2.6.9-rc1/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-generic/tlb.h 2004-08-24 00:01:50.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-generic/tlb.h 2004-08-27 19:10:34.000000000 -0700
@@ -88,11 +88,11 @@
{
int freed = tlb->freed;
struct mm_struct *mm = tlb->mm;
- int rss = mm->rss;
+ int rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_set(&mm->mm_rss, rss - freed);
tlb_flush_mmu(tlb, start, end);

/* keep the page table cache within bounds */
Index: linux-2.6.9-rc1/fs/binfmt_flat.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/binfmt_flat.c 2004-08-24 00:01:50.000000000 -0700
+++ linux-2.6.9-rc1/fs/binfmt_flat.c 2004-08-27 19:10:34.000000000 -0700
@@ -650,7 +650,7 @@
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
}

if (flags & FLAT_FLAG_KTRACE)
Index: linux-2.6.9-rc1/fs/exec.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/exec.c 2004-08-27 19:09:53.000000000 -0700
+++ linux-2.6.9-rc1/fs/exec.c 2004-08-27 19:10:34.000000000 -0700
@@ -320,7 +320,7 @@
pte_unmap(pte);
goto out;
}
- mm->rss++;
+ atomic_inc(&mm->mm_rss);
lru_cache_add_active(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
Index: linux-2.6.9-rc1/mm/memory.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/memory.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/mm/memory.c 2004-08-27 19:10:34.000000000 -0700
@@ -158,9 +158,7 @@
if (!pmd_present(*pmd)) {
struct page *new;

- spin_unlock(&mm->page_table_lock);
new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
if (!new)
return NULL;

@@ -172,8 +170,11 @@
pte_free(new);
goto out;
}
+ if (!pmd_test_and_populate(mm, pmd, new)) {
+ pte_free(new);
+ goto out;
+ }
inc_page_state(nr_page_table_pages);
- pmd_populate(mm, pmd, new);
}
out:
return pte_offset_map(pmd, address);
@@ -325,7 +326,7 @@
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
- dst->rss++;
+ atomic_inc(&dst->mm_rss);
set_pte(dst_pte, pte);
page_dup_rmap(page);
cont_copy_pte_range_noset:
@@ -1117,7 +1118,7 @@
page_table = pte_offset_map(pmd, address);
if (likely(pte_same(*page_table, pte))) {
if (PageReserved(old_page))
- ++mm->rss;
+ atomic_inc(&mm->mm_rss);
else
page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
@@ -1335,8 +1336,7 @@
}

/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore.
*/
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
@@ -1348,15 +1348,13 @@
int ret = VM_FAULT_MINOR;

pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte.
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1399,7 +1397,7 @@
if (vm_swap_full())
remove_exclusive_swap_page(page);

- mm->rss++;
+ atomic_inc(&mm->mm_rss);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1427,14 +1425,12 @@
}

/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We are called with the MM semaphore held.
*/
static int
do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+ unsigned long addr, pte_t orig_entry)
{
pte_t entry;
struct page * page = ZERO_PAGE(addr);
@@ -1446,7 +1442,6 @@
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
@@ -1455,30 +1450,40 @@
goto no_mem;
clear_user_highpage(page, addr);

- spin_lock(&mm->page_table_lock);
+ lock_page(page);
page_table = pte_offset_map(pmd, addr);

- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
- page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- mm->rss++;
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
- lru_cache_add_active(page);
mark_page_accessed(page);
- page_add_anon_rmap(page, vma, addr);
}

- set_pte(page_table, entry);
+ /* update the entry */
+ if (!ptep_cmpxchg(page_table, orig_entry, entry)) {
+ if (write_access) {
+ pte_unmap(page_table);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ goto out;
+ }
+ if (write_access) {
+ /*
+ * The following two functions are safe to use without
+ * the page_table_lock but do they need to come before
+ * the cmpxchg?
+ */
+ lru_cache_add_active(page);
+ page_add_anon_rmap(page, vma, addr);
+ atomic_inc(&mm->mm_rss);
+ unlock_page(page);
+ }
pte_unmap(page_table);

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
- spin_unlock(&mm->page_table_lock);
out:
return VM_FAULT_MINOR;
no_mem:
@@ -1494,12 +1499,12 @@
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * This is called with the MM semaphore held.
*/
static int
do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *page_table,
+ pmd_t *pmd, pte_t orig_entry)
{
struct page * new_page;
struct address_space *mapping = NULL;
@@ -1510,9 +1515,8 @@

if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
+ pmd, write_access, address, orig_entry);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);

if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1573,7 +1577,7 @@
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
if (!PageReserved(new_page))
- ++mm->rss;
+ atomic_inc(&mm->mm_rss);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
@@ -1610,7 +1614,7 @@
* nonlinear vmas.
*/
static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+ unsigned long address, int write_access, pte_t *pte, pmd_t *pmd, pte_t entry)
{
unsigned long pgoff;
int err;
@@ -1623,13 +1627,12 @@
if (!vma->vm_ops || !vma->vm_ops->populate ||
(write_access && !(vma->vm_flags & VM_SHARED))) {
pte_clear(pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
}

pgoff = pte_to_pgoff(*pte);

pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);

err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
@@ -1648,49 +1651,54 @@
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
+ * Note that kswapd only ever _removes_ pages, never adds them.
+ * We need to make sure that this case is handled properly.
+ *
* The adding of pages is protected by the MM semaphore (which we hold),
* so we don't need to worry about a page being suddenly been added into
* our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t *pte, pmd_t *pmd)
{
pte_t entry;
+ pte_t new_entry;

entry = *pte;
if (!pte_present(entry)) {
/*
* If it truly wasn't present, we know that kswapd
* and the PTE updates will not touch it later. So
- * drop the lock.
+ * there is no need to acquire the page_table_lock.
*/
if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
+ return do_file_page(mm, vma, address, write_access, pte, pmd, entry);
return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
}

+ /*
+ * This is the case in which we may only update some bits in the pte.
+ *
+ * The following statement was removed:
+ * ptep_set_access_flags(vma, address, pte, new_entry, write_access);
+ * It is not clear whether all of its side effects are replicated here
+ * on all platforms.
+ *
+ */
+ new_entry = pte_mkyoung(entry);
if (write_access) {
- if (!pte_write(entry))
+ if (!pte_write(entry)) {
+ /* do_wp_page expects us to hold the page_table_lock */
+ spin_lock(&mm->page_table_lock);
return do_wp_page(mm, vma, address, pte, pmd, entry);
-
- entry = pte_mkdirty(entry);
+ }
+ new_entry = pte_mkdirty(new_entry);
}
- entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
+ if (ptep_cmpxchg(pte, entry, new_entry))
+ update_mmu_cache(vma, address, new_entry);
pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_MINOR;
}

@@ -1708,30 +1716,27 @@

inc_page_state(pgfault);

- if (is_vm_hugetlb_page(vma))
+ if (unlikely(is_vm_hugetlb_page(vma)))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */

/*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
+ * We rely on the mmap_sem and on SMP-safe atomic PTE updates
+ * to synchronize with kswapd.
*/
- spin_lock(&mm->page_table_lock);
pmd = pmd_alloc(mm, pgd, address);

- if (pmd) {
+ if (likely(pmd)) {
pte_t * pte = pte_alloc_map(mm, pmd, address);
- if (pte)
+ if (likely(pte))
return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
}
- spin_unlock(&mm->page_table_lock);
return VM_FAULT_OOM;
}

/*
* Allocate page middle directory.
*
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*
* On a two-level page table, this ends up actually being entirely
* optimized away.
@@ -1740,9 +1745,7 @@
{
pmd_t *new;

- spin_unlock(&mm->page_table_lock);
new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
if (!new)
return NULL;

@@ -1754,7 +1757,11 @@
pmd_free(new);
goto out;
}
- pgd_populate(mm, pgd, new);
+ /* Ensure that the update is done atomically */
+ if (!pgd_test_and_populate(mm, pgd, new)) {
+ pmd_free(new);
+ goto out;
+ }
out:
return pmd_offset(pgd, address);
}
Index: linux-2.6.9-rc1/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-ia64/pgalloc.h 2004-08-24 00:01:52.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-ia64/pgalloc.h 2004-08-27 19:10:34.000000000 -0700
@@ -34,6 +34,10 @@
#define pmd_quicklist (local_cpu_data->pmd_quick)
#define pgtable_cache_size (local_cpu_data->pgtable_cache_sz)

+/* Empty entries of PMD and PGD */
+#define PMD_NONE 0
+#define PTE_NONE 0
+
static inline pgd_t*
pgd_alloc_one_fast (struct mm_struct *mm)
{
@@ -84,6 +88,11 @@
pgd_val(*pgd_entry) = __pa(pmd);
}

+static inline int
+pgd_test_and_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd)
+{
+ return ia64_cmpxchg8_acq(pgd_entry,__pa(pmd), PMD_NONE) == PMD_NONE;
+}

static inline pmd_t*
pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
@@ -132,6 +141,12 @@
pmd_val(*pmd_entry) = page_to_phys(pte);
}

+static inline int
+pmd_test_and_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
+{
+ return ia64_cmpxchg8_acq(pmd_entry, page_to_phys(pte), PTE_NONE) == PTE_NONE;
+}
+
static inline void
pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
{
Index: linux-2.6.9-rc1/include/asm-i386/pgtable.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/pgtable.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/pgtable.h 2004-08-27 19:10:34.000000000 -0700
@@ -409,9 +409,11 @@
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define __HAVE_ARCH_PTEP_XCHG
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
+#define __HAVE_ARCH_PTEP_CMPXCHG
#include <asm-generic/pgtable.h>

#endif /* _I386_PGTABLE_H */
Index: linux-2.6.9-rc1/include/asm-i386/pgtable-3level.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/pgtable-3level.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/pgtable-3level.h 2004-08-27 19:10:34.000000000 -0700
@@ -6,7 +6,8 @@
* tables on PPro+ CPUs.
*
* Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx>
- */
+ * August 26, 2004 added ptep_cmpxchg and ptep_xchg <christoph@xxxxxxxxxxx>
+ */

#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
@@ -88,6 +89,26 @@
return res;
}

+static inline pte_t ptep_xchg(pte_t *ptep, pte_t newval)
+{
+ pte_t res;
+
+ /* xchg acts as a barrier before the setting of the high bits.
+ * (But we also have a cmpxchg8b. Why not use that? (cl))
+ */
+ res.pte_low = xchg(&ptep->pte_low, newval.pte_low);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = newval.pte_high;
+
+ return res;
+}
+
+
+static inline int ptep_cmpxchg(pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ /* PAE ptes are 64 bits wide, so a plain cmpxchg would truncate them */
+ return cmpxchg8b((unsigned long long *)ptep, pte_val(oldval), pte_val(newval)) == pte_val(oldval);
+}
+
static inline int pte_same(pte_t a, pte_t b)
{
return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
Index: linux-2.6.9-rc1/fs/binfmt_som.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/binfmt_som.c 2004-08-24 00:02:26.000000000 -0700
+++ linux-2.6.9-rc1/fs/binfmt_som.c 2004-08-27 19:10:34.000000000 -0700
@@ -259,7 +259,7 @@
create_som_tables(bprm);

current->mm->start_stack = bprm->p;
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);

#if 0
printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
Index: linux-2.6.9-rc1/mm/fremap.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/fremap.c 2004-08-24 00:01:51.000000000 -0700
+++ linux-2.6.9-rc1/mm/fremap.c 2004-08-27 19:10:34.000000000 -0700
@@ -38,7 +38,7 @@
set_page_dirty(page);
page_remove_rmap(page);
page_cache_release(page);
- mm->rss--;
+ atomic_dec(&mm->mm_rss);
}
}
} else {
@@ -86,7 +86,7 @@

zap_pte(mm, vma, addr, pte);

- mm->rss++;
+ atomic_inc(&mm->mm_rss);
flush_icache_page(vma, page);
set_pte(pte, mk_pte(page, prot));
page_add_file_rmap(page);
Index: linux-2.6.9-rc1/mm/swapfile.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/swapfile.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/mm/swapfile.c 2004-08-27 19:10:34.000000000 -0700
@@ -430,7 +430,7 @@
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
swp_entry_t entry, struct page *page)
{
- vma->vm_mm->rss++;
+ atomic_inc(&vma->vm_mm->mm_rss);
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_anon_rmap(page, vma, address);
Index: linux-2.6.9-rc1/include/linux/mm.h
===================================================================
--- linux-2.6.9-rc1.orig/include/linux/mm.h 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/include/linux/mm.h 2004-08-27 19:10:34.000000000 -0700
@@ -622,7 +622,7 @@
*/
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- if (pgd_none(*pgd))
+ if (unlikely(pgd_none(*pgd)))
return __pmd_alloc(mm, pgd, address);
return pmd_offset(pgd, address);
}
Index: linux-2.6.9-rc1/include/asm-generic/pgtable.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-generic/pgtable.h 2004-08-24 00:02:25.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-generic/pgtable.h 2004-08-27 19:10:34.000000000 -0700
@@ -85,6 +85,20 @@
}
#endif

+#ifndef __HAVE_ARCH_PTEP_XCHG
+/* Implementation not suitable for SMP */
+static inline pte_t ptep_xchg(pte_t *ptep, pte_t pteval)
+{
+ unsigned long flags;
+ pte_t pte;
+
+ local_irq_save(flags);
+ pte = *ptep;
+ set_pte(ptep, pteval);
+ local_irq_restore(flags);
+ return pte;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
#define ptep_clear_flush(__vma, __address, __ptep) \
({ \
@@ -94,6 +108,31 @@
})
#endif

+#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
+#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
+({ \
+ pte_t __pte = ptep_xchg(__ptep, __pteval); \
+ flush_tlb_page(__vma, __address); \
+ __pte; \
+})
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CMPXCHG
+/* Implementation is not suitable for SMP */
+static inline int ptep_cmpxchg(pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ unsigned long flags;
+ pte_t val;
+
+ local_irq_save(flags);
+ val = *ptep;
+ if (pte_same(val, oldval))
+ set_pte(ptep, newval);
+ local_irq_restore(flags);
+ return pte_same(val, oldval);
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(pte_t *ptep)
{
Index: linux-2.6.9-rc1/fs/binfmt_aout.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/binfmt_aout.c 2004-08-27 19:09:51.000000000 -0700
+++ linux-2.6.9-rc1/fs/binfmt_aout.c 2004-08-27 19:10:34.000000000 -0700
@@ -309,7 +309,7 @@
(current->mm->start_brk = N_BSSADDR(ex));
current->mm->free_area_cache = current->mm->mmap_base;

- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
Index: linux-2.6.9-rc1/include/asm-i386/pgtable-2level.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/pgtable-2level.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/pgtable-2level.h 2004-08-27 19:10:34.000000000 -0700
@@ -40,6 +40,8 @@
return (pmd_t *) dir;
}
#define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0))
+#define ptep_xchg(xp,a) __pte(xchg(&(xp)->pte_low, (a).pte_low))
+#define ptep_cmpxchg(xp,oldpte,newpte) (cmpxchg(&(xp)->pte_low, (oldpte).pte_low, (newpte).pte_low)==(oldpte).pte_low)
#define pte_same(a, b) ((a).pte_low == (b).pte_low)
#define pte_page(x) pfn_to_page(pte_pfn(x))
#define pte_none(x) (!(x).pte_low)
Index: linux-2.6.9-rc1/arch/ia64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9-rc1.orig/arch/ia64/mm/hugetlbpage.c 2004-08-24 00:02:47.000000000 -0700
+++ linux-2.6.9-rc1/arch/ia64/mm/hugetlbpage.c 2004-08-27 19:10:34.000000000 -0700
@@ -65,7 +65,7 @@
{
pte_t entry;

- mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &mm->mm_rss);
if (write_access) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -108,7 +108,7 @@
ptepage = pte_page(entry);
get_page(ptepage);
set_pte(dst_pte, entry);
- dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ atomic_add(HPAGE_SIZE / PAGE_SIZE, &dst->mm_rss);
addr += HPAGE_SIZE;
}
return 0;
@@ -249,7 +249,7 @@
put_page(page);
pte_clear(pte);
}
- mm->rss -= (end - start) >> PAGE_SHIFT;
+ atomic_sub((end - start) >> PAGE_SHIFT, &mm->mm_rss);
flush_tlb_range(vma, start, end);
}

Index: linux-2.6.9-rc1/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-ia64/pgtable.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-ia64/pgtable.h 2004-08-27 19:10:34.000000000 -0700
@@ -387,6 +387,26 @@
#endif
}

+static inline pte_t
+ptep_xchg (pte_t *ptep,pte_t pteval)
+{
+#ifdef CONFIG_SMP
+ return __pte(xchg((long *) ptep, pteval.pte));
+#else
+ pte_t pte = *ptep;
+ set_pte(ptep,pteval);
+ return pte;
+#endif
+}
+
+#ifdef CONFIG_SMP
+static inline int
+ptep_cmpxchg (pte_t *ptep, pte_t oldval, pte_t newval)
+{
+ return ia64_cmpxchg8_acq(&ptep->pte, newval.pte, oldval.pte) == oldval.pte;
+}
+#endif
+
static inline void
ptep_set_wrprotect (pte_t *ptep)
{
@@ -554,10 +574,14 @@
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define __HAVE_ARCH_PTEP_XCHG
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
#define __HAVE_ARCH_PGD_OFFSET_GATE
+#ifdef CONFIG_SMP
+#define __HAVE_ARCH_PTEP_CMPXCHG
+#endif
#include <asm-generic/pgtable.h>

#endif /* _ASM_IA64_PGTABLE_H */
Index: linux-2.6.9-rc1/fs/proc/array.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/proc/array.c 2004-08-27 19:09:54.000000000 -0700
+++ linux-2.6.9-rc1/fs/proc/array.c 2004-08-27 19:10:34.000000000 -0700
@@ -387,7 +387,7 @@
jiffies_to_clock_t(task->it_real_value),
start_time,
vsize,
- mm ? mm->rss : 0, /* you might want to shift this left 3 */
+ mm ? (unsigned long)atomic_read(&mm->mm_rss) : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
Index: linux-2.6.9-rc1/fs/binfmt_elf.c
===================================================================
--- linux-2.6.9-rc1.orig/fs/binfmt_elf.c 2004-08-27 19:09:53.000000000 -0700
+++ linux-2.6.9-rc1/fs/binfmt_elf.c 2004-08-27 19:18:15.000000000 -0700
@@ -708,7 +708,7 @@

/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->rss = 0;
+ atomic_set(&current->mm->mm_rss, 0);
current->mm->free_area_cache = current->mm->mmap_base;
retval = setup_arg_pages(bprm, executable_stack);
if (retval < 0) {
Index: linux-2.6.9-rc1/include/asm-ia64/tlb.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-ia64/tlb.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-ia64/tlb.h 2004-08-27 19:10:34.000000000 -0700
@@ -46,6 +46,7 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/machvec.h>
+#include <asm/atomic.h>

#ifdef CONFIG_SMP
# define FREE_PTE_NR 2048
@@ -161,11 +162,11 @@
{
unsigned long freed = tlb->freed;
struct mm_struct *mm = tlb->mm;
- unsigned long rss = mm->rss;
+ unsigned long rss = atomic_read(&mm->mm_rss);

if (rss < freed)
freed = rss;
- mm->rss = rss - freed;
+ atomic_set(&mm->mm_rss, rss - freed);
/*
* Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
* tlb->end_addr.
Index: linux-2.6.9-rc1/include/asm-i386/pgalloc.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/pgalloc.h 2004-08-24 00:01:53.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/pgalloc.h 2004-08-27 19:10:34.000000000 -0700
@@ -7,6 +7,8 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */

+#define PTE_NONE 0L
+
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))

@@ -16,6 +18,18 @@
((unsigned long long)page_to_pfn(pte) <<
(unsigned long long) PAGE_SHIFT)));
}
+
+static inline int pmd_test_and_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+#ifdef CONFIG_X86_PAE
+ return cmpxchg8b( ((unsigned long long *)pmd), PTE_NONE, _PAGE_TABLE +
+ ((unsigned long long)page_to_pfn(pte) <<
+ (unsigned long long) PAGE_SHIFT) ) == PTE_NONE;
+#else
+ return cmpxchg( (unsigned long *)pmd, PTE_NONE, _PAGE_TABLE + (page_to_pfn(pte) << PAGE_SHIFT)) == PTE_NONE;
+#endif
+}
+
/*
* Allocate and free page tables.
*/
@@ -49,6 +63,7 @@
#define pmd_free(x) do { } while (0)
#define __pmd_free_tlb(tlb,x) do { } while (0)
#define pgd_populate(mm, pmd, pte) BUG()
+#define pgd_test_and_populate(mm, pmd, pte) ({ BUG(); 1; })

#define check_pgt_cache() do { } while (0)

Index: linux-2.6.9-rc1/mm/rmap.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/rmap.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/mm/rmap.c 2004-08-27 19:33:38.000000000 -0700
@@ -262,7 +262,7 @@
pte_t *pte;
int referenced = 0;

- if (!mm->rss)
+ if (!atomic_read(&mm->mm_rss))
goto out;
address = vma_address(page, vma);
if (address == -EFAULT)
@@ -291,7 +291,7 @@
if (mm != current->mm && has_swap_token(mm))
referenced++;

- if (mm->rss > mm->rlimit_rss)
+ if (atomic_read(&mm->mm_rss) > mm->rlimit_rss)
referenced = 0;

(*mapcount)--;
@@ -422,7 +422,10 @@
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the mm->page_table_lock if page
+ * is pointing to something that is known by the vm.
+ * The lock does not need to be held if page is pointing
+ * to a newly allocated page.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -503,7 +506,7 @@
pte_t pteval;
int ret = SWAP_AGAIN;

- if (!mm->rss)
+ if (!atomic_read(&mm->mm_rss))
goto out;
address = vma_address(page, vma);
if (address == -EFAULT)
@@ -564,11 +567,6 @@

/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);

if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
@@ -578,11 +576,16 @@
*/
BUG_ON(!PageSwapCache(page));
swap_duplicate(entry);
- set_pte(pte, swp_entry_to_pte(entry));
+ pteval = ptep_xchg_flush(vma, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- }
+ } else
+ pteval = ptep_clear_flush(vma, address, pte);

- mm->rss--;
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ atomic_dec(&mm->mm_rss);
page_remove_rmap(page);
page_cache_release(page);

@@ -670,11 +673,12 @@

/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);

/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
- set_pte(pte, pgoff_to_pte(page->index));
+ pteval = ptep_xchg_flush(vma, address, pte, pgoff_to_pte(page->index));
+ else
+ pteval = ptep_clear_flush(vma, address, pte);

/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -682,7 +686,7 @@

page_remove_rmap(page);
page_cache_release(page);
- mm->rss--;
+ atomic_dec(&mm->mm_rss);
(*mapcount)--;
}

@@ -781,7 +785,7 @@
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
continue;
cursor = (unsigned long) vma->vm_private_data;
- while (vma->vm_mm->rss &&
+ while (atomic_read(&vma->vm_mm->mm_rss) &&
cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
try_to_unmap_cluster(cursor, &mapcount, vma);
Index: linux-2.6.9-rc1/include/asm-i386/system.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/system.h 2004-08-24 00:01:51.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/system.h 2004-08-27 19:10:34.000000000 -0700
@@ -203,77 +203,6 @@
__set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
__set_64bit(ptr, ll_low(value), ll_high(value)) )

-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
-{
- switch (size) {
- case 1:
- __asm__ __volatile__("xchgb %b0,%1"
- :"=q" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 2:
- __asm__ __volatile__("xchgw %w0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 4:
- __asm__ __volatile__("xchgl %0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- }
- return x;
-}
-
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
-
-#ifdef CONFIG_X86_CMPXCHG
-#define __HAVE_ARCH_CMPXCHG 1
-#endif
-
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-#define cmpxchg(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
- (unsigned long)(n),sizeof(*(ptr))))
-
#ifdef __KERNEL__
struct alt_instr {
__u8 *instr; /* original instruction */
Index: linux-2.6.9-rc1/arch/i386/Kconfig
===================================================================
--- linux-2.6.9-rc1.orig/arch/i386/Kconfig 2004-08-27 19:09:43.000000000 -0700
+++ linux-2.6.9-rc1/arch/i386/Kconfig 2004-08-27 19:10:34.000000000 -0700
@@ -341,6 +341,11 @@
depends on !M386
default y

+config X86_CMPXCHG8B
+ bool
+ depends on !M386 && !M486
+ default y
+
config X86_XADD
bool
depends on !M386
Index: linux-2.6.9-rc1/include/asm-i386/processor.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-i386/processor.h 2004-08-27 19:09:56.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-i386/processor.h 2004-08-27 19:10:34.000000000 -0700
@@ -651,4 +651,138 @@

#define cache_line_size() (boot_cpu_data.x86_cache_alignment)

+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
+{
+ switch (size) {
+ case 1:
+ __asm__ __volatile__("xchgb %b0,%1"
+ :"=q" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 2:
+ __asm__ __volatile__("xchgw %w0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ case 4:
+ __asm__ __volatile__("xchgl %0,%1"
+ :"=r" (x)
+ :"m" (*__xg(ptr)), "0" (x)
+ :"memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#ifdef CONFIG_X86_CMPXCHG
+#define __HAVE_ARCH_CMPXCHG 1
+#endif
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+#ifndef CONFIG_X86_CMPXCHG
+ /*
+ * Check if the kernel was compiled for an old cpu but the
+ * currently running cpu can do cmpxchg after all
+ */
+ unsigned long flags;
+
+ /* All CPUs except 386 support CMPXCHG */
+ if (cpu_data->x86 > 3) goto have_cmpxchg;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ switch (size) {
+ case 1:
+ prev = * (u8 *)ptr;
+ if (prev == old) *(u8 *)ptr = new;
+ break;
+ case 2:
+ prev = * (u16 *)ptr;
+ if (prev == old) *(u16 *)ptr = new;
+ break;
+ case 4:
+ prev = *(u32 *)ptr;
+ if (prev == old) *(u32 *)ptr = new;
+ break;
+ }
+ local_irq_restore(flags);
+ return prev;
+have_cmpxchg:
+#endif
+ switch (size) {
+ case 1:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return prev;
+}
+
+static inline unsigned long long cmpxchg8b(volatile unsigned long long *ptr,
+ unsigned long long old, unsigned long long newv)
+{
+ unsigned long long prev;
+#ifndef CONFIG_X86_CMPXCHG8B
+ unsigned long flags;
+
+ /*
+ * Check if the kernel was compiled for an old cpu but
+ * we are running really on a cpu capable of cmpxchg8b
+ */
+
+ if (cpu_has(cpu_data,X86_FEATURE_CX8)) goto have_cmpxchg8b;
+
+ /* Poor man's cmpxchg8b for 386 and 486. Not suitable for SMP */
+ local_irq_save(flags);
+ prev = *ptr;
+ if (prev == old) *ptr = newv;
+ local_irq_restore(flags);
+ return prev;
+
+have_cmpxchg8b:
+#endif
+
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %4\n"
+ : "=A" (prev)
+ : "0" (old), "c" ((unsigned long)(newv >> 32)),
+ "b" ((unsigned long)(newv & 0xffffffffLL)), "m" (ptr)
+ : "memory");
+ return prev ;
+}
+
+#define cmpxchg(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
+ (unsigned long)(n),sizeof(*(ptr))))
+
#endif /* __ASM_I386_PROCESSOR_H */
Index: linux-2.6.9-rc1/include/asm-x86_64/pgalloc.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-x86_64/pgalloc.h 2004-08-24 00:02:47.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-x86_64/pgalloc.h 2004-08-27 19:10:34.000000000 -0700
@@ -7,16 +7,26 @@
#include <linux/threads.h>
#include <linux/mm.h>

+#define PMD_NONE 0
+#define PTE_NONE 0
+
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
#define pgd_populate(mm, pgd, pmd) \
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd)))
+#define pgd_test_and_populate(mm, pgd, pmd) \
+ (cmpxchg(pgd, PMD_NONE, _PAGE_TABLE | __pa(pmd)) == PMD_NONE)

static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
{
set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
}

+static inline int pmd_test_and_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+ return cmpxchg(pmd, PTE_NONE, _PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)) == PTE_NONE;
+}
+
extern __inline__ pmd_t *get_pmd(void)
{
return (pmd_t *)get_zeroed_page(GFP_KERNEL);
Index: linux-2.6.9-rc1/include/asm-x86_64/pgtable.h
===================================================================
--- linux-2.6.9-rc1.orig/include/asm-x86_64/pgtable.h 2004-08-24 00:03:19.000000000 -0700
+++ linux-2.6.9-rc1/include/asm-x86_64/pgtable.h 2004-08-27 19:10:35.000000000 -0700
@@ -102,6 +102,8 @@
((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK))

#define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0))
+#define ptep_xchg(xp,newval) __pte(xchg(&(xp)->pte, pte_val(newval)))
+#define ptep_cmpxchg(xp,oldval,newval) (cmpxchg(&(xp)->pte, pte_val(oldval), pte_val(newval)) == pte_val(oldval))
#define pte_same(a, b) ((a).pte == (b).pte)

#define PML4_SIZE (1UL << PML4_SHIFT)
@@ -442,6 +444,8 @@
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define __HAVE_ARCH_PTEP_MKDIRTY
#define __HAVE_ARCH_PTE_SAME
+#define __HAVE_ARCH_PTEP_XCHG
+#define __HAVE_ARCH_PTEP_CMPXCHG
#include <asm-generic/pgtable.h>

#endif /* _X86_64_PGTABLE_H */
Index: linux-2.6.9-rc1/mm/thrash.c
===================================================================
--- linux-2.6.9-rc1.orig/mm/thrash.c 2004-08-27 19:09:57.000000000 -0700
+++ linux-2.6.9-rc1/mm/thrash.c 2004-08-27 19:28:20.000000000 -0700
@@ -35,7 +35,7 @@
ret = SWAP_TOKEN_ENOUGH_RSS;
else if (time_after(jiffies, swap_token_timeout))
ret = SWAP_TOKEN_TIMED_OUT;
- else if (mm->rss > mm->rlimit_rss)
+ else if (atomic_read(&mm->mm_rss) > mm->rlimit_rss)
ret = SWAP_TOKEN_ENOUGH_RSS;
mm->recent_pagein = 0;
return ret;
@@ -61,7 +61,7 @@
if (time_after(jiffies, swap_token_check)) {

/* Can't get swapout protection if we exceed our RSS limit. */
- if (current->mm->rss > current->mm->rlimit_rss)
+ if (atomic_read(&current->mm->mm_rss) > current->mm->rlimit_rss)
return;

/* ... or if we recently held the token. */