[PATCH] [RFC] [CFT] rmap 15a for 2.4.20-rc2-ac3

From: Paul P Komkoff Jr (i@stingr.net)
Date: Tue Nov 26 2002 - 11:31:01 EST


The subject clearly describes the patch.

The only thing I'm unsure about is the current->mm->... handling in
kernel/sched.c.

Enjoy.
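
For context: the kernel/sched.c hunk below just timestamps current->mm->last_run
on every pass through schedule(), and mm/rmap.c uses that timestamp in
page_referenced() so that RSS limits are only honoured for processes that have
actually run recently. A minimal sketch of the predicate, with
mm_recently_active() being a made-up name used purely for illustration:

    /* Hypothetical helper, for illustration only; not part of the patch. */
    static inline int mm_recently_active(struct mm_struct *mm)
    {
            /* true if this mm was scheduled within the last 5 seconds */
            return time_before(jiffies, mm->last_run + 5 * HZ);
    }

With that, a shared page is only considered within its RSS limit if at least
one process mapping it is under its limit and at least one has run recently;
pages belonging only to long-idle processes become eligible for eviction.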

diff -Nru a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c Tue Nov 26 19:23:38 2002
+++ b/fs/buffer.c Tue Nov 26 19:23:38 2002
@@ -2915,6 +2915,30 @@
         }
 }
 
+
+/*
+ * Move buffers whose IO has completed off the BUF_LOCKED list.
+ */
+void do_io_postprocessing(void)
+{
+ int i;
+ struct buffer_head *bh, *next;
+
+ spin_lock(&lru_list_lock);
+ bh = lru_list[BUF_LOCKED];
+ if (bh) {
+ for (i = nr_buffers_type[BUF_LOCKED]; i-- > 0; bh = next) {
+ next = bh->b_next_free;
+
+ if (!buffer_locked(bh))
+ __refile_buffer(bh);
+ else
+ break;
+ }
+ }
+ spin_unlock(&lru_list_lock);
+}
+
 /*
  * This is the kernel update daemon. It was used to live in userspace
  * but since it's need to run safely we want it unkillable by mistake.
@@ -2966,6 +2990,7 @@
 #ifdef DEBUG
                 printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
+ do_io_postprocessing();
                 sync_old_buffers();
                 run_task_queue(&tq_disk);
         }
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c Tue Nov 26 19:23:38 2002
+++ b/fs/proc/proc_misc.c Tue Nov 26 19:23:38 2002
@@ -191,7 +191,10 @@
                 "Cached: %8lu kB\n"
                 "SwapCached: %8lu kB\n"
                 "Active: %8u kB\n"
+ "ActiveAnon: %8u kB\n"
+ "ActiveCache: %8u kB\n"
                 "Inact_dirty: %8u kB\n"
+ "Inact_laundry:%8u kB\n"
                 "Inact_clean: %8u kB\n"
                 "Inact_target: %8u kB\n"
                 "HighTotal: %8lu kB\n"
@@ -207,9 +210,12 @@
                 K(i.bufferram),
                 K(pg_size - swapper_space.nrpages),
                 K(swapper_space.nrpages),
- K(nr_active_pages),
- K(nr_inactive_dirty_pages),
- K(nr_inactive_clean_pages),
+ K(nr_active_anon_pages()) + K(nr_active_cache_pages()),
+ K(nr_active_anon_pages()),
+ K(nr_active_cache_pages()),
+ K(nr_inactive_dirty_pages()),
+ K(nr_inactive_laundry_pages()),
+ K(nr_inactive_clean_pages()),
                 K(inactive_target()),
                 K(i.totalhigh),
                 K(i.freehigh),
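
With the change above, /proc/meminfo grows ActiveAnon, ActiveCache and
Inact_laundry lines, and Active is reported as the sum of the two active
lists. A hypothetical excerpt (the numbers are invented, purely to show the
new fields):

    Active:      23456 kB
    ActiveAnon:  12340 kB
    ActiveCache: 11116 kB
    Inact_dirty:  4096 kB
    Inact_laundry:  512 kB
    Inact_clean:  1024 kB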
diff -Nru a/include/linux/brlock.h b/include/linux/brlock.h
--- a/include/linux/brlock.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/brlock.h Tue Nov 26 19:23:38 2002
@@ -37,6 +37,8 @@
         BR_GLOBALIRQ_LOCK,
         BR_NETPROTO_LOCK,
         BR_LLC_LOCK,
+ BR_LRU_LOCK,
+
         __BR_END
 };
 
diff -Nru a/include/linux/list.h b/include/linux/list.h
--- a/include/linux/list.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/list.h Tue Nov 26 19:23:38 2002
@@ -137,8 +137,7 @@
         return head->next == head;
 }
 
-static inline void __list_splice(struct list_head *list,
- struct list_head *head)
+static inline void __list_splice(struct list_head *list, struct list_head *head)
 {
         struct list_head *first = list->next;
         struct list_head *last = list->prev;
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/mm.h Tue Nov 26 19:23:38 2002
@@ -1,5 +1,23 @@
 #ifndef _LINUX_MM_H
 #define _LINUX_MM_H
+/*
+ * Copyright (c) 2002. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors:
+ * Linus Torvalds
+ * Stephen Tweedie
+ * Andrea Arcangeli
+ * Rik van Riel
+ * Arjan van de Ven
+ * and others
+ */
 
 #include <linux/sched.h>
 #include <linux/errno.h>
@@ -168,7 +186,7 @@
         unsigned long flags; /* atomic flags, some possibly
                                            updated asynchronously */
         struct list_head lru; /* Pageout list, eg. active_list;
- protected by pagemap_lru_lock !! */
+ protected by the lru lock !! */
         unsigned char age; /* Page aging counter. */
         struct pte_chain * pte_chain; /* Reverse pte mapping pointer.
                                          * protected by PG_chainlock
@@ -279,7 +297,7 @@
  *
  * Note that the referenced bit, the page->lru list_head and the
  * active, inactive_dirty and inactive_clean lists are protected by
- * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
+ * the lru lock, and *NOT* by the usual PG_locked bit!
  *
  * PG_skip is used on sparc/sparc64 architectures to "skip" certain
  * parts of the address space.
@@ -300,18 +318,21 @@
 #define PG_referenced 2
 #define PG_uptodate 3
 #define PG_dirty 4
-#define PG_inactive_clean 5
-#define PG_active 6
+#define PG_active_anon 5
 #define PG_inactive_dirty 7
-#define PG_slab 8
-#define PG_skip 10
-#define PG_highmem 11
-#define PG_checked 12 /* kill me in 2.5.<early>. */
-#define PG_arch_1 13
-#define PG_reserved 14
-#define PG_launder 15 /* written out by VM pressure.. */
-#define PG_chainlock 16 /* lock bit for ->pte_chain */
-#define PG_lru 17
+#define PG_inactive_laundry 8
+#define PG_inactive_clean 9
+#define PG_slab 10
+#define PG_skip 11
+#define PG_highmem 12
+#define PG_checked 13 /* kill me in 2.5.<early>. */
+#define PG_arch_1 14
+#define PG_reserved 15
+#define PG_launder 16 /* written out by VM pressure.. */
+#define PG_chainlock 17 /* lock bit for ->pte_chain */
+#define PG_lru 18
+#define PG_active_cache 19
+
 /* Don't you dare to use high bits, they seem to be used for something else! */
 
 
@@ -429,11 +450,21 @@
 #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
 #define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
 
-#define PageActive(page) test_bit(PG_active, &(page)->flags)
-#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
-#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags)
-#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
-#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
+#define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags)
+#define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags)
+#define ClearPageActiveAnon(page) clear_bit(PG_active_anon, &(page)->flags)
+#define TestandSetPageActiveAnon(page) test_and_set_bit(PG_active_anon, &(page)->flags)
+#define TestandClearPageActiveAnon(page) test_and_clear_bit(PG_active_anon, &(page)->flags)
+
+#define PageActiveCache(page) test_bit(PG_active_cache, &(page)->flags)
+#define SetPageActiveCache(page) set_bit(PG_active_cache, &(page)->flags)
+#define ClearPageActiveCache(page) clear_bit(PG_active_cache, &(page)->flags)
+#define TestandSetPageActiveCache(page) test_and_set_bit(PG_active_cache, &(page)->flags)
+#define TestandClearPageActiveCache(page) test_and_clear_bit(PG_active_cache, &(page)->flags)
+
+#define PageInactiveLaundry(page) test_bit(PG_inactive_laundry, &(page)->flags)
+#define SetPageInactiveLaundry(page) set_bit(PG_inactive_laundry, &(page)->flags)
+#define ClearPageInactiveLaundry(page) clear_bit(PG_inactive_laundry, &(page)->flags)
 
 #define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags)
 #define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags)
diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h
--- a/include/linux/mm_inline.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/mm_inline.h Tue Nov 26 19:23:38 2002
@@ -2,23 +2,125 @@
 #define _LINUX_MM_INLINE_H
 
 #include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/brlock.h>
+
+
+/*
+ * Copyright (c) 2002. All rights reserved.
+ *
+ * This software may be freely redistributed under the terms of the
+ * GNU General Public License.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Authors:
+ * Linus Torvalds
+ * Stephen Tweedie
+ * Andrea Arcangeli
+ * Rik van Riel
+ * Arjan van de Ven
+ * and others
+ */
+
+GPL_HEADER()
+
+extern unsigned char active_age_bias;
 
 /*
  * These inline functions tend to need bits and pieces of all the
  * other VM include files, meaning they cannot be defined inside
  * one of the other VM include files.
+ *
+ */
+
+/**
+ * page_dirty - do we need to write the data out to disk
+ * @page: page to test
+ *
+ * Returns true if the page contains data which needs to
+ * be written to disk. Doesn't test the page tables (yet?).
+ */
+static inline int page_dirty(struct page *page)
+{
+ struct buffer_head *tmp, *bh;
+
+ if (PageDirty(page))
+ return 1;
+
+ if (page->mapping && !page->buffers)
+ return 0;
+
+ tmp = bh = page->buffers;
+
+ do {
+ if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
+ return 1;
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
+
+ return 0;
+}
+
+/**
+ * page_anon - is this page ram/swap backed?
+ * @page: page to test
  *
- * The include file mess really needs to be cleaned up...
+ * Returns 1 if the page is backed by ram/swap, 0 if the page is
+ * backed by a file in a filesystem on permanent storage.
  */
+static inline int page_anon(struct page * page)
+{
+ /* Pages of an mmap()d file won't trigger this unless they get
+ * referenced on the inactive list and really are in the working
+ * set of the process... */
+ if (page->pte_chain)
+ return 1;
+
+ if (!page->mapping && !page->buffers)
+ return 1;
+
+ if (PageSwapCache(page))
+ return 1;
+
+ if (!page->mapping->a_ops->writepage)
+ return 1;
 
-static inline void add_page_to_active_list(struct page * page)
+ /* TODO: ramfs, tmpfs shm segments and ramdisk */
+
+ return 0;
+}
+
+
+
+static inline void add_page_to_active_anon_list(struct page * page, int age)
 {
         struct zone_struct * zone = page_zone(page);
         DEBUG_LRU_PAGE(page);
- SetPageActive(page);
- list_add(&page->lru, &zone->active_list);
- zone->active_pages++;
- nr_active_pages++;
+ SetPageActiveAnon(page);
+ list_add(&page->lru, &zone->active_anon_list[age]);
+ page->age = age + active_age_bias;
+ zone->active_anon_pages++;
+}
+
+static inline void add_page_to_active_cache_list(struct page * page, int age)
+{
+ struct zone_struct * zone = page_zone(page);
+ DEBUG_LRU_PAGE(page);
+ SetPageActiveCache(page);
+ list_add(&page->lru, &zone->active_cache_list[age]);
+ page->age = age + active_age_bias;
+ zone->active_cache_pages++;
+}
+
+static inline void add_page_to_active_list(struct page * page, int age)
+{
+ if (page_anon(page))
+ add_page_to_active_anon_list(page, age);
+ else
+ add_page_to_active_cache_list(page, age);
 }
 
 static inline void add_page_to_inactive_dirty_list(struct page * page)
@@ -28,7 +130,15 @@
         SetPageInactiveDirty(page);
         list_add(&page->lru, &zone->inactive_dirty_list);
         zone->inactive_dirty_pages++;
- nr_inactive_dirty_pages++;
+}
+
+static inline void add_page_to_inactive_laundry_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ DEBUG_LRU_PAGE(page);
+ SetPageInactiveLaundry(page);
+ list_add(&page->lru, &zone->inactive_laundry_list);
+ zone->inactive_laundry_pages++;
 }
 
 static inline void add_page_to_inactive_clean_list(struct page * page)
@@ -38,16 +148,31 @@
         SetPageInactiveClean(page);
         list_add(&page->lru, &zone->inactive_clean_list);
         zone->inactive_clean_pages++;
- nr_inactive_clean_pages++;
 }
 
-static inline void del_page_from_active_list(struct page * page)
+static inline void del_page_from_active_anon_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ unsigned char age;
+ list_del(&page->lru);
+ ClearPageActiveAnon(page);
+ zone->active_anon_pages--;
+ age = page->age - active_age_bias;
+ if (age<=MAX_AGE)
+ zone->active_anon_count[age]--;
+ DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_active_cache_list(struct page * page)
 {
         struct zone_struct * zone = page_zone(page);
+ unsigned char age;
         list_del(&page->lru);
- ClearPageActive(page);
- nr_active_pages--;
- zone->active_pages--;
+ ClearPageActiveCache(page);
+ zone->active_cache_pages--;
+ age = page->age - active_age_bias;
+ if (age<=MAX_AGE)
+ zone->active_cache_count[age]--;
         DEBUG_LRU_PAGE(page);
 }
 
@@ -56,18 +181,25 @@
         struct zone_struct * zone = page_zone(page);
         list_del(&page->lru);
         ClearPageInactiveDirty(page);
- nr_inactive_dirty_pages--;
         zone->inactive_dirty_pages--;
         DEBUG_LRU_PAGE(page);
 }
 
+static inline void del_page_from_inactive_laundry_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ list_del(&page->lru);
+ ClearPageInactiveLaundry(page);
+ zone->inactive_laundry_pages--;
+ DEBUG_LRU_PAGE(page);
+}
+
 static inline void del_page_from_inactive_clean_list(struct page * page)
 {
         struct zone_struct * zone = page_zone(page);
         list_del(&page->lru);
         ClearPageInactiveClean(page);
         zone->inactive_clean_pages--;
- nr_inactive_clean_pages--;
         DEBUG_LRU_PAGE(page);
 }
 
@@ -184,7 +316,8 @@
 {
         int inactive, target, inactive_base;
 
- inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+ inactive_base = zone->active_anon_pages + zone->active_cache_pages
+ + zone->inactive_dirty_pages;
         inactive_base /= INACTIVE_FACTOR;
 
         /* GCC should optimise this away completely. */
@@ -253,7 +386,13 @@
  */
 static inline int inactive_high(struct zone_struct * zone)
 {
- return inactive_limit(zone, VM_HIGH);
+ unsigned long active, inactive;
+ active = zone->active_anon_pages + zone->active_cache_pages
+ + zone->free_pages;
+ inactive = zone->inactive_dirty_pages + zone->inactive_clean_pages + zone->inactive_laundry_pages;
+ if (inactive * 5 > (active+inactive))
+ return -1;
+ return 1;
 }
 
 /*
@@ -263,12 +402,33 @@
 {
         int target;
 
- target = nr_active_pages + nr_inactive_dirty_pages
- + nr_inactive_clean_pages;
+ target = nr_active_anon_pages() + nr_active_cache_pages()
+ + nr_inactive_dirty_pages() + nr_inactive_clean_pages()
+ + nr_inactive_laundry_pages();
 
         target /= INACTIVE_FACTOR;
 
         return target;
+}
+
+static inline void lru_lock(struct zone_struct *zone)
+{
+ if (zone) {
+ br_read_lock(BR_LRU_LOCK);
+ spin_lock(&zone->lru_lock);
+ } else {
+ br_write_lock(BR_LRU_LOCK);
+ }
+}
+
+static inline void lru_unlock(struct zone_struct *zone)
+{
+ if (zone) {
+ spin_unlock(&zone->lru_lock);
+ br_read_unlock(BR_LRU_LOCK);
+ } else {
+ br_write_unlock(BR_LRU_LOCK);
+ }
 }
 
 #endif /* _LINUX_MM_INLINE_H */
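
The lru_lock()/lru_unlock() helpers above replace the single pagemap_lru_lock
with a per-zone spinlock plus a big-reader lock (the BR_LRU_LOCK added to
brlock.h earlier). Code that touches one zone takes the brlock for read plus
that zone's lru_lock; code that must exclude everyone passes ALL_ZONES and
takes the brlock for write. A rough usage sketch, assuming ALL_ZONES is a NULL
zone pointer (its definition is not in the hunks quoted here):

    struct zone_struct *zone = page_zone(page);

    lru_lock(zone);        /* br_read_lock(BR_LRU_LOCK) + zone->lru_lock */
    /* ... move page->lru between this zone's LRU lists ... */
    lru_unlock(zone);

    lru_lock(ALL_ZONES);   /* br_write_lock(BR_LRU_LOCK): excludes all per-zone holders */
    /* ... walk pages that may belong to any zone (e.g. truncate paths) ... */
    lru_unlock(ALL_ZONES);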
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/mmzone.h Tue Nov 26 19:23:38 2002
@@ -13,11 +13,7 @@
  * Free memory management - zoned buddy allocator.
  */
 
-#ifndef CONFIG_FORCE_MAX_ZONEORDER
 #define MAX_ORDER 10
-#else
-#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
-#endif
 
 typedef struct free_area_struct {
         struct list_head free_list;
@@ -29,6 +25,9 @@
 
 #define MAX_CHUNKS_PER_NODE 8
 
+#define MAX_AGE 15
+#define INITIAL_AGE 3
+
 #define MAX_PER_CPU_PAGES 512
 typedef struct per_cpu_pages_s {
         int nr_pages, max_nr_pages;
@@ -50,19 +49,27 @@
         per_cpu_t cpu_pages[NR_CPUS];
         spinlock_t lock;
         unsigned long free_pages;
- unsigned long active_pages;
+ unsigned long active_anon_pages;
+ unsigned long active_cache_pages;
         unsigned long inactive_dirty_pages;
+ unsigned long inactive_laundry_pages;
         unsigned long inactive_clean_pages;
         unsigned long pages_min, pages_low, pages_high, pages_plenty;
         int need_balance;
+ int need_scan;
+ int active_anon_count[MAX_AGE+1];
+ int active_cache_count[MAX_AGE+1];
 
         /*
          * free areas of different sizes
          */
- struct list_head active_list;
+ struct list_head active_anon_list[MAX_AGE+1];
+ struct list_head active_cache_list[MAX_AGE+1];
         struct list_head inactive_dirty_list;
+ struct list_head inactive_laundry_list;
         struct list_head inactive_clean_list;
         free_area_t free_area[MAX_ORDER];
+ spinlock_t lru_lock;
 
         /*
          * wait_table -- the array holding the hash table
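
The per-zone active_anon_list[]/active_cache_list[] arrays above turn page
aging into MAX_AGE+1 buckets per list instead of a counter that has to be
updated on every page at scan time. A page records its bucket as
page->age = bucket + active_age_bias; when kachunk_anon()/kachunk_cache() in
the mm/vmscan.c part of the patch shift every bucket down one slot, they just
increment active_age_bias, so page->age - active_age_bias still points at the
page's bucket without touching each page. A small worked example of the
intended arithmetic for a single list (numbers invented):

    /*
     * active_age_bias == 7, page added at age 3:
     *     page goes on active_anon_list[3], page->age = 3 + 7 = 10
     * kachunk_anon() runs (list[0] was empty): every list shifts
     * down one slot and active_age_bias becomes 8:
     *     the page now sits on active_anon_list[2],
     *     and page->age - active_age_bias = 10 - 8 = 2 still matches.
     */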
diff -Nru a/include/linux/module.h b/include/linux/module.h
--- a/include/linux/module.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/module.h Tue Nov 26 19:23:38 2002
@@ -287,6 +287,9 @@
 static const char __module_license[] __attribute__((section(".modinfo"))) = \
 "license=" license
 
+#define GPL_HEADER() \
+static const char cpyright="This software may be freely redistributed under the terms of the GNU General Public License.";
+
 /* Define the module variable, and usage macros. */
 extern struct module __this_module;
 
@@ -302,7 +305,6 @@
 static const char __module_using_checksums[] __attribute__((section(".modinfo"))) =
 "using_checksums=1";
 #endif
-
 #else /* MODULE */
 
 #define MODULE_AUTHOR(name)
@@ -311,6 +313,7 @@
 #define MODULE_SUPPORTED_DEVICE(name)
 #define MODULE_PARM(var,type)
 #define MODULE_PARM_DESC(var,desc)
+#define GPL_HEADER()
 
 /* Create a dummy reference to the table to suppress gcc unused warnings. Put
  * the reference in the .data.exit section which is discarded when code is built
diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/pagemap.h Tue Nov 26 19:23:38 2002
@@ -70,10 +70,6 @@
 
 #define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
 
-extern struct page * __find_get_page(struct address_space *mapping,
- unsigned long index, struct page **hash);
-#define find_get_page(mapping, index) \
- __find_get_page(mapping, index, page_hash(mapping, index))
 extern struct page * __find_lock_page (struct address_space * mapping,
                                 unsigned long index, struct page **hash);
 extern struct page * find_or_create_page(struct address_space *mapping,
@@ -90,6 +86,15 @@
 extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);
 
 extern void ___wait_on_page(struct page *);
+extern int wait_on_page_timeout(struct page *page, int timeout);
+
+
+extern struct page * __find_pagecache_page(struct address_space *mapping,
+ unsigned long index, struct page **hash);
+#define find_pagecache_page(mapping, index) \
+ __find_pagecache_page(mapping, index, page_hash(mapping, index))
+#define find_get_page(mapping, index) \
+ __find_pagecache_page(mapping, index, page_hash(mapping, index))
 
 static inline void wait_on_page(struct page * page)
 {
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/sched.h Tue Nov 26 19:23:38 2002
@@ -260,6 +260,7 @@
         unsigned long def_flags;
         unsigned long cpu_vm_mask;
         unsigned long rlimit_rss;
+ unsigned long last_run;
 
         unsigned dumpable:1;
 
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h Tue Nov 26 19:23:38 2002
+++ b/include/linux/swap.h Tue Nov 26 19:23:38 2002
@@ -85,9 +85,11 @@
 
 extern unsigned int nr_free_pages(void);
 extern unsigned int nr_free_buffer_pages(void);
-extern int nr_active_pages;
-extern int nr_inactive_dirty_pages;
-extern int nr_inactive_clean_pages;
+extern unsigned int nr_active_anon_pages(void);
+extern unsigned int nr_active_cache_pages(void);
+extern unsigned int nr_inactive_dirty_pages(void);
+extern unsigned int nr_inactive_laundry_pages(void);
+extern unsigned int nr_inactive_clean_pages(void);
 extern atomic_t page_cache_size;
 extern atomic_t buffermem_pages;
 extern spinlock_cacheline_t pagecache_lock_cacheline;
@@ -115,6 +117,7 @@
 
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
+extern void FASTCALL(lru_cache_add_dirty(struct page *));
 extern void FASTCALL(__lru_cache_del(struct page *));
 extern void FASTCALL(lru_cache_del(struct page *));
 
@@ -130,6 +133,7 @@
 extern wait_queue_head_t kswapd_wait;
 extern struct page * FASTCALL(reclaim_page(zone_t *));
 extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask));
+extern int rebalance_laundry_zone(struct zone_struct *, int, unsigned int);
 extern void wakeup_kswapd(unsigned int);
 extern void rss_free_pages(unsigned int);
 
@@ -175,8 +179,6 @@
 asmlinkage long sys_swapoff(const char *);
 asmlinkage long sys_swapon(const char *, int);
 
-extern spinlock_cacheline_t pagemap_lru_lock_cacheline;
-#define pagemap_lru_lock pagemap_lru_lock_cacheline.lock
 
 extern void FASTCALL(mark_page_accessed(struct page *));
 
@@ -191,13 +193,17 @@
 
 /*
  * List add/del helper macros. These must be called
- * with the pagemap_lru_lock held!
+ * with the lru lock held!
  */
 #define DEBUG_LRU_PAGE(page) \
 do { \
- if (PageActive(page)) \
+ if (PageActiveAnon(page)) \
+ BUG(); \
+ if (PageActiveCache(page)) \
                 BUG(); \
         if (PageInactiveDirty(page)) \
+ BUG(); \
+ if (PageInactiveLaundry(page)) \
                 BUG(); \
         if (PageInactiveClean(page)) \
                 BUG(); \
diff -Nru a/kernel/ksyms.c b/kernel/ksyms.c
--- a/kernel/ksyms.c Tue Nov 26 19:23:38 2002
+++ b/kernel/ksyms.c Tue Nov 26 19:23:38 2002
@@ -262,7 +262,6 @@
 EXPORT_SYMBOL(__pollwait);
 EXPORT_SYMBOL(poll_freewait);
 EXPORT_SYMBOL(ROOT_DEV);
-EXPORT_SYMBOL(__find_get_page);
 EXPORT_SYMBOL(__find_lock_page);
 EXPORT_SYMBOL(find_or_create_page);
 EXPORT_SYMBOL(grab_cache_page_nowait);
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Tue Nov 26 19:23:38 2002
+++ b/kernel/sched.c Tue Nov 26 19:23:38 2002
@@ -971,6 +971,8 @@
         reacquire_kernel_lock(current);
         if (need_resched())
                 goto need_resched;
+ if (current->mm)
+ current->mm->last_run = jiffies;
 }
 
 /*
diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c Tue Nov 26 19:23:38 2002
+++ b/mm/filemap.c Tue Nov 26 19:23:38 2002
@@ -55,15 +55,14 @@
 
 spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
 /*
- * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
+ * NOTE: to avoid deadlocking you must never acquire the lru lock
  * with the pagecache_lock held.
  *
  * Ordering:
  * swap_lock ->
- * pagemap_lru_lock ->
+ * lru lock ->
  * pagecache_lock
  */
-spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
 
 #define CLUSTER_PAGES (1 << page_cluster)
 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
@@ -183,7 +182,7 @@
 
         head = &inode->i_mapping->clean_pages;
 
- spin_lock(&pagemap_lru_lock);
+ lru_lock(ALL_ZONES);
         spin_lock(&pagecache_lock);
         curr = head->next;
 
@@ -216,7 +215,7 @@
         }
 
         spin_unlock(&pagecache_lock);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(ALL_ZONES);
 }
 
 static int do_flushpage(struct page *page, unsigned long offset)
@@ -880,6 +879,32 @@
                 wake_up_all(waitqueue);
 }
 
+
+/* Like wait_on_page(), but with a timeout (in jiffies).
+ * Returns nonzero if the page is still locked when the timeout expires.
+ */
+int wait_on_page_timeout(struct page *page, int timeout)
+{
+ wait_queue_head_t *waitqueue = page_waitqueue(page);
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ if (!PageLocked(page))
+ return 0;
+
+ add_wait_queue(waitqueue, &wait);
+ do {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!PageLocked(page))
+ break;
+ sync_page(page);
+ timeout = schedule_timeout(timeout);
+ } while (PageLocked(page) && timeout);
+ __set_task_state(tsk, TASK_RUNNING);
+ remove_wait_queue(waitqueue, &wait);
+ return PageLocked(page);
+}
+
 /*
  * Get a lock on the page, assuming we need to sleep
  * to get it..
@@ -914,26 +939,6 @@
                 __lock_page(page);
 }
 
-/*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
- */
-struct page * __find_get_page(struct address_space *mapping,
- unsigned long offset, struct page **hash)
-{
- struct page *page;
-
- /*
- * We scan the hash list read-only. Addition to and removal from
- * the hash-list needs a held write-lock.
- */
- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, offset, *hash);
- if (page)
- page_cache_get(page);
- spin_unlock(&pagecache_lock);
- return page;
-}
 
 /*
  * Same as above, but trylock it instead of incrementing the count.
@@ -1069,19 +1074,42 @@
          * been increased since the last time we were called, we
          * stop when the page isn't there.
          */
- spin_lock(&pagemap_lru_lock);
+ lru_lock(ALL_ZONES);
         while (--index >= start) {
                 struct page **hash = page_hash(mapping, index);
                 spin_lock(&pagecache_lock);
                 page = __find_page_nolock(mapping, index, *hash);
                 spin_unlock(&pagecache_lock);
- if (!page || !PageActive(page))
+ if (!page || !PageActiveCache(page))
                         break;
                 drop_page(page);
         }
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(ALL_ZONES);
+}
+
+/*
+ * Look up a page in the pagecache and return that page with
+ * a reference held
+ */
+struct page * __find_pagecache_page(struct address_space *mapping,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, offset, *hash);
+ if (page)
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ return page;
 }
 
+EXPORT_SYMBOL_GPL(__find_pagecache_page);
+
 /* Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed. This routine should
@@ -1092,7 +1120,7 @@
         struct page *page, **hash;
 
         hash = page_hash(mapping, index);
- page = __find_get_page(mapping, index, hash);
+ page = __find_pagecache_page(mapping, index, hash);
 
         if ( page ) {
                 if ( !TryLockPage(page) ) {
@@ -1378,7 +1406,7 @@
         /* Mark the page referenced, AFTER checking for previous usage.. */
         SetPageReferenced(page);
 
- if (unlikely(PageInactiveClean(page))) {
+ if (unlikely(PageInactiveClean(page) || PageInactiveLaundry(page))) {
                 struct zone_struct *zone = page_zone(page);
                 int free = zone->free_pages + zone->inactive_clean_pages;
 
@@ -1899,7 +1927,7 @@
                 nr = max;
 
         /* And limit it to a sane percentage of the inactive list.. */
- max = nr_inactive_clean_pages / 2;
+ max = (nr_inactive_clean_pages() + nr_inactive_laundry_pages()) / 2;
         if (nr > max)
                 nr = max;
 
@@ -2022,7 +2050,7 @@
          */
         hash = page_hash(mapping, pgoff);
 retry_find:
- page = __find_get_page(mapping, pgoff, hash);
+ page = __find_pagecache_page(mapping, pgoff, hash);
         if (!page)
                 goto no_cached_page;
 
@@ -2885,7 +2913,7 @@
         struct page *page, *cached_page = NULL;
         int err;
 repeat:
- page = __find_get_page(mapping, index, hash);
+ page = __find_pagecache_page(mapping, index, hash);
         if (!page) {
                 if (!cached_page) {
                         cached_page = page_cache_alloc(mapping);
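
wait_on_page_timeout() above behaves like wait_on_page() but gives up after
the given number of jiffies, returning nonzero if the page is still locked.
It is presumably there for the laundry handling (rebalance_laundry_zone() is
declared in swap.h but its body is not among the quoted hunks), so that
reclaim can wait briefly for in-flight IO without blocking forever. A hedged
sketch of the calling pattern, with an invented timeout value:

    if (wait_on_page_timeout(page, HZ / 10)) {
            /* still locked after ~100ms: IO not finished,
             * leave the page on the inactive_laundry list */
    } else {
            /* page is unlocked: any IO has completed */
    }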
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c Tue Nov 26 19:23:38 2002
+++ b/mm/page_alloc.c Tue Nov 26 19:23:38 2002
@@ -26,9 +26,6 @@
 #include <linux/smp.h>
 
 int nr_swap_pages;
-int nr_active_pages;
-int nr_inactive_dirty_pages;
-int nr_inactive_clean_pages;
 pg_data_t *pgdat_list;
 
 /* Used to look up the address of the struct zone encoded in page->zone */
@@ -109,16 +106,19 @@
                 BUG();
         if (PageLocked(page))
                 BUG();
- if (PageActive(page))
+ if (PageActiveAnon(page))
+ BUG();
+ if (PageActiveCache(page))
                 BUG();
         if (PageInactiveDirty(page))
                 BUG();
+ if (PageInactiveLaundry(page))
+ BUG();
         if (PageInactiveClean(page))
                 BUG();
         if (page->pte_chain)
                 BUG();
         page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
- page->age = PAGE_AGE_START;
         
         zone = page_zone(page);
 
@@ -562,7 +562,9 @@
          */
 defragment:
         {
- int freed = 0;
+ int try_harder = 0;
+ unsigned int mask = 0;
+ int numpages;
 defragment_again:
                 zone = zonelist->zones;
                 for (;;) {
@@ -571,6 +573,22 @@
                                 break;
                         if (!z->size)
                                 continue;
+
+ /*
+ * Try to free the zone's inactive laundry pages.
+ * Nonblocking in the first pass; blocking in the
+ * second pass, but never on very new IO.
+ */
+ numpages = z->inactive_laundry_pages;
+ if (try_harder) {
+ numpages /= 2;
+ mask = gfp_mask;
+ }
+
+ current->flags |= PF_MEMALLOC;
+ rebalance_laundry_zone(z, numpages, mask);
+ current->flags &= ~PF_MEMALLOC;
+
                         while (z->inactive_clean_pages) {
                                 struct page * page;
                                 /* Move one page to the free list. */
@@ -585,12 +603,9 @@
                         }
                 }
 
- /* XXX: do real defragmentation instead of calling launder ? */
- if (!freed & !(current->flags & PF_MEMALLOC)) {
- freed = 1;
- current->flags |= PF_MEMALLOC;
- try_to_free_pages(gfp_mask);
- current->flags &= ~PF_MEMALLOC;
+ /* If we can wait for IO to complete, we wait... */
+ if (!try_harder && (gfp_mask & __GFP_FS)) {
+ try_harder = 1;
                         goto defragment_again;
                 }
         }
@@ -641,19 +656,29 @@
 }
 
 /*
- * Total amount of free (allocatable) RAM:
- */
-unsigned int nr_free_pages (void)
-{
- unsigned int sum;
- zone_t *zone;
-
- sum = 0;
- for_each_zone(zone)
- sum += zone->free_pages;
-
- return sum;
-}
+ * These statistics are held in per-zone counters, so we need to loop
+ * over each zone and read the statistics. We use this silly macro
+ * so we don't need to duplicate the code for every statistic.
+ * If you have a better idea on how to implement this (cut'n'paste
+ * isn't considered better), please let me know - Rik
+ */
+#define NR_FOO_PAGES(__function_name, __stat) \
+ unsigned int __function_name (void) \
+ { \
+ unsigned int sum = 0; \
+ zone_t *zone; \
+ \
+ for_each_zone(zone) \
+ sum += zone->__stat; \
+ return sum; \
+ }
+
+NR_FOO_PAGES(nr_free_pages, free_pages)
+NR_FOO_PAGES(nr_active_anon_pages, active_anon_pages)
+NR_FOO_PAGES(nr_active_cache_pages, active_cache_pages)
+NR_FOO_PAGES(nr_inactive_dirty_pages, inactive_dirty_pages)
+NR_FOO_PAGES(nr_inactive_laundry_pages, inactive_laundry_pages)
+NR_FOO_PAGES(nr_inactive_clean_pages, inactive_clean_pages)
 
 /*
  * Amount of free RAM allocatable as buffer memory:
@@ -671,6 +696,7 @@
                 for (zone = *zonep++; zone; zone = *zonep++) {
                         sum += zone->free_pages;
                         sum += zone->inactive_clean_pages;
+ sum += zone->inactive_laundry_pages;
                         sum += zone->inactive_dirty_pages;
                 }
 
@@ -728,10 +754,11 @@
                 nr_free_pages() << (PAGE_SHIFT-10),
                 nr_free_highpages() << (PAGE_SHIFT-10));
 
- printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d )\n",
- nr_active_pages,
- nr_inactive_dirty_pages,
- nr_inactive_clean_pages,
+ printk("( Active: %d, inactive_dirty: %d, inactive_laundry: %d, inactive_clean: %d, free: %d )\n",
+ nr_active_anon_pages() + nr_active_cache_pages(),
+ nr_inactive_dirty_pages(),
+ nr_inactive_laundry_pages(),
+ nr_inactive_clean_pages(),
                 nr_free_pages());
 
         for (type = 0; type < MAX_NR_ZONES; type++) {
@@ -941,12 +968,25 @@
                 zone->lock = SPIN_LOCK_UNLOCKED;
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
+ zone->active_anon_pages = 0;
+ zone->active_cache_pages = 0;
                 zone->inactive_clean_pages = 0;
+ zone->inactive_laundry_pages = 0;
                 zone->inactive_dirty_pages = 0;
                 zone->need_balance = 0;
- INIT_LIST_HEAD(&zone->active_list);
+ zone->need_scan = 0;
+ for (k = 0; k <= MAX_AGE ; k++) {
+ INIT_LIST_HEAD(&zone->active_anon_list[k]);
+ zone->active_anon_count[k] = 0;
+ }
+ for (k = 0; k <= MAX_AGE ; k++) {
+ INIT_LIST_HEAD(&zone->active_cache_list[k]);
+ zone->active_cache_count[k] = 0;
+ }
                 INIT_LIST_HEAD(&zone->inactive_dirty_list);
+ INIT_LIST_HEAD(&zone->inactive_laundry_list);
                 INIT_LIST_HEAD(&zone->inactive_clean_list);
+ spin_lock_init(&zone->lru_lock);
 
                 if (!size)
                         continue;
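
For reference, the NR_FOO_PAGES() macro above just stamps out one accessor
per statistic; for example NR_FOO_PAGES(nr_free_pages, free_pages) expands to
roughly:

    unsigned int nr_free_pages(void)
    {
            unsigned int sum = 0;
            zone_t *zone;

            for_each_zone(zone)
                    sum += zone->free_pages;
            return sum;
    }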
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c Tue Nov 26 19:23:38 2002
+++ b/mm/rmap.c Tue Nov 26 19:23:38 2002
@@ -14,7 +14,7 @@
 /*
  * Locking:
  * - the page->pte_chain is protected by the PG_chainlock bit,
- * which nests within the pagemap_lru_lock, then the
+ * which nests within the lru lock, then the
  * mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
  * in the page fault path, the swapout path uses trylocks
@@ -68,7 +68,7 @@
  */
 int page_referenced(struct page * page, int * rsslimit)
 {
- int referenced = 0, under_rsslimit = 0;
+ int referenced = 0, under_rsslimit = 0, active_process = 0;
         struct mm_struct * mm;
         struct pte_chain * pc;
 
@@ -85,13 +85,17 @@
                 mm = ptep_to_mm(ptep);
                 if (mm->rss < mm->rlimit_rss)
                         under_rsslimit++;
+
+ if (time_before(jiffies, mm->last_run + 5 * HZ))
+ active_process++;
         }
 
         /*
          * We're only over the RSS limit if all the processes sharing the
- * page are.
+ * page are, or if none of the processes has been active for 5
+ * seconds.
          */
- *rsslimit = !under_rsslimit;
+ *rsslimit = !(under_rsslimit && active_process);
 
         return referenced;
 }
@@ -195,7 +199,7 @@
  * table entry mapping a page. Because locking order here is opposite
  * to the locking order used by the page fault path, we use trylocks.
  * Locking:
- * pagemap_lru_lock page_launder()
+ * lru lock page_launder()
  * page lock page_launder(), trylock
  * pte_chain_lock page_launder()
  * mm->page_table_lock try_to_unmap_one(), trylock
@@ -263,7 +267,7 @@
  * @page: the page to get unmapped
  *
  * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold pagemap_lru_lock
+ * page, used in the pageout path. Caller must hold lru lock
  * and the page lock. Return values are:
  *
  * SWAP_SUCCESS - we succeeded in removing all mappings
diff -Nru a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c Tue Nov 26 19:23:38 2002
+++ b/mm/shmem.c Tue Nov 26 19:23:38 2002
@@ -581,7 +581,7 @@
          * cache and swap cache. We need to recheck the page cache
          * under the protection of the info->lock spinlock. */
 
- page = find_get_page(mapping, idx);
+ page = find_pagecache_page(mapping, idx);
         if (page) {
                 if (TryLockPage(page))
                         goto wait_retry;
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c Tue Nov 26 19:23:38 2002
+++ b/mm/swap.c Tue Nov 26 19:23:38 2002
@@ -36,7 +36,6 @@
 /**
  * (de)activate_page - move pages from/to active and inactive lists
  * @page: the page we want to move
- * @nolock - are we already holding the pagemap_lru_lock?
  *
  * Deactivate_page will move an active page to the right
  * inactive list, while activate_page will move a page back
@@ -51,18 +50,20 @@
          * (some pages aren't on any list at all)
          */
         ClearPageReferenced(page);
- page->age = 0;
- if (PageActive(page)) {
- del_page_from_active_list(page);
+ if (PageActiveAnon(page)) {
+ del_page_from_active_anon_list(page);
+ add_page_to_inactive_dirty_list(page);
+ } else if (PageActiveCache(page)) {
+ del_page_from_active_cache_list(page);
                 add_page_to_inactive_dirty_list(page);
         }
 }
 
 void deactivate_page(struct page * page)
 {
- spin_lock(&pagemap_lru_lock);
+ lru_lock(page_zone(page));
         deactivate_page_nolock(page);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(page_zone(page));
 }
 
 /**
@@ -74,16 +75,54 @@
  * on the inactive_clean list it is placed on the inactive_dirty list
  * instead.
  *
- * Note: this function gets called with the pagemap_lru_lock held.
+ * Note: this function gets called with the lru lock held.
  */
+void drop_page_zone(struct zone_struct *zone, struct page * page)
+{
+ if (!TryLockPage(page)) {
+ if (page->mapping && page->buffers) {
+ page_cache_get(page);
+ lru_unlock(zone);
+ try_to_release_page(page, GFP_NOIO);
+ lru_lock(zone);
+ page_cache_release(page);
+ }
+ UnlockPage(page);
+ }
+
+ /* Make sure the page really is reclaimable. */
+ pte_chain_lock(page);
+ if (!page->mapping || PageDirty(page) || page->pte_chain ||
+ page->buffers || page_count(page) > 1)
+ deactivate_page_nolock(page);
+
+ else if (page_count(page) == 1) {
+ ClearPageReferenced(page);
+ if (PageActiveAnon(page)) {
+ del_page_from_active_anon_list(page);
+ add_page_to_inactive_clean_list(page);
+ } else if (PageActiveCache(page)) {
+ del_page_from_active_cache_list(page);
+ add_page_to_inactive_clean_list(page);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ } else if (PageInactiveLaundry(page)) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_inactive_clean_list(page);
+ }
+ }
+ pte_chain_unlock(page);
+}
+
 void drop_page(struct page * page)
 {
         if (!TryLockPage(page)) {
                 if (page->mapping && page->buffers) {
                         page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(ALL_ZONES);
                         try_to_release_page(page, GFP_NOIO);
- spin_lock(&pagemap_lru_lock);
+ lru_lock(ALL_ZONES);
                         page_cache_release(page);
                 }
                 UnlockPage(page);
@@ -97,13 +136,18 @@
 
         else if (page_count(page) == 1) {
                 ClearPageReferenced(page);
- page->age = 0;
- if (PageActive(page)) {
- del_page_from_active_list(page);
+ if (PageActiveAnon(page)) {
+ del_page_from_active_anon_list(page);
+ add_page_to_inactive_clean_list(page);
+ } else if (PageActiveCache(page)) {
+ del_page_from_active_cache_list(page);
                         add_page_to_inactive_clean_list(page);
                 } else if (PageInactiveDirty(page)) {
                         del_page_from_inactive_dirty_list(page);
                         add_page_to_inactive_clean_list(page);
+ } else if (PageInactiveLaundry(page)) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_inactive_clean_list(page);
                 }
         }
         pte_chain_unlock(page);
@@ -116,21 +160,21 @@
 {
         if (PageInactiveDirty(page)) {
                 del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
+ add_page_to_active_list(page, INITIAL_AGE);
+ } else if (PageInactiveLaundry(page)) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_active_list(page, INITIAL_AGE);
         } else if (PageInactiveClean(page)) {
                 del_page_from_inactive_clean_list(page);
- add_page_to_active_list(page);
+ add_page_to_active_list(page, INITIAL_AGE);
         }
-
- /* Make sure the page gets a fair chance at staying active. */
- page->age = max((int)page->age, PAGE_AGE_START);
 }
 
 void activate_page(struct page * page)
 {
- spin_lock(&pagemap_lru_lock);
+ lru_lock(page_zone(page));
         activate_page_nolock(page);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(page_zone(page));
 }
 
 /**
@@ -140,10 +184,10 @@
 void lru_cache_add(struct page * page)
 {
         if (!PageLRU(page)) {
- spin_lock(&pagemap_lru_lock);
+ lru_lock(page_zone(page));
                 SetPageLRU(page);
- add_page_to_active_list(page);
- spin_unlock(&pagemap_lru_lock);
+ add_page_to_active_list(page, INITIAL_AGE);
+ lru_unlock(page_zone(page));
         }
 }
 
@@ -152,14 +196,18 @@
  * @page: the page to add
  *
  * This function is for when the caller already holds
- * the pagemap_lru_lock.
+ * the lru lock.
  */
 void __lru_cache_del(struct page * page)
 {
- if (PageActive(page)) {
- del_page_from_active_list(page);
+ if (PageActiveAnon(page)) {
+ del_page_from_active_anon_list(page);
+ } else if (PageActiveCache(page)) {
+ del_page_from_active_cache_list(page);
         } else if (PageInactiveDirty(page)) {
                 del_page_from_inactive_dirty_list(page);
+ } else if (PageInactiveLaundry(page)) {
+ del_page_from_inactive_laundry_list(page);
         } else if (PageInactiveClean(page)) {
                 del_page_from_inactive_clean_list(page);
         }
@@ -172,9 +220,9 @@
  */
 void lru_cache_del(struct page * page)
 {
- spin_lock(&pagemap_lru_lock);
+ lru_lock(page_zone(page));
         __lru_cache_del(page);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(page_zone(page));
 }
 
 /*
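
Taken together, the mm/swap.c changes above and the mm/vmscan.c changes below
move pages through the new lists roughly as sketched here; this summary is
pieced together from the quoted hunks (the laundry -> clean step is presumably
done by rebalance_laundry_zone(), which is declared but not shown):

    /*
     *   active_anon[age] / active_cache[age]
     *      |  deactivate_page()            ^  activate_page(), referenced
     *      v                               |
     *   inactive_dirty --launder_page(), IO started--> inactive_laundry
     *      |                                                |
     *      |  drop_page() (already clean)                   |  IO finished
     *      v                                                v
     *   inactive_clean --reclaim_page()--> free list
     */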
diff -Nru a/mm/swap_state.c b/mm/swap_state.c
--- a/mm/swap_state.c Tue Nov 26 19:23:38 2002
+++ b/mm/swap_state.c Tue Nov 26 19:23:38 2002
@@ -196,7 +196,7 @@
 {
         struct page *found;
 
- found = find_get_page(&swapper_space, entry.val);
+ found = find_pagecache_page(&swapper_space, entry.val);
         /*
          * Unsafe to assert PageSwapCache and mapping on page found:
          * if SMP nothing prevents swapoff from deleting this page from
@@ -224,10 +224,10 @@
                 /*
                  * First check the swap cache. Since this is normally
                  * called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics: use find_get_page()
+ * that would confuse statistics: use find_pagecache_page()
                  * directly.
                  */
- found_page = find_get_page(&swapper_space, entry.val);
+ found_page = find_pagecache_page(&swapper_space, entry.val);
                 if (found_page)
                         break;
 
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c Tue Nov 26 19:23:38 2002
+++ b/mm/vmscan.c Tue Nov 26 19:23:38 2002
@@ -12,6 +12,7 @@
  * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  * Multiqueue VM started 5.8.00, Rik van Riel.
+ * O(1) rmap VM, Arjan van de Ven <arjanv@redhat.com>
  */
 
 #include <linux/slab.h>
@@ -37,16 +38,36 @@
  */
 #define DEF_PRIORITY (6)
 
-static inline void age_page_up(struct page *page)
+static inline void age_page_up_nolock(struct page *page, int old_age)
 {
- page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX);
-}
+ int new_age;
+
+ new_age = old_age+4;
+ if (new_age < 0)
+ new_age = 0;
+ if (new_age > MAX_AGE)
+ new_age = MAX_AGE;
+
+ if (PageActiveAnon(page)) {
+ del_page_from_active_anon_list(page);
+ add_page_to_active_anon_list(page, new_age);
+ } else if (PageActiveCache(page)) {
+ del_page_from_active_cache_list(page);
+ add_page_to_active_cache_list(page, new_age);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page, new_age);
+ } else if (PageInactiveLaundry(page)) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_active_list(page, new_age);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page, new_age);
+ } else return;
 
-static inline void age_page_down(struct page *page)
-{
- page->age -= min(PAGE_AGE_DECL, (int)page->age);
 }
 
+
 /* Must be called with page's pte_chain_lock held. */
 static inline int page_mapping_inuse(struct page * page)
 {
@@ -84,9 +105,9 @@
 
         /*
          * We need to hold the pagecache_lock around all tests to make sure
- * reclaim_page() cannot race with find_get_page() and friends.
+ * reclaim_page() doesn't race with other pagecache users
          */
- spin_lock(&pagemap_lru_lock);
+ lru_lock(zone);
         spin_lock(&pagecache_lock);
         maxscan = zone->inactive_clean_pages;
         while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
@@ -94,12 +115,7 @@
                 page = list_entry(page_lru, struct page, lru);
 
                 /* Wrong page on list?! (list corruption, should not happen) */
- if (unlikely(!PageInactiveClean(page))) {
- printk("VM: reclaim_page, wrong page on list.\n");
- list_del(page_lru);
- page_zone(page)->inactive_clean_pages--;
- continue;
- }
+ BUG_ON(unlikely(!PageInactiveClean(page)));
 
                 /* Page is being freed */
                 if (unlikely(page_count(page)) == 0) {
@@ -144,7 +160,7 @@
                 UnlockPage(page);
         }
         spin_unlock(&pagecache_lock);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(zone);
         return NULL;
 
 
@@ -152,11 +168,10 @@
         __lru_cache_del(page);
         pte_chain_unlock(page);
         spin_unlock(&pagecache_lock);
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(zone);
         if (entry.val)
                 swap_free(entry);
         UnlockPage(page);
- page->age = PAGE_AGE_START;
         if (page_count(page) != 1)
                 printk("VM: reclaim_page, found page with count %d!\n",
                                 page_count(page));
@@ -164,458 +179,626 @@
 }
 
 /**
- * page_dirty - do we need to write the data out to disk
- * @page: page to test
+ * need_rebalance_dirty - do we need to write inactive stuff to disk?
+ * @zone: the zone in question
  *
- * Returns true if the page contains data which needs to
- * be written to disk. Doesn't test the page tables (yet?).
+ * Returns true if the zone in question has an imbalance between inactive
+ * dirty on one side and inactive laundry + inactive clean on the other.
+ * Right now the balance is set at 50%; this may need tuning later on.
  */
-static inline int page_dirty(struct page *page)
+static inline int need_rebalance_dirty(zone_t * zone)
 {
- struct buffer_head *tmp, *bh;
-
- if (PageDirty(page))
+ if (zone->inactive_dirty_pages > zone->inactive_laundry_pages + zone->inactive_clean_pages)
                 return 1;
 
- if (page->mapping && !page->buffers)
- return 0;
-
- tmp = bh = page->buffers;
-
- do {
- if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
- return 1;
- tmp = tmp->b_this_page;
- } while (tmp != bh);
+ return 0;
+}
 
+/**
+ * need_rebalance_laundry - does the zone have too few inactive_clean pages?
+ * @zone: the zone in question
+ *
+ * Returns true if the zone in question has too few pages in inactive clean
+ * + free
+ */
+static inline int need_rebalance_laundry(zone_t * zone)
+{
+ if (free_low(zone) >= 0)
+ return 1;
         return 0;
 }
 
 /**
- * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * launder_page - clean dirty page, move to inactive_laundry list
  * @zone: zone to free pages in
  * @gfp_mask: what operations we are allowed to do
- * @full_flush: full-out page flushing, if we couldn't get enough clean pages
+ * @page: the page at hand, must be on the inactive dirty list
  *
- * This function is called when we are low on free / inactive_clean
- * pages, its purpose is to refill the free/clean list as efficiently
- * as possible.
- *
- * This means we do writes asynchronously as long as possible and will
- * only sleep on IO when we don't have another option. Since writeouts
- * cause disk seeks and make read IO slower, we skip writes alltogether
- * when the amount of dirty pages is small.
- *
- * This code is heavily inspired by the FreeBSD source code. Thanks
- * go out to Matthew Dillon.
- */
-int page_launder_zone(zone_t * zone, int gfp_mask, int full_flush)
-{
- int maxscan, cleaned_pages, target, maxlaunder, iopages, over_rsslimit;
- struct list_head * entry, * next;
-
- target = max_t(int, free_plenty(zone), zone->pages_min);
- cleaned_pages = iopages = 0;
-
- /* If we can get away with it, only flush 2 MB worth of dirty pages */
- if (full_flush)
- maxlaunder = 1000000;
- else {
- maxlaunder = min_t(int, 512, zone->inactive_dirty_pages / 4);
- maxlaunder = max(maxlaunder, free_plenty(zone) * 4);
- }
-
- /* The main launder loop. */
- spin_lock(&pagemap_lru_lock);
-rescan:
- maxscan = zone->inactive_dirty_pages;
- entry = zone->inactive_dirty_list.prev;
- next = entry->prev;
- while (maxscan-- && !list_empty(&zone->inactive_dirty_list) &&
- next != &zone->inactive_dirty_list) {
- struct page * page;
-
- /* Low latency reschedule point */
- if (current->need_resched) {
- spin_unlock(&pagemap_lru_lock);
- schedule();
- spin_lock(&pagemap_lru_lock);
- continue;
- }
-
- entry = next;
- next = entry->prev;
- page = list_entry(entry, struct page, lru);
-
- /* This page was removed while we looked the other way. */
- if (!PageInactiveDirty(page))
- goto rescan;
+ * per-zone lru lock is assumed to be held, but this function can drop
+ * it and sleep, so no other locks are allowed to be held.
+ *
+ * returns 0 for failure; 1 for success
+ */
+int launder_page(zone_t * zone, int gfp_mask, struct page *page)
+{
+ int over_rsslimit;
 
- if (cleaned_pages > target)
- break;
+ /*
+ * Page is being freed, don't worry about it, but report progress.
+ */
+ if (unlikely(page_count(page)) == 0)
+ return 1;
 
- /* Stop doing IO if we've laundered too many pages already. */
- if (maxlaunder < 0)
- gfp_mask &= ~(__GFP_IO|__GFP_FS);
+ BUG_ON(!PageInactiveDirty(page));
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_laundry_list(page);
+ /* store the time we start IO */
+ page->age = (jiffies/HZ)&255;
+ /*
+ * The page is locked. IO in progress?
+ * If so, move to laundry and report progress
+ * Acquire PG_locked early in order to safely
+ * access page->mapping.
+ */
+ if (unlikely(TryLockPage(page))) {
+ return 1;
+ }
 
- /*
- * Page is being freed, don't worry about it.
- */
- if (unlikely(page_count(page)) == 0)
- continue;
+ /*
+ * The page is in active use or really unfreeable. Move to
+ * the active list and adjust the page age if needed.
+ */
+ pte_chain_lock(page);
+ if (page_referenced(page, &over_rsslimit) && !over_rsslimit &&
+ page_mapping_inuse(page)) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_active_list(page, INITIAL_AGE);
+ pte_chain_unlock(page);
+ UnlockPage(page);
+ return 1;
+ }
 
- /*
- * The page is locked. IO in progress?
- * Acquire PG_locked early in order to safely
- * access page->mapping.
- */
- if (unlikely(TryLockPage(page))) {
- iopages++;
- continue;
+ /*
+ * Anonymous process memory without backing store. Try to
+ * allocate it some swap space here.
+ *
+ * XXX: implement swap clustering ?
+ */
+ if (page->pte_chain && !page->mapping && !page->buffers) {
+ page_cache_get(page);
+ pte_chain_unlock(page);
+ lru_unlock(zone);
+ if (!add_to_swap(page)) {
+ activate_page(page);
+ lru_lock(zone);
+ UnlockPage(page);
+ page_cache_release(page);
+ return 0;
                 }
-
- /*
- * The page is in active use or really unfreeable. Move to
- * the active list and adjust the page age if needed.
- */
- pte_chain_lock(page);
- if (page_referenced(page, &over_rsslimit) && !over_rsslimit &&
- page_mapping_inuse(page)) {
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
- page->age = max((int)page->age, PAGE_AGE_START);
- pte_chain_unlock(page);
+ lru_lock(zone);
+ page_cache_release(page);
+ /* Note: may be on another list ! */
+ if (!PageInactiveLaundry(page)) {
                         UnlockPage(page);
- continue;
+ return 1;
+ }
+ if (unlikely(page_count(page)) == 0) {
+ UnlockPage(page);
+ return 1;
                 }
+ pte_chain_lock(page);
+ }
 
- /*
- * Anonymous process memory without backing store. Try to
- * allocate it some swap space here.
- *
- * XXX: implement swap clustering ?
- */
- if (page->pte_chain && !page->mapping && !page->buffers) {
- /* Don't bother if we can't swap it out now. */
- if (maxlaunder < 0) {
+ /*
+ * The page is mapped into the page tables of one or more
+ * processes. Try to unmap it here.
+ */
+ if (page->pte_chain && page->mapping) {
+ switch (try_to_unmap(page)) {
+ case SWAP_ERROR:
+ case SWAP_FAIL:
+ goto page_active;
+ case SWAP_AGAIN:
                                 pte_chain_unlock(page);
                                 UnlockPage(page);
- list_del(entry);
- list_add(entry, &zone->inactive_dirty_list);
- continue;
- }
- page_cache_get(page);
- pte_chain_unlock(page);
- spin_unlock(&pagemap_lru_lock);
- if (!add_to_swap(page)) {
- activate_page(page);
- UnlockPage(page);
- page_cache_release(page);
- spin_lock(&pagemap_lru_lock);
- continue;
- }
- page_cache_release(page);
- spin_lock(&pagemap_lru_lock);
- pte_chain_lock(page);
+ return 0;
+ case SWAP_SUCCESS:
+ ; /* fall through, try freeing the page below */
+ /* fixme: add a SWAP_MLOCK case */
                 }
+ }
+ pte_chain_unlock(page);
 
+ if (PageDirty(page) && page->mapping) {
                 /*
- * The page is mapped into the page tables of one or more
- * processes. Try to unmap it here.
+ * The page can be dirtied after we start writing, but
+ * in that case the dirty bit will simply be set again
+ * and we'll need to write it again.
                  */
- if (page->pte_chain && page->mapping) {
- switch (try_to_unmap(page)) {
- case SWAP_ERROR:
- case SWAP_FAIL:
- goto page_active;
- case SWAP_AGAIN:
- pte_chain_unlock(page);
- UnlockPage(page);
- continue;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
- }
+ int (*writepage)(struct page *);
+
+ writepage = page->mapping->a_ops->writepage;
+ if ((gfp_mask & __GFP_FS) && writepage) {
+ ClearPageDirty(page);
+ SetPageLaunder(page);
+ page_cache_get(page);
+ lru_unlock(zone);
+
+ writepage(page);
+
+ page_cache_release(page);
+ lru_lock(zone);
+ return 1;
+ } else {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_inactive_dirty_list(page);
+ /* FIXME: this is wrong for !__GFP_FS !!! */
+ UnlockPage(page);
+ return 0;
                 }
- pte_chain_unlock(page);
+ }
 
- if (PageDirty(page) && page->mapping) {
- /*
- * It is not critical here to write it only if
- * the page is unmapped beause any direct writer
- * like O_DIRECT would set the PG_dirty bitflag
- * on the physical page after having successfully
- * pinned it and after the I/O to the page is finished,
- * so the direct writes to the page cannot get lost.
- */
- int (*writepage)(struct page *);
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we try to free
+ * the page as well.
+ */
+ if (page->buffers) {
+ /* To avoid freeing our page before we're done. */
+ page_cache_get(page);
+ lru_unlock(zone);
 
- writepage = page->mapping->a_ops->writepage;
- if ((gfp_mask & __GFP_FS) && writepage) {
- ClearPageDirty(page);
- SetPageLaunder(page);
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
+ try_to_release_page(page, gfp_mask);
+ UnlockPage(page);
 
- writepage(page);
- maxlaunder--;
- iopages++;
- page_cache_release(page);
+ /*
+ * If the buffers were the last user of the page we free
+ * the page here. Because of that we shouldn't hold the
+ * lru lock yet.
+ */
+ page_cache_release(page);
 
- spin_lock(&pagemap_lru_lock);
- continue;
- } else {
- UnlockPage(page);
- list_del(entry);
- list_add(entry, &zone->inactive_dirty_list);
- continue;
- }
- }
+ lru_lock(zone);
+ return 1;
+ }
 
+ /*
+ * If the page is really freeable now, move it to the
+ * inactive_laundry list to keep LRU order.
+ *
+ * We re-test everything since the page could have been
+ * used by somebody else while we waited on IO above.
+ * This test is not safe from races; only the one in
+ * reclaim_page() needs to be.
+ */
+ pte_chain_lock(page);
+ if (page->mapping && !PageDirty(page) && !page->pte_chain &&
+ page_count(page) == 1) {
+ pte_chain_unlock(page);
+ UnlockPage(page);
+ return 1;
+ } else {
                 /*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it
+ * back to the active list.
                  */
- if (page->buffers) {
- /* To avoid freeing our page before we're done. */
- page_cache_get(page);
+ page_active:
+ activate_page_nolock(page);
+ pte_chain_unlock(page);
+ UnlockPage(page);
+ }
+ return 0;
+}
 
- spin_unlock(&pagemap_lru_lock);
 
- if (try_to_release_page(page, gfp_mask)) {
- if (!page->mapping) {
- /*
- * We must not allow an anon page
- * with no buffers to be visible on
- * the LRU, so we unlock the page after
- * taking the lru lock
- */
- spin_lock(&pagemap_lru_lock);
- UnlockPage(page);
- __lru_cache_del(page);
+unsigned char active_age_bias = 0;
 
- /* effectively free the page here */
- page_cache_release(page);
+/* Ages down all pages on the active list */
+/* assumes the lru lock is held */
+static inline void kachunk_anon(struct zone_struct * zone)
+{
+ int k;
+ if (!list_empty(&zone->active_anon_list[0]))
+ return;
+ if (!zone->active_anon_pages)
+ return;
 
- cleaned_pages++;
- continue;
- } else {
- /*
- * We freed the buffers but may have
- * slept; undo the stuff we did before
- * try_to_release_page and fall through
- * to the next step.
- * But only if the page is still on the inact. dirty
- * list.
- */
-
- spin_lock(&pagemap_lru_lock);
- /* Check if the page was removed from the list
- * while we looked the other way.
- */
- if (!PageInactiveDirty(page)) {
- page_cache_release(page);
- continue;
- }
- page_cache_release(page);
- }
- } else {
- /* failed to drop the buffers so stop here */
- UnlockPage(page);
- page_cache_release(page);
- maxlaunder--;
- iopages++;
+ for (k = 0; k < MAX_AGE; k++) {
+ list_splice_init(&zone->active_anon_list[k+1], &zone->active_anon_list[k]);
+ zone->active_anon_count[k] = zone->active_anon_count[k+1];
+ zone->active_anon_count[k+1] = 0;
+ }
+
+ active_age_bias++;
+ /* flag this zone as having had activity -> rescan to age up is desired */
+ zone->need_scan++;
+}
+
+static inline void kachunk_cache(struct zone_struct * zone)
+{
+ int k;
+ if (!list_empty(&zone->active_cache_list[0]))
+ return;
+ if (!zone->active_cache_pages)
+ return;
+
+ for (k = 0; k < MAX_AGE; k++) {
+ list_splice_init(&zone->active_cache_list[k+1], &zone->active_cache_list[k]);
+ zone->active_cache_count[k] = zone->active_cache_count[k+1];
+ zone->active_cache_count[k+1] = 0;
+ }
 
- spin_lock(&pagemap_lru_lock);
+ active_age_bias++;
+ /* flag this zone as having had activity -> rescan to age up is desired */
+ zone->need_scan++;
+}
+
+#define BATCH_WORK_AMOUNT 64
+
+/*
+ * returns the active cache ratio relative to the total active list
+ * times 10 (eg. 30% cache returns 3)
+ */
+static inline int cache_ratio(struct zone_struct * zone)
+{
+ if (!zone->size)
+ return 0;
+ return 10 * zone->active_cache_pages / (zone->active_cache_pages +
+ zone->active_anon_pages + 1);
+}
+
+/*
+ * If the active_cache list is more than 20% of all active pages,
+ * we do extra heavy reclaim from this list and less reclaiming of
+ * the active_anon pages.
+ * These arrays are indexed by cache_ratio(), ie 0%, 10%, 20% ... 100%
+ */
+static int active_anon_work[11] = {32, 32, 12, 4, 2, 1, 1, 1, 1, 1, 1};
+static int active_cache_work[11] = {32, 32, 52, 60, 62, 63, 63, 63, 63, 63, 63};
+
+/**
+ * refill_inactive_zone - scan the active list and find pages to deactivate
+ * @priority: how much are we allowed to scan
+ *
+ * This function will scan a portion of the active list of a zone to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_zone(struct zone_struct * zone, int priority, int target)
+{
+ int maxscan = (zone->active_anon_pages + zone->active_cache_pages) >> priority;
+ struct list_head * page_lru;
+ struct page * page;
+ int over_rsslimit;
+ int progress = 0;
+ int ratio;
+
+ /* Take the lock while messing with the list... */
+ lru_lock(zone);
+ if (target < BATCH_WORK_AMOUNT)
+ target = BATCH_WORK_AMOUNT;
+
+ ratio = cache_ratio(zone);
+
+ while (maxscan-- && zone->active_anon_pages + zone->active_cache_pages > 0 && target > 0) {
+ int anon_work, cache_work;
+ anon_work = active_anon_work[ratio];
+ cache_work = active_cache_work[ratio];
+
+ while (anon_work-- >= 0 && zone->active_anon_pages) {
+ if (list_empty(&zone->active_anon_list[0])) {
+ kachunk_anon(zone);
                                 continue;
                         }
- }
 
+ page_lru = zone->active_anon_list[0].prev;
+ page = list_entry(page_lru, struct page, lru);
 
- /*
- * If the page is really freeable now, move it to the
- * inactive_clean list.
- *
- * We re-test everything since the page could have been
- * used by somebody else while we waited on IO above.
- * This test is not safe from races, but only the one
- * in reclaim_page() needs to be.
- */
- pte_chain_lock(page);
- if (page->mapping && !PageDirty(page) && !page->pte_chain &&
- page_count(page) == 1) {
- del_page_from_inactive_dirty_list(page);
- add_page_to_inactive_clean_list(page);
+ /* Wrong page on list?! (list corruption, should not happen) */
+ BUG_ON(unlikely(!PageActiveAnon(page)));
+
+ /* Needed to follow page->mapping */
+ if (TryLockPage(page)) {
+ /* The page is already locked. This for sure means
+ * someone is doing stuff with it which makes it
+ * active by definition ;)
+ */
+ del_page_from_active_anon_list(page);
+ add_page_to_active_anon_list(page, INITIAL_AGE);
+ continue;
+ }
+
+ /*
+ * Do aging on the pages.
+ */
+ pte_chain_lock(page);
+ if (page_referenced(page, &over_rsslimit) && !over_rsslimit) {
+ pte_chain_unlock(page);
+ age_page_up_nolock(page, 0);
+ UnlockPage(page);
+ continue;
+ }
                         pte_chain_unlock(page);
+
+ deactivate_page_nolock(page);
+ target--;
+ progress++;
                         UnlockPage(page);
- cleaned_pages++;
- } else {
+ }
+
+ while (cache_work-- >= 0 && zone->active_cache_pages) {
+ if (list_empty(&zone->active_cache_list[0])) {
+ kachunk_cache(zone);
+ continue;
+ }
+
+ page_lru = zone->active_cache_list[0].prev;
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ BUG_ON(unlikely(!PageActiveCache(page)));
+
+ /* Needed to follow page->mapping */
+ if (TryLockPage(page)) {
+ /* The page is already locked. This for sure means
+ * someone is doing stuff with it which makes it
+ * active by definition ;)
+ */
+ del_page_from_active_cache_list(page);
+ add_page_to_active_cache_list(page, INITIAL_AGE);
+ continue;
+ }
+
                         /*
- * OK, we don't know what to do with the page.
- * It's no use keeping it here, so we move it to
- * the active list.
+ * Do aging on the pages.
                          */
-page_active:
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
+ pte_chain_lock(page);
+ if (page_referenced(page, &over_rsslimit) && !over_rsslimit) {
+ pte_chain_unlock(page);
+ age_page_up_nolock(page, 0);
+ UnlockPage(page);
+ continue;
+ }
                         pte_chain_unlock(page);
+
+ deactivate_page_nolock(page);
+ target--;
+ progress++;
                         UnlockPage(page);
                 }
         }
- spin_unlock(&pagemap_lru_lock);
+ lru_unlock(zone);
 
- /* Return the number of pages moved to the inactive_clean list. */
- return cleaned_pages + iopages;
+ return progress;
 }
 
-/**
- * page_launder - clean dirty inactive pages, move to inactive_clean list
- * @gfp_mask: what operations we are allowed to do
- *
- * This function iterates over all zones and calls page_launder_zone(),
- * balancing still needs to be added...
- */
-int page_launder(int gfp_mask)
+static int need_active_anon_scan(struct zone_struct * zone)
 {
- struct zone_struct * zone;
- int freed = 0;
+ int low = 0, high = 0;
+ int k;
+ for (k = 0; k < MAX_AGE/2; k++)
+ low += zone->active_anon_count[k];
 
- /* Global balancing while we have a global shortage. */
- if (free_high(ALL_ZONES) >= 0)
- for_each_zone(zone)
- if (free_plenty(zone) >= 0)
- freed += page_launder_zone(zone, gfp_mask, 0);
-
- /* Clean up the remaining zones with a serious shortage, if any. */
- for_each_zone(zone)
- if (free_low(zone) >= 0) {
- int fullflush = free_min(zone) > 0;
- freed += page_launder_zone(zone, gfp_mask, fullflush);
- }
+ for (k = MAX_AGE/2; k <= MAX_AGE; k++)
+ high += zone->active_anon_count[k];
+
+ if (high < low)
+ return 1;
+ return 0;
+}
+
+static int need_active_cache_scan(struct zone_struct * zone)
+{
+ int low = 0, high = 0;
+ int k;
+ for (k = 0; k < MAX_AGE/2; k++)
+ low += zone->active_cache_count[k];
+
+ for (k = MAX_AGE/2; k <= MAX_AGE; k++)
+ high += zone->active_cache_count[k];
 
- return freed;
+ if (high < low)
+ return 1;
+ return 0;
 }
 
-/**
- * refill_inactive_zone - scan the active list and find pages to deactivate
- * @priority: how much are we allowed to scan
- *
- * This function will scan a portion of the active list of a zone to find
- * unused pages, those pages will then be moved to the inactive list.
+static int scan_active_list(struct zone_struct * zone, int age, int anon)
+{
+ struct list_head * list, *page_lru , *next;
+ struct page * page;
+ int over_rsslimit;
+
+ if (anon)
+ list = &zone->active_anon_list[age];
+ else
+ list = &zone->active_cache_list[age];
+
+ /* Take the lock while messing with the list... */
+ lru_lock(zone);
+ list_for_each_safe(page_lru, next, list) {
+ page = list_entry(page_lru, struct page, lru);
+ pte_chain_lock(page);
+ if (page_referenced(page, &over_rsslimit) && !over_rsslimit)
+ age_page_up_nolock(page, age);
+ pte_chain_unlock(page);
+ }
+ lru_unlock(zone);
+ return 0;
+}
+
+/*
+ * Move max_work pages to the inactive clean list as long as there is a need
+ * for this. If gfp_mask allows it, sleep for IO to finish.
  */
-int refill_inactive_zone(struct zone_struct * zone, int priority)
+int rebalance_laundry_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask)
 {
- int maxscan = zone->active_pages >> priority;
- int nr_deactivated = 0, over_rsslimit;
- int target = inactive_high(zone);
         struct list_head * page_lru;
+ int max_loop;
+ int work_done = 0;
         struct page * page;
 
+ max_loop = max_work;
+ if (max_loop < BATCH_WORK_AMOUNT)
+ max_loop = BATCH_WORK_AMOUNT;
         /* Take the lock while messing with the list... */
- spin_lock(&pagemap_lru_lock);
- while (maxscan-- && !list_empty(&zone->active_list)) {
- page_lru = zone->active_list.prev;
+ lru_lock(zone);
+ while (max_loop-- && !list_empty(&zone->inactive_laundry_list)) {
+ page_lru = zone->inactive_laundry_list.prev;
                 page = list_entry(page_lru, struct page, lru);
 
                 /* Wrong page on list?! (list corruption, should not happen) */
- if (unlikely(!PageActive(page))) {
- printk("VM: refill_inactive, wrong page on list.\n");
- list_del(page_lru);
- nr_active_pages--;
- continue;
- }
-
- /* Needed to follow page->mapping */
+ BUG_ON(unlikely(!PageInactiveLaundry(page)));
+
+ /* TryLock to see if the page IO is done */
                 if (TryLockPage(page)) {
- list_del(page_lru);
- list_add(page_lru, &zone->active_list);
- continue;
+ /*
+ * Page is locked (IO in progress?). If we can sleep,
+ * wait for it to finish, except when we've already
+ * done enough work.
+ */
+ if ((gfp_mask & __GFP_WAIT) && (work_done < max_work)) {
+ int timed_out;
+
+ page_cache_get(page);
+ lru_unlock(zone);
+ run_task_queue(&tq_disk);
+ timed_out = wait_on_page_timeout(page, 5 * HZ);
+ lru_lock(zone);
+ page_cache_release(page);
+ /*
+ * If we timed out and the page has been in
+ * flight for over 30 seconds, this might not
+ * be the best page to wait on; move it to
+ * the head of the dirty list.
+ */
+ if (timed_out && PageInactiveLaundry(page)) {
+ unsigned char now;
+ now = (jiffies / HZ) & 255;
+ if (now - page->age > 30) {
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_inactive_dirty_list(page);
+ }
+ continue;
+ }
+ /* We didn't make any progress for our caller,
+ * but we are actively avoiding a livelock,
+ * so undo the decrement and wait on this page
+ * some more, until the IO finishes or we time out.
+ */
+ max_loop++;
+ continue;
+ } else
+ /* No dice, we can't wait for IO */
+ break;
                 }
+ UnlockPage(page);
 
                 /*
- * If the object the page is in is not in use we don't
- * bother with page aging. If the page is touched again
- * while on the inactive_clean list it'll be reactivated.
- * From here until the end of the current iteration
- * both PG_locked and the pte_chain_lock are held.
+ * If we get here either the IO on the page is done or
+ * IO never happened because it was clean. Either way
+ * move it to the inactive clean list.
                  */
- pte_chain_lock(page);
- if (!page_mapping_inuse(page)) {
- pte_chain_unlock(page);
- UnlockPage(page);
- drop_page(page);
- continue;
- }
+
+ /* FIXME: check if the page is still clean or is accessed ? */
+
+ del_page_from_inactive_laundry_list(page);
+ add_page_to_inactive_clean_list(page);
+ work_done++;
 
                 /*
- * Do aging on the pages.
+ * If we've done the minimal batch of work and there's
+ * no longer a need to rebalance, abort now.
                  */
- if (page_referenced(page, &over_rsslimit)) {
- age_page_up(page);
- } else {
- age_page_down(page);
- }
+ if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_laundry(zone)))
+ break;
+ }
 
- /*
- * If the page age is 'hot' and the process using the
- * page doesn't exceed its RSS limit we keep the page.
- * Otherwise we move it to the inactive_dirty list.
+ lru_unlock(zone);
+ return work_done;
+}
+
+/*
+ * Move max_work pages from the dirty list as long as there is a need.
+ * Start IO if the gfp_mask allows it.
+ */
+int rebalance_dirty_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask)
+{
+ struct list_head * page_lru;
+ int max_loop;
+ int work_done = 0;
+ struct page * page;
+
+ max_loop = max_work;
+ if (max_loop < BATCH_WORK_AMOUNT)
+ max_loop = BATCH_WORK_AMOUNT;
+ /* Take the lock while messing with the list... */
+ lru_lock(zone);
+ while (max_loop-- && !list_empty(&zone->inactive_dirty_list)) {
+ page_lru = zone->inactive_dirty_list.prev;
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ BUG_ON(unlikely(!PageInactiveDirty(page)));
+
+ /*
+ * Note: launder_page() sleeps so we can't safely look at
+ * the page after this point!
+ *
+ * If we fail (only happens if we can't do IO) we just try
+ * again on another page; launder_page makes sure we won't
+ * see the same page over and over again.
                  */
- if (page->age && !over_rsslimit) {
- list_del(page_lru);
- list_add(page_lru, &zone->active_list);
- } else {
- deactivate_page_nolock(page);
- if (++nr_deactivated > target) {
- pte_chain_unlock(page);
- UnlockPage(page);
- goto done;
- }
- }
- pte_chain_unlock(page);
- UnlockPage(page);
+ if (!launder_page(zone, gfp_mask, page))
+ continue;
 
- /* Low latency reschedule point */
- if (current->need_resched) {
- spin_unlock(&pagemap_lru_lock);
- schedule();
- spin_lock(&pagemap_lru_lock);
- }
+ work_done++;
+
+ /*
+ * If we've done the minimal batch of work and there's
+ * no longer any need to rebalance, abort now.
+ */
+ if ((work_done > BATCH_WORK_AMOUNT) && (!need_rebalance_dirty(zone)))
+ break;
         }
+ lru_unlock(zone);
+
+ return work_done;
+}
+
+/* goal_percentage sets the target for laundry + clean + free pages as a percentage of the total zone size */
+int rebalance_inactive_zone(struct zone_struct * zone, int max_work, unsigned int gfp_mask, int goal_percentage)
+{
+ int ret = 0;
+ /* first deactivate memory */
+ if (((zone->inactive_laundry_pages + zone->inactive_clean_pages + zone->free_pages)*100 < zone->size * goal_percentage) &&
+ (inactive_high(zone) > 0))
+ refill_inactive_zone(zone, 0, max_work + BATCH_WORK_AMOUNT);
+
+ if (need_rebalance_dirty(zone))
+ ret += rebalance_dirty_zone(zone, max_work, gfp_mask);
+ if (need_rebalance_laundry(zone))
+ ret += rebalance_laundry_zone(zone, max_work, gfp_mask);
 
-done:
- spin_unlock(&pagemap_lru_lock);
+ /* These pages will become freeable, let the OOM detection know */
+ ret += zone->inactive_laundry_pages;
 
- return nr_deactivated;
+ return ret;
 }
 
-/**
- * refill_inactive - checks all zones and refills the inactive list as needed
- *
- * This function tries to balance page eviction from all zones by aging
- * the pages from each zone in the same ratio until the global inactive
- * shortage is resolved. After that it does one last "clean-up" scan to
- * fix up local inactive shortages.
- */
-int refill_inactive(void)
+int rebalance_inactive(unsigned int gfp_mask, int percentage)
 {
- int maxtry = 1 << DEF_PRIORITY;
- zone_t * zone;
+ struct zone_struct * zone;
+ int max_work;
         int ret = 0;
 
- /* Global balancing while we have a global shortage. */
- while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
- for_each_zone(zone) {
- if (inactive_high(zone) >= 0)
- ret += refill_inactive_zone(zone, DEF_PRIORITY);
- }
- }
+ max_work = 4 * BATCH_WORK_AMOUNT;
+ /* If we're in deeper trouble, do more work */
+ if (percentage >= 50)
+ max_work = 8 * BATCH_WORK_AMOUNT;
 
- /* Local balancing for zones which really need it. */
- for_each_zone(zone) {
- if (inactive_min(zone) >= 0)
- ret += refill_inactive_zone(zone, 0);
- }
+ for_each_zone(zone)
+ ret += rebalance_inactive_zone(zone, max_work, gfp_mask, percentage);
+ /* 4 * BATCH_WORK_AMOUNT needs tuning */
 
         return ret;
 }
@@ -636,7 +819,9 @@
 
         for_each_zone(zone)
                 if (inactive_high(zone) > 0)
- refill_inactive_zone(zone, priority);
+ refill_inactive_zone(zone, priority, BATCH_WORK_AMOUNT);
+ for_each_zone(zone)
+ rebalance_dirty_zone(zone, BATCH_WORK_AMOUNT, GFP_KSWAPD);
 }
 
 /*
@@ -655,18 +840,13 @@
          * Eat memory from filesystem page cache, buffer cache,
          * dentry, inode and filesystem quota caches.
          */
- ret += page_launder(gfp_mask);
+ ret += rebalance_inactive(gfp_mask, 100);
         ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
         ret += shrink_icache_memory(1, gfp_mask);
 #ifdef CONFIG_QUOTA
         ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
 
- /*
- * Move pages from the active list to the inactive list.
- */
- refill_inactive();
-
         /*
          * Reclaim unused slab cache memory.
          */
@@ -682,12 +862,54 @@
          * Hmm.. Cache shrink failed - time to kill something?
          * Mhwahahhaha! This is the part I really like. Giggle.
          */
- if (ret < free_low(ANY_ZONE))
+ if (ret < free_low(ANY_ZONE) && (gfp_mask & __GFP_WAIT))
                 out_of_memory();
 
         return ret;
 }
 
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages_kswapd(unsigned int gfp_mask)
+{
+ int ret = 0;
+ struct zone_struct * zone;
+
+ ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+ ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+#ifdef CONFIG_QUOTA
+ ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+#endif
+
+ /*
+ * Eat memory from filesystem page cache, buffer cache,
+ * dentry, inode and filesystem quota caches.
+ */
+ rebalance_inactive(gfp_mask, 5);
+
+ for_each_zone(zone)
+ while (need_rebalance_dirty(zone))
+ rebalance_dirty_zone(zone, 16 * BATCH_WORK_AMOUNT, gfp_mask);
+
+ for_each_zone(zone)
+ if (free_high(zone) > 0)
+ rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0);
+
+ refill_freelist();
+
+ /* Start IO when needed. */
+ if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+ run_task_queue(&tq_disk);
+
+ return ret;
+}
+
 /**
  * refill_freelist - move inactive_clean pages to free list if needed
  *
@@ -764,7 +986,7 @@
                  * zone is very short on free pages.
                  */
                 if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
- do_try_to_free_pages(GFP_KSWAPD);
+ do_try_to_free_pages_kswapd(GFP_KSWAPD);
 
                 refill_freelist();
 
@@ -846,7 +1068,7 @@
         /* OK, the VM is very loaded. Sleep instead of using all CPU. */
         kswapd_overloaded = 1;
         set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ / 4);
+ schedule_timeout(HZ / 40);
         kswapd_overloaded = 0;
         return;
 }
@@ -888,6 +1110,7 @@
 void rss_free_pages(unsigned int gfp_mask)
 {
         long pause = 0;
+ struct zone_struct * zone;
 
         if (current->flags & PF_MEMALLOC)
                 return;
@@ -895,7 +1118,10 @@
         current->flags |= PF_MEMALLOC;
 
         do {
- page_launder(gfp_mask);
+ rebalance_inactive(gfp_mask, 100);
+ for_each_zone(zone)
+ if (free_plenty(zone) >= 0)
+ rebalance_laundry_zone(zone, BATCH_WORK_AMOUNT, 0);
 
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 schedule_timeout(pause);
@@ -907,11 +1133,78 @@
         return;
 }
 
+/*
+ * The background page scanning daemon, started as a kernel thread
+ * from the init process.
+ *
+ * This is the part that scans the active lists in the background to find
+ * referenced pages and increase their age score.
+ * It is important that the scan rate is not proportional to VM pressure
+ * per se, otherwise CPU usage becomes unbounded. On the other hand, if
+ * there is no VM pressure at all it should not age pages either, otherwise
+ * everything ends up at the maximum age.
+ */
+#define MAX_AGING_INTERVAL (5*HZ)
+#define MIN_AGING_INTERVAL (HZ/2)
+int kscand(void *unused)
+{
+ struct task_struct *tsk = current;
+ struct zone_struct * zone;
+ unsigned long pause = MAX_AGING_INTERVAL;
+ int total_needscan = 0;
+ int age_faster = 0;
+ int num_zones = 0;
+ int age;
+
+ daemonize();
+ strcpy(tsk->comm, "kscand");
+ sigfillset(&tsk->blocked);
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ for_each_zone(zone) {
+ if (need_active_anon_scan(zone)) {
+ for (age = 0; age < MAX_AGE; age++) {
+ scan_active_list(zone, age, 1);
+ if (current->need_resched)
+ schedule();
+ }
+ }
+
+ if (need_active_cache_scan(zone)) {
+ for (age = 0; age < MAX_AGE; age++) {
+ scan_active_list(zone, age, 0);
+ if (current->need_resched)
+ schedule();
+ }
+ }
+
+ /* Check if we've been aging quickly enough */
+ if (zone->need_scan >= 2)
+ age_faster++;
+ total_needscan += zone->need_scan;
+ zone->need_scan = 0;
+ num_zones++;
+ }
+ if (age_faster)
+ pause = max(pause / 2, MIN_AGING_INTERVAL);
+ else if (total_needscan < num_zones)
+ pause = min(pause + pause / 2, MAX_AGING_INTERVAL);
+
+ total_needscan = 0;
+ age_faster = 0;
+ num_zones = 0;
+ }
+}
+
+
 static int __init kswapd_init(void)
 {
         printk("Starting kswapd\n");
         swap_setup();
         kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ kernel_thread(kscand, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
         return 0;
 }
 

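For anyone who wants to poke at the new heuristics outside the kernel, here is a
minimal userspace sketch of the "kachunk" rotation that refill_inactive_zone()
depends on: when age bucket 0 runs dry, every bucket slides down one slot so the
scanner keeps pulling its work from index 0. MAX_AGE and the plain counters are
stand-ins; the real code splices list_heads under the per-zone lru lock and also
bumps active_age_bias and need_scan.

#include <stdio.h>

#define MAX_AGE 8	/* stand-in value; the real constant lives in the mm headers */

static unsigned long age_count[MAX_AGE + 1];

/* Shift every age bucket down by one slot when bucket 0 is empty,
 * so the scanner always finds its work at index 0. */
static void kachunk_model(void)
{
	int k;

	if (age_count[0] != 0)
		return;		/* bucket 0 still has pages, nothing to do */

	for (k = 0; k < MAX_AGE; k++) {
		age_count[k] = age_count[k + 1];
		age_count[k + 1] = 0;
	}
}

int main(void)
{
	int k;

	age_count[3] = 10;
	age_count[7] = 4;
	kachunk_model();	/* every bucket slides down one slot */

	for (k = 0; k <= MAX_AGE; k++)
		printf("age %d: %lu pages\n", k, age_count[k]);
	return 0;
}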
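The split between anon and cache deactivation is driven by the two work tables:
cache_ratio() reduces the cache share of the active pages to tenths, and that
index picks the per-iteration quotas. The tables and the formula below are the
ones from the patch; the zone contents in main() are made up.

#include <stdio.h>

/* Indexed by cache share in tenths (0%, 10%, ... 100%). */
static const int active_anon_work[11]  = {32, 32, 12,  4,  2,  1,  1,  1,  1,  1,  1};
static const int active_cache_work[11] = {32, 32, 52, 60, 62, 63, 63, 63, 63, 63, 63};

/* Same arithmetic as cache_ratio() in the patch, minus the zone argument. */
static int cache_ratio(unsigned long cache_pages, unsigned long anon_pages)
{
	return 10 * cache_pages / (cache_pages + anon_pages + 1);
}

int main(void)
{
	unsigned long anon = 6000, cache = 4000;	/* made-up zone contents */
	int ratio = cache_ratio(cache, anon);		/* 40% cache, index 3 after integer division */

	printf("per-iteration quota: %d anon, %d cache deactivations\n",
	       active_anon_work[ratio], active_cache_work[ratio]);
	return 0;
}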
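The new page life cycle, reduced to a toy state machine: the names follow the
lists in the patch, the transitions are a simplification of launder_page() and
rebalance_laundry_zone(), and the !__GFP_FS case simply keeps the page dirty
here. Illustrative only.

#include <stdio.h>

enum lru_state {
	INACTIVE_DIRTY,		/* candidate for writeback */
	INACTIVE_LAUNDRY,	/* writeback submitted, IO in flight */
	INACTIVE_CLEAN,		/* IO done, page is reclaimable */
};

/* launder_page(): start writeback when the gfp mask allows FS activity,
 * otherwise the page stays dirty for a later pass. */
static enum lru_state launder(enum lru_state s, int can_do_fs_io)
{
	if (s == INACTIVE_DIRTY && can_do_fs_io)
		return INACTIVE_LAUNDRY;
	return s;
}

/* rebalance_laundry_zone(): once the page unlocks, move it to clean. */
static enum lru_state laundry_done(enum lru_state s, int io_complete)
{
	if (s == INACTIVE_LAUNDRY && io_complete)
		return INACTIVE_CLEAN;
	return s;
}

int main(void)
{
	enum lru_state s = INACTIVE_DIRTY;

	s = launder(s, 1);		/* dirty -> laundry */
	s = laundry_done(s, 1);		/* laundry -> clean */
	printf("final state: %d (2 == INACTIVE_CLEAN)\n", s);
	return 0;
}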
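The 30-second check in rebalance_laundry_zone() keeps time as seconds modulo
256. A sketch of that bookkeeping follows; the difference is reduced back to
unsigned char so the comparison stays meaningful across the 255 -> 0 wrap, and
the point where page->age gets stamped at submit time is outside the hunks
above.

#include <stdio.h>

/* now and stamp are both (jiffies/HZ) & 255, i.e. seconds modulo 256. */
static int waited_too_long(unsigned char now, unsigned char stamp)
{
	/* unsigned char subtraction is well defined across the wrap */
	return (unsigned char)(now - stamp) > 30;
}

int main(void)
{
	printf("%d\n", waited_too_long(40, 5));		/* 35s in flight -> 1 */
	printf("%d\n", waited_too_long(10, 250));	/* wrapped, 16s in flight -> 0 */
	return 0;
}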
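need_active_anon_scan() and need_active_cache_scan() ask for an age-up pass when
the low half of the age distribution outweighs the high half; the sketch below
models that with a plain array of per-age counts (MAX_AGE is again a stand-in).

#include <stdio.h>

#define MAX_AGE 8	/* stand-in value */

/* Rescan when the low half of the distribution holds more pages
 * than the high half, mirroring need_active_anon_scan(). */
static int need_scan(const unsigned long count[MAX_AGE + 1])
{
	unsigned long low = 0, high = 0;
	int k;

	for (k = 0; k < MAX_AGE / 2; k++)
		low += count[k];
	for (k = MAX_AGE / 2; k <= MAX_AGE; k++)
		high += count[k];

	return high < low;
}

int main(void)
{
	unsigned long count[MAX_AGE + 1] = {50, 40, 30, 20, 5, 5, 5, 5, 5};

	printf("need scan: %d\n", need_scan(count));	/* low half dominates -> 1 */
	return 0;
}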
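And kscand's self-tuning sleep: halve the pause (down to MIN_AGING_INTERVAL)
when some zone needed aging more than once since the last pass, grow it by half
(up to MAX_AGING_INTERVAL) when no zone asked for a rescan at all. HZ is a
stand-in value here.

#include <stdio.h>

#define HZ 100				/* stand-in value */
#define MAX_AGING_INTERVAL (5 * HZ)
#define MIN_AGING_INTERVAL (HZ / 2)

/* Same adjustment as the end of kscand's main loop. */
static long adjust_pause(long pause, int age_faster, int total_needscan,
			 int num_zones)
{
	if (age_faster) {
		pause /= 2;
		if (pause < MIN_AGING_INTERVAL)
			pause = MIN_AGING_INTERVAL;
	} else if (total_needscan < num_zones) {
		pause += pause / 2;
		if (pause > MAX_AGING_INTERVAL)
			pause = MAX_AGING_INTERVAL;
	}
	return pause;
}

int main(void)
{
	long pause = MAX_AGING_INTERVAL;

	pause = adjust_pause(pause, 1, 4, 2);	/* pressure: sleep shrinks */
	printf("after a busy pass: %ld ticks\n", pause);

	pause = adjust_pause(pause, 0, 0, 2);	/* idle: sleep grows again */
	printf("after an idle pass: %ld ticks\n", pause);
	return 0;
}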
-- 
Paul P 'Stingray' Komkoff 'Greatest' Jr /// (icq)23200764 /// (http)stingr.net
  When you're invisible, the only one really watching you is you (my keychain)