PATCH: Improvements in shrink_mmap and kswapd

From: Juan J. Quintela (quintela@fi.udc.es)
Date: Sat Jun 17 2000 - 17:45:52 EST


Hi
        this patch makes kswapd use fewer resources. It should solve
the "kswapd eats xx% of my CPU" problems. It also appears to improve
IO a bit here. Could people who are having IO problems tell me
whether this patch improves things? I am interested in confirming
that it never makes things worse. This patch is stable here. I am
finishing the deferred writing of mmaped file pages patch, which
should solve several other problems.

Reports of success or failure are welcome, and so are comments.

Later, Juan.

        This patch implements:
- shrink_mmap never loops infinitely (it visits each page at most
  once per call).
- Changes the nr_dirty logic to max_page_launder logic. We start a
  maximum of max_page_launder (100) async writes, and past that point
  we start no more writes for that run of shrink_mmap. If we have
  started max_page_launder writes, we wait at the end of the function
  when possible (i.e. when gfp_mask allows it).
- Checks that some zone still needs pages before continuing the
  loop. If no zone does, we stop walking the LRU.
- Took Roger Larson's memory pressure patch and partially
  reimplemented/extended it.
- Rewrote kswapd in a way similar to Roger Larson's version.
- Added the function memory_pressure(), which returns 1 if there is
  memory pressure and 0 if there is none.
- Took Manfred's patch to use the test_and_test_and_clear_bit
  optimization in PageTestandClearReferenced (see the sketch after
  this list).
- Added ClearPageDirty(page) to __remove_inode_pages to solve the
  ramfs problems.
- Added __lru_cache_del and __lru_cache_add and used them in
  shrink_mmap.
- Cleaned up several pieces of cruft in shrink_mmap.
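
        To make Manfred's optimization easier to follow, here is a
minimal sketch of the idea (my illustration, written in the style of
asm-i386/bitops.h, not a verbatim copy of the patch). On SMP,
test_and_clear_bit() is a locked read-modify-write cycle that dirties
the cache line even when the bit is already clear. We can first do a
cheap unlocked read with test_bit() and only fall back to the atomic
operation when the bit is actually set:

extern __inline__ int test_and_test_and_clear_bit(int nr,
                                                  volatile void *addr)
{
        /* Unlocked read: if the bit is already clear (the common
         * case for PG_referenced on old pages), we avoid the locked
         * bus cycle entirely. */
        if (!test_bit(nr, addr))
                return 0;
        /* The bit appears set: do the real atomic test-and-clear.
         * The atomic op re-checks the bit, so a race here is
         * harmless. */
        return test_and_clear_bit(nr, addr);
}

This matters because shrink_mmap calls PageTestandClearReferenced()
on every page it examines; pages whose referenced bit is already
clear now cost only a plain load.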

diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/asm-i386/bitops.h working/include/asm-i386/bitops.h
--- base/include/asm-i386/bitops.h Sat Jun 17 23:37:03 2000
+++ working/include/asm-i386/bitops.h Sat Jun 17 23:52:49 2000
@@ -29,6 +29,7 @@
 extern void change_bit(int nr, volatile void * addr);
 extern int test_and_set_bit(int nr, volatile void * addr);
 extern int test_and_clear_bit(int nr, volatile void * addr);
+extern int test_and_test_and_clear_bit(int nr, volatile void * addr);
 extern int test_and_change_bit(int nr, volatile void * addr);
 extern int __constant_test_bit(int nr, const volatile void * addr);
 extern int __test_bit(int nr, volatile void * addr);
@@ -87,6 +88,13 @@
                 :"=r" (oldbit),"=m" (ADDR)
                 :"Ir" (nr));
         return oldbit;
+}
+
+extern __inline__ int test_and_test_and_clear_bit(int nr, volatile void *addr)
+{
+        if (!test_bit(nr, addr))
+                return 0;
+        return test_and_clear_bit(nr, addr);
 }
 
 extern __inline__ int test_and_change_bit(int nr, volatile void * addr)
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/linux/mm.h working/include/linux/mm.h
--- base/include/linux/mm.h Sat Jun 17 23:37:03 2000
+++ working/include/linux/mm.h Sun Jun 18 00:23:05 2000
@@ -203,7 +203,7 @@
 #define PageReferenced(page) test_bit(PG_referenced, &(page)->flags)
 #define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags)
 #define ClearPageReferenced(page) clear_bit(PG_referenced, &(page)->flags)
-#define PageTestandClearReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags)
+#define PageTestandClearReferenced(page) test_and_test_and_clear_bit(PG_referenced, &(page)->flags)
 #define PageDecrAfter(page) test_bit(PG_decr_after, &(page)->flags)
 #define SetPageDecrAfter(page) set_bit(PG_decr_after, &(page)->flags)
 #define PageTestandClearDecrAfter(page) test_and_clear_bit(PG_decr_after, &(page)->flags)
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/linux/swap.h working/include/linux/swap.h
--- base/include/linux/swap.h Sat Jun 17 23:37:16 2000
+++ working/include/linux/swap.h Sat Jun 17 23:52:49 2000
@@ -87,6 +87,7 @@
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask);
+extern int memory_pressure(void);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
@@ -173,11 +174,17 @@
 /*
  * Helper macros for lru_pages handling.
  */
-#define lru_cache_add(page) \
+
+#define __lru_cache_add(page) \
 do { \
- spin_lock(&pagemap_lru_lock); \
         list_add(&(page)->lru, &lru_cache); \
         nr_lru_pages++; \
+} while (0)
+
+#define lru_cache_add(page) \
+do { \
+ spin_lock(&pagemap_lru_lock); \
+ __lru_cache_add(page); \
         page->age = PG_AGE_START; \
         ClearPageReferenced(page); \
         SetPageActive(page); \
@@ -187,7 +194,6 @@
 #define __lru_cache_del(page) \
 do { \
         list_del(&(page)->lru); \
- ClearPageActive(page); \
         nr_lru_pages--; \
 } while (0)
 
@@ -196,6 +202,7 @@
         if (!PageLocked(page)) \
                 BUG(); \
         spin_lock(&pagemap_lru_lock); \
+ ClearPageActive(page); \
         __lru_cache_del(page); \
         spin_unlock(&pagemap_lru_lock); \
 } while (0)
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/filemap.c working/mm/filemap.c
--- base/mm/filemap.c Sat Jun 17 23:25:43 2000
+++ working/mm/filemap.c Sun Jun 18 00:36:19 2000
@@ -65,8 +65,8 @@
                 (*p)->pprev_hash = &page->next_hash;
         *p = page;
         page->pprev_hash = p;
- if (page->buffers)
- PAGE_BUG(page);
+// if (page->buffers)
+// PAGE_BUG(page);
 }
 
 static inline void remove_page_from_hash_queue(struct page * page)
@@ -102,6 +102,7 @@
         if (page->buffers)
                 BUG();
 
+ ClearPageDirty(page);
         remove_page_from_inode_queue(page);
         remove_page_from_hash_queue(page);
         page->mapping = NULL;
@@ -294,36 +295,55 @@
         spin_unlock(&pagecache_lock);
 }
 
-/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes. We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
+/**
+ * shrink_mmap - Tries to free memory
+ * @priority: how hard we will try to free pages (0 hardest)
+ * @gfp_mask: Restrictions to free pages
+ *
+ * This function walks the LRU list looking for pages to free. It
+ * returns 1 on success and 0 otherwise. It takes both the
+ * pagemap_lru_lock and the pagecache_lock.
  */
+/* nr_to_examinate is the maximum number of pages we will examine
+ * per call. This guarantees that we never loop forever.
+ */
+/* nr_writes counts the number of writes started so far. We limit
+ * the number of writes in each round to max_page_launder.
+ * ToDo: make that variable tunable through sysctl.
+ */
+const int max_page_launder = 100;
+
 int shrink_mmap(int priority, int gfp_mask)
 {
- int ret = 0, count, nr_dirty;
         struct list_head * page_lru;
         struct page * page = NULL;
-
- count = nr_lru_pages / (priority + 1);
- nr_dirty = priority;
+ int ret;
+ int nr_to_examinate = nr_lru_pages;
+ int nr_writes = 0;
+ int count = nr_lru_pages / (priority + 1);
 
         /* we need pagemap_lru_lock for list_del() ... subtle code below */
         spin_lock(&pagemap_lru_lock);
         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+ /* We exit if we have examined all the LRU pages */
+ if(!nr_to_examinate--)
+ break;
+
+ /* if no zone is low on memory, stop walking the LRU */
+ if(!memory_pressure())
+ break;
+
                 page = list_entry(page_lru, struct page, lru);
- list_del(page_lru);
+ __lru_cache_del(page);
 
                 if (PageTestandClearReferenced(page)) {
- page->age += PG_AGE_ADV;
- if (page->age > PG_AGE_MAX)
- page->age = PG_AGE_MAX;
- goto dispose_continue;
+ page->age = min(PG_AGE_MAX, page->age + PG_AGE_ADV);
+ goto reinsert_page_continue;
                 }
                 page->age -= min(PG_AGE_DECL, page->age);
 
                 if (page->age)
- goto dispose_continue;
+ goto reinsert_page_continue;
 
                 count--;
                 /*
@@ -331,16 +351,18 @@
                  * immediate tell are untouchable..
                  */
                 if (!page->buffers && page_count(page) > 1)
- goto dispose_continue;
+ goto reinsert_page_continue;
 
                 if (TryLockPage(page))
- goto dispose_continue;
+ goto reinsert_page_continue;
 
- /* Release the pagemap_lru lock even if the page is not yet
- queued in any lru queue since we have just locked down
- the page so nobody else may SMP race with us running
- a lru_cache_del() (lru_cache_del() always run with the
- page locked down ;). */
+ /*
+ * Release the pagemap_lru lock even if the page is
+ * not yet queued in any lru queue since we have just
+ * locked down the page so nobody else may SMP race
+ * with us running a lru_cache_del() (lru_cache_del()
+ * always run with the page locked down ;).
+ */
                 spin_unlock(&pagemap_lru_lock);
 
                 /* avoid freeing the page while it's locked */
@@ -351,14 +373,17 @@
                  * of zone - it's old.
                  */
                 if (page->buffers) {
- int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
- if (!try_to_free_buffers(page, wait))
+ if (nr_writes < max_page_launder) {
+ nr_writes++;
+ if (!try_to_free_buffers(page, 0))
+ goto unlock_continue;
+ /* page was locked, inode can't go away under us */
+ if (!page->mapping) {
+ atomic_dec(&buffermem_pages);
+ goto made_buffer_progress;
+ }
+ } else
                                 goto unlock_continue;
- /* page was locked, inode can't go away under us */
- if (!page->mapping) {
- atomic_dec(&buffermem_pages);
- goto made_buffer_progress;
- }
                 }
 
                 /*
@@ -371,10 +396,13 @@
                         goto unlock_continue;
                 }
 
- /* Take the pagecache_lock spinlock held to avoid
- other tasks to notice the page while we are looking at its
- page count. If it's a pagecache-page we'll free it
- in one atomic transaction after checking its page count. */
+ /*
+ * Take the pagecache_lock spinlock held to avoid
+ * other tasks to notice the page while we are
+ * looking at its page count. If it's a
+ * pagecache-page we'll free it in one atomic
+ * transaction after checking its page count.
+ */
                 spin_lock(&pagecache_lock);
 
                 /*
@@ -396,14 +424,15 @@
                                 goto made_inode_progress;
                         }
                         /* PageDeferswap -> we swap out the page now. */
- if (gfp_mask & __GFP_IO) {
+ if ((gfp_mask & __GFP_IO) && (nr_writes < max_page_launder)) {
                                 spin_unlock(&pagecache_lock);
+ nr_writes++;
                                 /* Do NOT unlock the page ... brw_page does. */
                                 ClearPageDirty(page);
                                 rw_swap_page(WRITE, page, 0);
                                 spin_lock(&pagemap_lru_lock);
                                 page_cache_release(page);
- goto dispose_continue;
+ goto reinsert_page_continue;
                         }
                         goto cache_unlock_continue;
                 }
@@ -426,23 +455,23 @@
                 spin_lock(&pagemap_lru_lock);
                 UnlockPage(page);
                 page_cache_release(page);
-dispose_continue:
- list_add(page_lru, &lru_cache);
+reinsert_page_continue:
+ __lru_cache_add(page);
         }
+ spin_unlock(&pagemap_lru_lock);
+ ret = 0;
         goto out;
 
 made_inode_progress:
         page_cache_release(page);
 made_buffer_progress:
+ ClearPageActive(page);
         UnlockPage(page);
         page_cache_release(page);
         ret = 1;
- spin_lock(&pagemap_lru_lock);
- /* nr_lru_pages needs the spinlock */
- nr_lru_pages--;
-
 out:
- spin_unlock(&pagemap_lru_lock);
+ if ((gfp_mask & __GFP_IO) && (nr_writes >= max_page_launder))
+ block_sync_page(page);
 
         return ret;
 }
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/swap_state.c working/mm/swap_state.c
--- base/mm/swap_state.c Sat Jun 17 23:25:43 2000
+++ working/mm/swap_state.c Sat Jun 17 23:52:49 2000
@@ -73,7 +73,6 @@
                 PAGE_BUG(page);
 
         PageClearSwapCache(page);
- ClearPageDirty(page);
         remove_inode_page(page);
 }
 
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/vmscan.c working/mm/vmscan.c
--- base/mm/vmscan.c Sat Jun 17 23:51:24 2000
+++ working/mm/vmscan.c Sun Jun 18 00:28:12 2000
@@ -179,16 +179,14 @@
 
         /* Add it to the swap cache */
         add_to_swap_cache(page, entry);
+ set_pte(page_table, swp_entry_to_pte(entry));
 
         /* Put the swap entry into the pte after the page is in swapcache */
         vma->vm_mm->rss--;
- set_pte(page_table, swp_entry_to_pte(entry));
         flush_tlb_page(vma, address);
         vmlist_access_unlock(vma->vm_mm);
 
- /* OK, do a physical asynchronous write to swap. */
- // rw_swap_page(WRITE, page, 0);
- /* Let shrink_mmap handle this swapout. */
+ /* Set page for deferred swap */
         SetPageDirty(page);
         UnlockPage(page);
 
@@ -427,6 +425,32 @@
         return __ret;
 }
 
+/**
+ * memory_pressure - Is the system under memory pressure
+ *
+ * Returns 1 if the system is low on memory in any of its zones,
+ * otherwise returns 0.
+ */
+int memory_pressure(void)
+{
+ pg_data_t *pgdat = pgdat_list;
+
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ if (!zone->size || !zone->zone_wake_kswapd)
+ continue;
+ if (zone->free_pages < zone->pages_low)
+ return 1;
+ }
+ pgdat = pgdat->node_next;
+
+ } while (pgdat);
+
+ return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -444,7 +468,6 @@
         int priority;
         int count = FREE_COUNT;
         int swap_count = 0;
- int ret = 0;
 
         /* Always trim SLAB caches when memory gets low. */
         kmem_cache_reap(gfp_mask);
@@ -452,11 +475,12 @@
         priority = 64;
         do {
                 while (shrink_mmap(priority, gfp_mask)) {
- ret = 1;
                         if (!--count)
                                 goto done;
                 }
 
+ if(!memory_pressure())
+ return 1;
 
                 /* Try to get rid of some shared memory pages.. */
                 if (gfp_mask & __GFP_IO) {
@@ -468,11 +492,9 @@
                         count -= shrink_dcache_memory(priority, gfp_mask);
                         count -= shrink_icache_memory(priority, gfp_mask);
                         if (count <= 0) {
- ret = 1;
                                 goto done;
                         }
                         while (shm_swap(priority, gfp_mask)) {
- ret = 1;
                                 if (!--count)
                                         goto done;
                         }
@@ -496,18 +518,19 @@
                         if (--swap_count < 0)
                                 break;
                 }
+ if(!memory_pressure())
+ return 1;
 
         } while (--priority >= 0);
 
         /* Always end on a shrink_mmap.. */
         while (shrink_mmap(0, gfp_mask)) {
- ret = 1;
                 if (!--count)
                         goto done;
         }
 
 done:
- return ret;
+ return (count < FREE_COUNT);
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
@@ -549,26 +572,14 @@
         tsk->flags |= PF_MEMALLOC;
 
         for (;;) {
- pg_data_t *pgdat;
- int something_to_do = 0;
+ int pressure = memory_pressure();
 
- pgdat = pgdat_list;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- if (tsk->need_resched)
- schedule();
- if (!zone->size || !zone->zone_wake_kswapd)
- continue;
- if (zone->free_pages < zone->pages_low)
- something_to_do = 1;
- do_try_to_free_pages(GFP_KSWAPD);
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+ if (tsk->need_resched)
+ schedule();
 
- if (!something_to_do) {
+ if(pressure)
+ do_try_to_free_pages(GFP_KSWAPD);
+ else {
                         tsk->state = TASK_INTERRUPTIBLE;
                         interruptible_sleep_on(&kswapd_wait);
                 }

-- 
In theory, practice and theory are the same, but in practice they 
are different -- Larry McVoy
