Hi,
this patch (against 2.4.0-test1-ac7) fixes the last balancing
problems with virtual memory. It adds two negative feedback
loops: one in __alloc_pages, to make sure kswapd is woken up
often enough but not too often, and another in
do_try_to_free_pages, to balance the amount of memory freed
against the number of pages unmapped to "generate" more
freeable memory. This one seems to really work, but of course
I'm interested in feedback ;)
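
To show the first loop outside of kernel context, here is a small
standalone C toy (not part of the patch; the names kswapd_pause,
pages_low and pages_min mirror the patch, but the single zone and all
the numbers are made up). The wakeup interval grows a little every
time "kswapd" is woken and shrinks whenever the zone is found short
of memory, so it settles around whatever rate keeps the zone above
its watermark:

#include <stdio.h>

#define HZ 100

int main(void)
{
	int kswapd_pause = HZ;		/* jiffies between kswapd wakeups */
	long last_woke_kswapd = 0;
	int free_pages = 600;
	const int pages_low = 256;	/* zone low watermark */
	const int pages_min = 64;	/* zone "critical" watermark */

	for (long jiffies = 0; jiffies < 5000; jiffies++) {
		free_pages -= 2;	/* steady allocation pressure */

		/* wake "kswapd" only if the pause interval has expired */
		if (free_pages < pages_low &&
		    jiffies > last_woke_kswapd + kswapd_pause) {
			kswapd_pause++;		/* woken up: back off a bit */
			last_woke_kswapd = jiffies;
			free_pages += 150;	/* kswapd frees some memory */
		}

		/* zone still low on memory: wake kswapd sooner next time */
		if (free_pages < pages_low && kswapd_pause > 0)
			kswapd_pause--;

		/* everything critical: wake kswapd much more aggressively */
		if (free_pages < pages_min) {
			kswapd_pause /= 2;
			free_pages += 150;	/* synchronous freeing */
		}

		if (jiffies % 500 == 0)
			printf("t=%4ld free=%4d pause=%3d\n",
			       jiffies, free_pages, kswapd_pause);
	}
	return 0;
}

With these invented numbers the pause drifts toward the interval at
which the simulated kswapd frees pages about as fast as they are
consumed, which is the behaviour the __alloc_pages change aims for.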
regards,
Rik
--
The Internet is not a network of computers. It is a network of people.
That is its real strength.

Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/		http://www.surriel.com/
--- linux-2.4.0-t1-ac7/fs/buffer.c.orig	Thu Jun 1 10:37:59 2000
+++ linux-2.4.0-t1-ac7/fs/buffer.c	Thu Jun 1 14:51:14 2000
@@ -1868,6 +1868,7 @@
 	}
 	spin_unlock(&unused_list_lock);
+	wake_up(&buffer_wait);
 
 	return iosize;
 }
 
@@ -2004,6 +2005,8 @@
 		__put_unused_buffer_head(bh[bhind]);
 	}
 	spin_unlock(&unused_list_lock);
+	wake_up(&buffer_wait);
+
 	goto finished;
 }
 
@@ -2181,6 +2184,12 @@
 }
 
 /*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh)	(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
  * Sync all the buffers on one page..
  *
  * If we have old buffers that are locked, we'll
@@ -2190,7 +2199,7 @@
  * This all is required so that we can free up memory
  * later.
  */
-static void sync_page_buffers(struct buffer_head *bh, int wait)
+static int sync_page_buffers(struct buffer_head *bh, int wait)
 {
 	struct buffer_head * tmp = bh;
 
@@ -2203,13 +2212,17 @@
 		} else if (buffer_dirty(p))
 			ll_rw_block(WRITE, 1, &p);
 	} while (tmp != bh);
-}
 
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh)	(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+	do {
+		struct buffer_head *p = tmp;
+		tmp = tmp->b_this_page;
+		if (buffer_busy(p))
+			return 0;
+	} while (tmp != bh);
+
+	/* Success. Now try_to_free_buffers can free the page. */
+	return 1;
+}
 
 /*
  * try_to_free_buffers() checks if all the buffers on this particular page
@@ -2227,6 +2240,7 @@
 	struct buffer_head * tmp, * bh = page->buffers;
 	int index = BUFSIZE_INDEX(bh->b_size);
 
+again:
 	spin_lock(&lru_list_lock);
 	write_lock(&hash_table_lock);
 	spin_lock(&free_list[index].lock);
@@ -2272,7 +2286,8 @@
 	spin_unlock(&free_list[index].lock);
 	write_unlock(&hash_table_lock);
 	spin_unlock(&lru_list_lock);
-	sync_page_buffers(bh, wait);
+	if (sync_page_buffers(bh, wait))
+		goto again;
 	return 0;
 }
 
--- linux-2.4.0-t1-ac7/mm/vmscan.c.orig	Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/vmscan.c	Sat Jun 3 10:29:54 2000
@@ -439,12 +439,12 @@
  * latency.
  */
 #define FREE_COUNT	8
-#define SWAP_COUNT	16
 static int do_try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
 	int count = FREE_COUNT;
-	int swap_count;
+	int swap_count = 0;
+	int ret = 0;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
@@ -452,6 +452,7 @@
 	priority = 64;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
+			ret = 1;
 			if (!--count)
 				goto done;
 		}
@@ -466,9 +467,12 @@
 			 */
 			count -= shrink_dcache_memory(priority, gfp_mask);
 			count -= shrink_icache_memory(priority, gfp_mask);
-			if (count <= 0)
+			if (count <= 0) {
+				ret = 1;
 				goto done;
+			}
 			while (shm_swap(priority, gfp_mask)) {
+				ret = 1;
 				if (!--count)
 					goto done;
 			}
@@ -480,24 +484,30 @@
 		 * This will not actually free any pages (they get
 		 * put in the swap cache), so we must not count this
 		 * as a "count" success.
+		 *
+		 * The amount we page out is the amount of pages we're
+		 * short freeing, amplified by the number of times we
+		 * failed above. This generates a negative feedback loop:
+		 * the more difficult it was to free pages, the easier we
+		 * will make it.
 		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
+		swap_count += count;
+		while (swap_out(priority, gfp_mask)) {
 			if (--swap_count < 0)
 				break;
+		}
 
 	} while (--priority >= 0);
 
 	/* Always end on a shrink_mmap.. */
 	while (shrink_mmap(0, gfp_mask)) {
+		ret = 1;
 		if (!--count)
 			goto done;
 	}
 
-	/* We return 1 if we are freed some page */
-	return (count != FREE_COUNT);
 done:
-	return 1;
+	return ret;
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
--- linux-2.4.0-t1-ac7/mm/page_alloc.c.orig	Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/page_alloc.c	Fri Jun 2 15:29:21 2000
@@ -222,6 +222,9 @@
 {
 	zone_t **zone = zonelist->zones;
 	extern wait_queue_head_t kswapd_wait;
+	static int last_woke_kswapd;
+	static int kswapd_pause = HZ;
+	int gfp_mask = zonelist->gfp_mask;
 
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
@@ -248,14 +251,28 @@
 		}
 	}
 
-	/* All zones are in need of kswapd. */
-	if (waitqueue_active(&kswapd_wait))
+	/*
+	 * Kswapd should be freeing enough memory to satisfy all allocations
+	 * immediately. Calling try_to_free_pages from processes will slow
+	 * down the system a lot. On the other hand, waking up kswapd too
+	 * often means wasted memory and cpu time.
+	 *
+	 * We tune the kswapd pause interval in such a way that kswapd is
+	 * always just agressive enough to free the amount of memory we
+	 * want freed.
+	 */
+	if (waitqueue_active(&kswapd_wait) &&
+	    time_after(jiffies, last_woke_kswapd + kswapd_pause)) {
+		kswapd_pause++;
+		last_woke_kswapd = jiffies;
 		wake_up_interruptible(&kswapd_wait);
+	}
 
 	/*
 	 * Ok, we don't have any zones that don't need some
 	 * balancing.. See if we have any that aren't critical..
 	 */
+again:
 	zone = zonelist->zones;
 	for (;;) {
 		zone_t *z = *(zone++);
@@ -267,16 +284,29 @@
 			z->low_on_memory = 1;
 			if (page)
 				return page;
+		} else {
+			if (kswapd_pause > 0)
+				kswapd_pause--;
 		}
 	}
 
+	/* We didn't kick kswapd often enough... */
+	kswapd_pause /= 2;
+	if (waitqueue_active(&kswapd_wait))
+		wake_up_interruptible(&kswapd_wait);
+	/* If we're low priority, we just wait a bit and try again later. */
+	if ((gfp_mask & __GFP_WAIT) && current->need_resched &&
+			current->state == TASK_RUNNING) {
+		schedule();
+		goto again;
+	}
+
 	/*
 	 * Uhhuh. All the zones have been critical, which means that
 	 * we'd better do some synchronous swap-out. kswapd has not
 	 * been able to cope..
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int gfp_mask = zonelist->gfp_mask;
 		if (!try_to_free_pages(gfp_mask)) {
 			if (!(gfp_mask & __GFP_HIGH))
 				goto fail;
@@ -303,7 +333,6 @@
 	zone = zonelist->zones;
 	for (;;) {
 		zone_t *z = *(zone++);
-		int gfp_mask = zonelist->gfp_mask;
 		if (!z)
 			break;
 		if (z->free_pages > z->pages_min) {
--- linux-2.4.0-t1-ac7/mm/filemap.c.orig	Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/filemap.c	Fri Jun 2 15:42:25 2000
@@ -334,13 +334,6 @@
 		count--;
 
 		/*
-		 * Page is from a zone we don't care about.
-		 * Don't drop page cache entries in vain.
-		 */
-		if (page->zone->free_pages > page->zone->pages_high)
-			goto dispose_continue;
-
-		/*
 		 * Avoid unscalable SMP locking for pages we can
 		 * immediate tell are untouchable..
 		 */
@@ -375,6 +368,13 @@
 			}
 		}
 
+		/*
+		 * Page is from a zone we don't care about.
+		 * Don't drop page cache entries in vain.
+		 */
+		if (page->zone->free_pages > page->zone->pages_high)
+			goto unlock_continue;
+
 		/* Take the pagecache_lock spinlock held to avoid other tasks
 		   to notice the page while we are looking at its page count.
 		   If it's a pagecache-page we'll free it
@@ -400,8 +400,15 @@
 			goto made_inode_progress;
 		}
 		/* PageDeferswap -> we swap out the page now. */
-		if (gfp_mask & __GFP_IO)
-			goto async_swap_continue;
+		if (gfp_mask & __GFP_IO) {
+			spin_unlock(&pagecache_lock);
+			/* Do NOT unlock the page ... brw_page does. */
+			ClearPageDirty(page);
+			rw_swap_page(WRITE, page, 0);
+			spin_lock(&pagemap_lru_lock);
+			page_cache_release(page);
+			goto dispose_continue;
+		}
 		goto cache_unlock_continue;
 	}
 
@@ -422,14 +429,6 @@
 unlock_continue:
 	spin_lock(&pagemap_lru_lock);
 	UnlockPage(page);
-	page_cache_release(page);
-	goto dispose_continue;
-async_swap_continue:
-	spin_unlock(&pagecache_lock);
-	/* Do NOT unlock the page ... that is done after IO. */
-	ClearPageDirty(page);
-	rw_swap_page(WRITE, page, 0);
-	spin_lock(&pagemap_lru_lock);
 	page_cache_release(page);
 dispose_continue:
 	list_add(page_lru, &lru_cache);
--- linux-2.4.0-t1-ac7/include/linux/swap.h.orig	Wed May 31 21:00:06 2000
+++ linux-2.4.0-t1-ac7/include/linux/swap.h	Thu Jun 1 11:51:25 2000
@@ -166,7 +166,7 @@
  * The 2.4 code, however, is mostly simple and stable ;)
  */
 #define PG_AGE_MAX	64
-#define PG_AGE_START	5
+#define PG_AGE_START	2
 #define PG_AGE_ADV	3
 #define PG_AGE_DECL	1
 
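
For completeness, a similarly stripped-down toy of the accounting the
do_try_to_free_pages() change introduces (again standalone, with
invented numbers, and only pretending that shrink_mmap() and friends
free one page per pass): every priority pass that still leaves us
short adds the remaining shortfall to swap_count, so swap_out() gets a
bigger unmapping budget the harder freeing turned out to be.

#include <stdio.h>

#define FREE_COUNT 8

int main(void)
{
	int count = FREE_COUNT;	/* pages we still want freed */
	int swap_count = 0;	/* pages swap_out() may unmap */

	for (int priority = 64; priority >= 0; priority--) {
		/* pretend one page got freed this pass */
		if (--count <= 0)
			break;

		/* add the remaining shortfall on every failed pass */
		swap_count += count;
		printf("priority %2d: still short %d pages, swap_out budget %d\n",
		       priority, count, swap_count);
	}
	return 0;
}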