Re: buffer/swapping in pre-patch-2.0.31-2 + the 17th July patch

Dr. Werner Fink (werner@suse.de)
Mon, 21 Jul 1997 22:15:37 +0200


> >looks like I'm catching this one:
>
> >Jul 19 16:07:15 arak kernel: try_to_free_page: free pages 1008kB, async pages 0kB
> >Jul 19 16:08:56 arak kernel: try_to_free_page: free pages 1020kB, async pages 12kB
> >Jul 19 16:12:04 arak kernel: try_to_free_page: free pages 1012kB, async pages 4kB

> Same here:
>
> Jul 21 07:57:25 smithers kernel: try_to_free_page: free pages 776kB, async pages 0kB
> Jul 21 07:57:25 smithers kernel: Appletalk 0.17 for Linux NET3.035
> Jul 21 09:15:08 smithers kernel: try_to_free_page: free pages 396kB, async pages 0kB
> Jul 21 09:15:09 smithers kernel: try_to_free_page: free pages 384kB, async pages 0kB
> Jul 21 09:15:29 smithers kernel: try_to_free_page: free pages 436kB, async pages 0kB
> Jul 21 09:15:30 smithers kernel: try_to_free_page: free pages 448kB, async pages 0kB

Hmmm ... interesting. Please try the appended patch. It contains a few
changes:

* Wake the buffer wait queue from brw_page() only if a process
  sleeping in get_more_buffer_heads() has run
  recover_reusable_buffer_heads() and is waiting for more IO to
  complete (tracked by the new wakeup_on_io flag; a sketch of the
  recovery pattern follows this list).

* Set PAGE_AGE_VALUE to PAGE_INITIAL_AGE+PAGE_ADVANCE in
  linux/include/linux/pagemap.h ... this should avoid over-aged
  cache pages.
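
For anyone reading the patch without the surrounding source: the
reuse_list recovery relies on the fact that interrupt handlers only
ever _add_ buffer heads to the list and never remove them, so the
consumer can detach the whole chain with a single atomic exchange and
walk it without locking. Below is a minimal userspace analogue of
that pattern, using C11 atomics in place of the kernel's xchg(); the
names and the CAS-based push are illustrative, not code from the
patch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct buffer_head's free-list linkage. */
struct node {
	struct node *next;
	int id;
};

/* Producers (the "interrupt" side) only ever push onto this list. */
static _Atomic(struct node *) reuse_list = NULL;

/* Analogue of a driver's end_request routine putting a finished
 * IO buffer head on reuse_list. */
static void reuse_push(struct node *n)
{
	struct node *old = atomic_load(&reuse_list);
	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&reuse_list, &old, n));
}

/* Analogue of recover_reusable_buffer_heads(): one exchange
 * detaches the whole chain, so concurrent pushes never race
 * against removals. */
static void recover_reusable(void)
{
	struct node *head = atomic_exchange(&reuse_list, NULL);

	while (head) {
		struct node *n = head;
		head = head->next;
		printf("recovered node %d\n", n->id);
		free(n);	/* kernel: put_unused_buffer_head(bh) */
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		reuse_push(n);
	}
	recover_reusable();
	return 0;
}

The one-way traffic is the whole trick: since nothing removes entries
except the exchange itself, the drained chain is private to the
caller the moment the exchange returns.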

Do the bonnie results show any differences? Is the debugging printk
in try_to_free_page() still triggered?

Werner

-------------------------------------------------------------------------
diff -urN linux-2.0.31-2-davem/fs/buffer.c linux/fs/buffer.c
--- linux-2.0.31-2-davem/fs/buffer.c Fri Jul 18 11:08:48 1997
+++ linux/fs/buffer.c Mon Jul 21 21:05:53 1997
@@ -56,6 +56,7 @@
static struct buffer_head * reuse_list = NULL;
static struct wait_queue * buffer_wait = NULL;

+static int wakeup_on_io = 0;
static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
static int nr_buffer_heads = 0;
@@ -548,7 +549,7 @@

if (mem_map[MAP_NR((unsigned long) bh->b_data)].count != 1 ||
buffer_dirty(bh)) {
- refile_buffer(bh);
+ /* WSH: don't attempt to refile here! */
return 0;
}

@@ -669,12 +670,15 @@
};
}

- /* and repeat until we find something good */
- if (grow_buffers(GFP_ATOMIC, size))
- needed -= PAGE_SIZE;
- else
- wakeup_bdflush(1);
- goto repeat;
+ if (nr_free_pages > 5) {
+ /* and repeat until we find something good */
+ if (grow_buffers(GFP_ATOMIC, size)) {
+ needed -= PAGE_SIZE;
+ goto repeat;
+ };
+ }
+
+ wakeup_bdflush(1);
}

/*
@@ -922,6 +926,34 @@
wake_up(&buffer_wait);
}

+/*
+ * We can't put completed temporary IO buffer_heads directly onto the
+ * unused_list when they become unlocked, since the device driver
+ * end_request routines still expect access to the buffer_head's
+ * fields after the final unlock. So, the device driver puts them on
+ * the reuse_list instead once IO completes, and we recover these to
+ * the unused_list here.
+ *
+ * The reuse_list receives buffers from interrupt routines, so we need
+ * to be IRQ-safe here (but note that interrupts only _add_ to the
+ * reuse_list, never take away. So we don't need to worry about the
+ * reuse_list magically emptying).
+ */
+static inline void recover_reusable_buffer_heads(void)
+{
+ if (reuse_list) {
+ struct buffer_head *head;
+
+ head = xchg(&reuse_list, NULL);
+
+ do {
+ struct buffer_head *bh = head;
+ head = head->b_next_free;
+ put_unused_buffer_head(bh);
+ } while (head);
+ }
+}
+
static void get_more_buffer_heads(void)
{
struct buffer_head * bh;
@@ -949,38 +981,15 @@
*/
run_task_queue(&tq_disk);
sleep_on(&buffer_wait);
+ /*
+ * After we wake up, check for released async buffer heads.
+ */
+ recover_reusable_buffer_heads();
+ wakeup_on_io = 1;
}

}

-/*
- * We can't put completed temporary IO buffer_heads directly onto the
- * unused_list when they become unlocked, since the device driver
- * end_request routines still expect access to the buffer_head's
- * fields after the final unlock. So, the device driver puts them on
- * the reuse_list instead once IO completes, and we recover these to
- * the unused_list here.
- *
- * The reuse_list receives buffers from interrupt routines, so we need
- * to be IRQ-safe here (but note that interrupts only _add_ to the
- * reuse_list, never take away. So we don't need to worry about the
- * reuse_list magically emptying).
- */
-static inline void recover_reusable_buffer_heads(void)
-{
- if (reuse_list) {
- struct buffer_head *head;
-
- head = xchg(&reuse_list, NULL);
-
- do {
- struct buffer_head *bh = head;
- head = head->b_next_free;
- put_unused_buffer_head(bh);
- } while (head);
- }
-}
-
static struct buffer_head * get_unused_buffer_head(void)
{
struct buffer_head * bh;
@@ -1091,6 +1100,7 @@
* They do _not_ show up in the buffer hash table!
* They are _not_ registered in page->buffers either!
*/
+ wakeup_on_io = 0;
bh = create_buffers(page_address(page), size);
if (!bh) {
clear_bit(PG_locked, &page->flags);
@@ -1161,6 +1171,8 @@
free_async_buffers(bh);
restore_flags(flags);
after_unlock_page(page);
+ if (wakeup_on_io)
+ wake_up(&buffer_wait);
}
++current->maj_flt;
return 0;
@@ -1534,6 +1546,7 @@
next->b_count--;
}
}
+ run_task_queue(&tq_disk);
#ifdef DEBUG
if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
printk("Wrote %d/%d buffers\n", nwritten, ndirty);
diff -urN linux-2.0.31-2-davem/include/linux/pagemap.h linux/include/linux/pagemap.h
--- linux-2.0.31-2-davem/include/linux/pagemap.h Sat Mar 29 01:08:17 1997
+++ linux/include/linux/pagemap.h Mon Jul 21 20:05:20 1997
@@ -11,6 +11,7 @@

#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/swapctl.h>

static inline unsigned long page_address(struct page * page)
{
@@ -20,7 +21,7 @@
#define PAGE_HASH_BITS 11
#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)

-#define PAGE_AGE_VALUE 16
+#define PAGE_AGE_VALUE ((PAGE_INITIAL_AGE)+(PAGE_ADVANCE))

extern unsigned long page_cache_size; /* # of pages currently in the hash table */
extern struct page * page_hash_table[PAGE_HASH_SIZE];
diff -urN linux-2.0.31-2-davem/ipc/shm.c linux/ipc/shm.c
--- linux-2.0.31-2-davem/ipc/shm.c Fri Nov 22 15:25:18 1996
+++ linux/ipc/shm.c Mon Jul 21 20:22:08 1997
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/malloc.h>
#include <linux/swap.h>
+#include <linux/swapctl.h>

#include <asm/segment.h>
#include <asm/pgtable.h>
@@ -672,6 +673,11 @@
shm_swp--;
}
shm_rss++;
+
+ /* Give the physical reallocated page a bigger start */
+ if (shm_rss < (MAP_NR(high_memory) >> 3))
+ mem_map[MAP_NR(page)].age = (PAGE_INITIAL_AGE + PAGE_ADVANCE);
+
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
shp->shm_pages[idx] = pte_val(pte);
} else
diff -urN linux-2.0.31-2-davem/mm/filemap.c linux/mm/filemap.c
--- linux-2.0.31-2-davem/mm/filemap.c Fri Jul 18 11:08:53 1997
+++ linux/mm/filemap.c Fri Jul 18 11:11:53 1997
@@ -450,7 +450,7 @@

#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)

-#if 0 /* small readahead */
+#ifdef CONFIG_READA_SMALL /* small readahead */
#define MAX_READAHEAD PageAlignSize(4096*7)
#define MIN_READAHEAD PageAlignSize(4096*2)
#else /* large readahead */
diff -urN linux-2.0.31-2-davem/mm/mlock.c linux/mm/mlock.c
--- linux-2.0.31-2-davem/mm/mlock.c Wed Sep 11 16:57:19 1996
+++ linux/mm/mlock.c Fri Jul 18 11:11:53 1997
@@ -202,7 +202,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (locked > MAP_NR(high_memory)/2)
+ if (locked > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlock(start, len, 1);
@@ -259,7 +259,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (current->mm->total_vm > MAP_NR(high_memory)/2)
+ if (current->mm->total_vm > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlockall(flags);
diff -urN linux-2.0.31-2-davem/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.0.31-2-davem/mm/page_alloc.c Sat Aug 17 20:19:29 1996
+++ linux/mm/page_alloc.c Mon Jul 21 20:21:05 1997
@@ -264,11 +264,11 @@

/*
* select nr of pages we try to keep free for important stuff
- * with a minimum of 16 pages. This is totally arbitrary
+ * with a minimum of 24 pages. This is totally arbitrary
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 16)
- i = 16;
+ if (i < 24)
+ i = 24;
min_free_pages = i;
free_pages_low = i + (i>>1);
free_pages_high = i + i;
@@ -311,7 +311,8 @@
unsigned long page = __get_free_page(GFP_KERNEL);

if (pte_val(*page_table) != entry) {
- free_page(page);
+ if (page)
+ free_page(page);
return;
}
if (!page) {
@@ -327,6 +328,11 @@
}
vma->vm_mm->rss++;
tsk->maj_flt++;
+
+ /* Give the physical reallocated page a bigger start */
+ if (vma->vm_mm->rss < (MAP_NR(high_memory) >> 2))
+ mem_map[MAP_NR(page)].age = (PAGE_INITIAL_AGE + PAGE_ADVANCE);
+
if (!write_access && add_to_swap_cache(MAP_NR(page), entry)) {
/* keep swap page allocated for the moment (swap cache) */
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
diff -urN linux-2.0.31-2-davem/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.0.31-2-davem/mm/vmscan.c Sat Dec 14 13:24:31 1996
+++ linux/mm/vmscan.c Fri Jul 18 11:33:42 1997
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/swapctl.h>
+#include <linux/pagemap.h>
#include <linux/smp_lock.h>

#include <asm/dma.h>
@@ -32,6 +33,13 @@
*/
static int next_swap_jiffies = 0;

+/*
+ * Was the last kswapd wakeup caused by
+ * nr_free_pages < free_pages_low
+ */
+static int last_wakeup_low = 0;
+
+
/*
* How often do we do a pageout scan during normal conditions?
* Default is four times a second.
@@ -330,7 +338,7 @@
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
-int try_to_free_page(int priority, int dma, int wait)
+static inline int do_try_to_free_page(int priority, int dma, int wait)
{
static int state = 0;
int i=6;
@@ -343,23 +351,48 @@
switch (state) {
do {
case 0:
+ barrier();
if (shrink_mmap(i, dma))
return 1;
state = 1;
+ barrier();
case 1:
+ barrier();
if (shm_swap(i, dma))
return 1;
state = 2;
+ barrier();
default:
+ barrier();
if (swap_out(i, dma, wait))
return 1;
state = 0;
+ barrier();
i--;
} while ((i - stop) >= 0);
}
return 0;
}

+int try_to_free_page(int priority, int dma, int wait)
+{
+ int retval, run_dtq = 0;
+
+repeat:
+ retval = do_try_to_free_page(priority,dma,wait);
+ if (retval)
+ return retval;
+ if (wait && !dma && !kswapd_awake) {
+ printk("try_to_free_page: free pages %6dkB, async pages %6dkB\n",
+ nr_free_pages<<(PAGE_SHIFT-10), nr_async_pages<<(PAGE_SHIFT-10));
+ if (run_dtq)
+ return retval;
+ run_task_queue(&tq_disk);
+ run_dtq++;
+ goto repeat;
+ }
+ return retval;
+}

/*
* The background pageout daemon.
@@ -402,6 +435,9 @@
printk ("Started kswapd v%.*s\n", i, s);

while (1) {
+ /* low on memory, we need to start swapping soon */
+ next_swap_jiffies = jiffies +
+ (last_wakeup_low ? swapout_interval >> 1 : swapout_interval);
kswapd_awake = 0;
current->signal = 0;
run_task_queue(&tq_disk);
@@ -410,7 +446,8 @@
swapstats.wakeups++;
/* Do the background pageout: */
for (i=0; i < kswapd_ctl.maxpages; i++)
- try_to_free_page(GFP_KERNEL, 0, 0);
+ try_to_free_page(GFP_KERNEL, 0,
+ (nr_free_pages < min_free_pages));
}
}

@@ -421,16 +458,15 @@
void swap_tick(void)
{
int want_wakeup = 0;
- static int last_wakeup_low = 0;

if ((nr_free_pages + nr_async_pages) < free_pages_low) {
if (last_wakeup_low)
- want_wakeup = jiffies >= next_swap_jiffies;
+ want_wakeup = (jiffies >= next_swap_jiffies);
else
last_wakeup_low = want_wakeup = 1;
}
else if (((nr_free_pages + nr_async_pages) < free_pages_high) &&
- jiffies >= next_swap_jiffies) {
+ (jiffies >= next_swap_jiffies)) {
last_wakeup_low = 0;
want_wakeup = 1;
}
@@ -440,7 +476,6 @@
wake_up(&kswapd_wait);
need_resched = 1;
}
- next_swap_jiffies = jiffies + swapout_interval;
}
timer_active |= (1<<SWAP_TIMER);
}
diff -urN linux-2.0.31-2-davem/net/ipx/af_ipx.c linux/net/ipx/af_ipx.c
--- linux-2.0.31-2-davem/net/ipx/af_ipx.c Fri Jul 18 11:08:54 1997
+++ linux/net/ipx/af_ipx.c Fri Jul 18 11:11:53 1997
@@ -1776,6 +1776,7 @@
}
sk->rcvbuf=SK_RMEM_MAX;
sk->sndbuf=SK_WMEM_MAX;
+ sk->allocation=GFP_KERNEL;
sk->prot=NULL; /* So we use default free mechanisms */
skb_queue_head_init(&sk->receive_queue);
skb_queue_head_init(&sk->write_queue);