Simple fix for swap_out (the 1st August patch)

Dr. Werner Fink (werner@suse.de)
Fri, 1 Aug 1997 16:37:32 +0200


NOTE: The appended patch is against pre-patch-2.0.31-2.

Hi,

A few days ago I wrote:

> swap_out() of mm/vmscan.c:
> In some situations it is possible that we cross all tasks without getting
> an aged page to free. The result of such a situation is that the
> variable loop is set to 1. Under high load this can lead to the
> nasty "Couldn't get free page" on the next try.

That was not true: a stupid claim, but a harmless one. The code from Kai Petzke
is really a good one ... next time I should read code more carefully.

But the problem of coupling the ageing and the freeing of task pages still
exists. By comparing with the old 1.2.13 code, and with a few statistics,
some stress tests, and debugging printks (plus the reports given to this
list), one can find the reason for "Couldn't get free page".
One possible solution that helps highly stressed systems without slowing
down idle ones is given in the appended patch.
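
The core of the vmscan change, condensed here from the swap_out() hunk in the
appended patch (this is only an illustration, not the complete change), is to
replace the fixed right shift of 10 in the scan counter with a shift that
shrinks as the allocation priority drops, so a stressed system scans harder:

	int shfrv;			/* shift for the scan counter */

	switch (priority) {
	case 6: case 5: case 4:		/* be friendly */
		shfrv = 10;
		break;
	case 3: case 2: case 1:		/* more intensive */
		shfrv = 9;
		break;
	case 0: default:		/* sorry, we need a page */
		shfrv = 8;
		break;
	}
	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> shfrv) >> priority;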

Please read the comment in swap_out() of mm/vmscan.c. This change should also
go into 2.1.xx.

Werner

----------------------------------------------------------------------------
diff -urN -x *~1 linux-2.0.31-2-davem/fs/buffer.c linux/fs/buffer.c
--- linux-2.0.31-2-davem/fs/buffer.c Fri Jul 18 11:08:48 1997
+++ linux/fs/buffer.c Tue Jul 22 19:28:06 1997
@@ -548,7 +548,7 @@

if (mem_map[MAP_NR((unsigned long) bh->b_data)].count != 1 ||
buffer_dirty(bh)) {
- refile_buffer(bh);
+ /* WSH: don't attempt to refile here! */
return 0;
}

@@ -566,7 +566,7 @@
for (bh = list;
bh && (*list_len) > 0;
bh = bh->b_next_free, (*list_len)--) {
- if (size != bh->b_size) {
+ if (size != bh->b_size && nr_free_pages < min_free_pages) {
/* this provides a mechanism for freeing blocks
of other sizes, this is necessary now that we
no longer have the lav code. */
@@ -669,12 +669,15 @@
};
}

- /* and repeat until we find something good */
- if (grow_buffers(GFP_ATOMIC, size))
- needed -= PAGE_SIZE;
- else
- wakeup_bdflush(1);
- goto repeat;
+ if (nr_free_pages > 5) {
+ /* and repeat until we find something good */
+ if (grow_buffers(GFP_ATOMIC, size)) {
+ needed -= PAGE_SIZE;
+ goto repeat;
+ };
+ }
+
+ wakeup_bdflush(1);
}

/*
@@ -922,6 +925,34 @@
wake_up(&buffer_wait);
}

+/*
+ * We can't put completed temporary IO buffer_heads directly onto the
+ * unused_list when they become unlocked, since the device driver
+ * end_request routines still expect access to the buffer_head's
+ * fields after the final unlock. So, the device driver puts them on
+ * the reuse_list instead once IO completes, and we recover these to
+ * the unused_list here.
+ *
+ * The reuse_list receives buffers from interrupt routines, so we need
+ * to be IRQ-safe here (but note that interrupts only _add_ to the
+ * reuse_list, never take away. So we don't need to worry about the
+ * reuse_list magically emptying).
+ */
+static inline void recover_reusable_buffer_heads(void)
+{
+ if (reuse_list) {
+ struct buffer_head *head;
+
+ head = xchg(&reuse_list, NULL);
+
+ do {
+ struct buffer_head *bh = head;
+ head = head->b_next_free;
+ put_unused_buffer_head(bh);
+ } while (head);
+ }
+}
+
static void get_more_buffer_heads(void)
{
struct buffer_head * bh;
@@ -949,38 +980,14 @@
*/
run_task_queue(&tq_disk);
sleep_on(&buffer_wait);
+ /*
+ * After we wake up, check for released async buffer heads.
+ */
+ recover_reusable_buffer_heads();
}

}

-/*
- * We can't put completed temporary IO buffer_heads directly onto the
- * unused_list when they become unlocked, since the device driver
- * end_request routines still expect access to the buffer_head's
- * fields after the final unlock. So, the device driver puts them on
- * the reuse_list instead once IO completes, and we recover these to
- * the unused_list here.
- *
- * The reuse_list receives buffers from interrupt routines, so we need
- * to be IRQ-safe here (but note that interrupts only _add_ to the
- * reuse_list, never take away. So we don't need to worry about the
- * reuse_list magically emptying).
- */
-static inline void recover_reusable_buffer_heads(void)
-{
- if (reuse_list) {
- struct buffer_head *head;
-
- head = xchg(&reuse_list, NULL);
-
- do {
- struct buffer_head *bh = head;
- head = head->b_next_free;
- put_unused_buffer_head(bh);
- } while (head);
- }
-}
-
static struct buffer_head * get_unused_buffer_head(void)
{
struct buffer_head * bh;
@@ -1161,6 +1168,8 @@
free_async_buffers(bh);
restore_flags(flags);
after_unlock_page(page);
+ if (waitqueue_active(&buffer_wait))
+ wake_up(&buffer_wait);
}
++current->maj_flt;
return 0;
@@ -1534,6 +1543,7 @@
next->b_count--;
}
}
+ run_task_queue(&tq_disk);
#ifdef DEBUG
if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
printk("Wrote %d/%d buffers\n", nwritten, ndirty);
diff -urN -x *~1 linux-2.0.31-2-davem/include/linux/pagemap.h linux/include/linux/pagemap.h
--- linux-2.0.31-2-davem/include/linux/pagemap.h Sat Mar 29 01:08:17 1997
+++ linux/include/linux/pagemap.h Wed Jul 30 21:27:22 1997
@@ -11,6 +11,7 @@

#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/swapctl.h>

static inline unsigned long page_address(struct page * page)
{
@@ -20,7 +21,7 @@
#define PAGE_HASH_BITS 11
#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)

-#define PAGE_AGE_VALUE 16
+#define PAGE_AGE_VALUE ((PAGE_INITIAL_AGE)+(PAGE_ADVANCE))

extern unsigned long page_cache_size; /* # of pages currently in the hash table */
extern struct page * page_hash_table[PAGE_HASH_SIZE];
diff -urN -x *~1 linux-2.0.31-2-davem/ipc/shm.c linux/ipc/shm.c
--- linux-2.0.31-2-davem/ipc/shm.c Fri Nov 22 15:25:18 1996
+++ linux/ipc/shm.c Mon Jul 21 20:22:08 1997
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/malloc.h>
#include <linux/swap.h>
+#include <linux/swapctl.h>

#include <asm/segment.h>
#include <asm/pgtable.h>
@@ -672,6 +673,11 @@
shm_swp--;
}
shm_rss++;
+
+ /* Give the freshly reallocated physical page a head start */
+ if (shm_rss < (MAP_NR(high_memory) >> 3))
+ mem_map[MAP_NR(page)].age = (PAGE_INITIAL_AGE + PAGE_ADVANCE);
+
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
shp->shm_pages[idx] = pte_val(pte);
} else
diff -urN -x *~1 linux-2.0.31-2-davem/mm/filemap.c linux/mm/filemap.c
--- linux-2.0.31-2-davem/mm/filemap.c Fri Jul 18 11:08:53 1997
+++ linux/mm/filemap.c Fri Jul 25 14:43:26 1997
@@ -102,7 +102,7 @@
/* page wholly truncated - free it */
if (offset >= start) {
if (PageLocked(page)) {
- wait_on_page(page);
+ __wait_on_page(page);
goto repeat;
}
inode->i_nrpages--;
@@ -171,8 +171,12 @@
switch (page->count) {
case 1:
/* If it has been referenced recently, don't free it */
- if (clear_bit(PG_referenced, &page->flags))
+ if (clear_bit(PG_referenced, &page->flags)) {
+ /* age this page, it was potentially in use */
+ if (priority < 4)
+ age_page(page);
break;
+ }

/* is it a page cache page? */
if (page->inode) {
@@ -450,7 +454,7 @@

#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)

-#if 0 /* small readahead */
+#ifdef CONFIG_READA_SMALL /* small readahead */
#define MAX_READAHEAD PageAlignSize(4096*7)
#define MIN_READAHEAD PageAlignSize(4096*2)
#else /* large readahead */
@@ -1013,6 +1017,8 @@
unsigned long page;
int error;

+ if (pte_none(pte))
+ return 0;
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
return 0;
@@ -1025,8 +1031,6 @@
page = pte_page(pte);
mem_map[MAP_NR(page)].count++;
} else {
- if (pte_none(pte))
- return 0;
flush_cache_page(vma, address);
pte_clear(ptep);
flush_tlb_page(vma, address);
diff -urN -x *~1 linux-2.0.31-2-davem/mm/mlock.c linux/mm/mlock.c
--- linux-2.0.31-2-davem/mm/mlock.c Wed Sep 11 16:57:19 1996
+++ linux/mm/mlock.c Fri Jul 18 11:11:53 1997
@@ -202,7 +202,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (locked > MAP_NR(high_memory)/2)
+ if (locked > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlock(start, len, 1);
@@ -259,7 +259,7 @@

/* we may lock at most half of physical memory... */
/* (this check is pretty bogus, but doesn't hurt) */
- if (current->mm->total_vm > MAP_NR(high_memory)/2)
+ if (current->mm->total_vm > (MAP_NR(high_memory) >> 1))
return -ENOMEM;

return do_mlockall(flags);
diff -urN -x *~1 linux-2.0.31-2-davem/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.0.31-2-davem/mm/page_alloc.c Sat Aug 17 20:19:29 1996
+++ linux/mm/page_alloc.c Mon Jul 21 20:21:05 1997
@@ -264,11 +264,11 @@

/*
* select nr of pages we try to keep free for important stuff
- * with a minimum of 16 pages. This is totally arbitrary
+ * with a minimum of 24 pages. This is totally arbitrary
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 16)
- i = 16;
+ if (i < 24)
+ i = 24;
min_free_pages = i;
free_pages_low = i + (i>>1);
free_pages_high = i + i;
@@ -311,7 +311,8 @@
unsigned long page = __get_free_page(GFP_KERNEL);

if (pte_val(*page_table) != entry) {
- free_page(page);
+ if (page)
+ free_page(page);
return;
}
if (!page) {
@@ -327,6 +328,11 @@
}
vma->vm_mm->rss++;
tsk->maj_flt++;
+
+ /* Give the freshly reallocated physical page a head start */
+ if (vma->vm_mm->rss < (MAP_NR(high_memory) >> 2))
+ mem_map[MAP_NR(page)].age = (PAGE_INITIAL_AGE + PAGE_ADVANCE);
+
if (!write_access && add_to_swap_cache(MAP_NR(page), entry)) {
/* keep swap page allocated for the moment (swap cache) */
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
diff -urN -x *~1 linux-2.0.31-2-davem/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.0.31-2-davem/mm/vmscan.c Sat Dec 14 13:24:31 1996
+++ linux/mm/vmscan.c Thu Jul 31 15:14:59 1997
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/swapctl.h>
+#include <linux/pagemap.h>
#include <linux/smp_lock.h>

#include <asm/dma.h>
@@ -27,11 +28,22 @@
#include <asm/bitops.h>
#include <asm/pgtable.h>

+/*
+ * To check memory-consuming code elsewhere, set this to 1
+ */
+#define MM_DEBUG 0
+
/*
* When are we next due for a page scan?
*/
static int next_swap_jiffies = 0;

+/*
+ * Was the last kswapd wakeup caused by
+ * nr_free_pages < free_pages_low
+ */
+static int last_wakeup_low = 0;
+
/*
* How often do we do a pageout scan during normal conditions?
* Default is four times a second.
@@ -275,10 +287,42 @@
static int swap_out(unsigned int priority, int dma, int wait)
{
static int swap_task;
- int loop, counter;
+ int loop, counter, shfrv;
struct task_struct *p;

- counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+#if MM_DEBUG
+ shfrv = 10;
+#else
+ /*
+ * Trouble due to page ageing: in some situations it is possible that we cross only tasks
+ * which are swapped out or which only have physical pages with age >= 3.
+ * High swap_cnt values for memory-consuming tasks aggravate such situations.
+ *
+ * If PAGEOUT_WEIGHT has a value of 8192, a right shift value of 10 leads to
+ * (8 * nr_tasks) >> priority
+ * Together with a high number of tasks, say 100, we get counters (per priority) of
+ * 12(6) + 25(5) + 50(4) + 100(3) + 200(2) + 400(1) + 800(0)
+ * for a total of 1587 swap_out() scans needed to swap out a task page.
+ *
+ * Now assume 80 tasks are swapped out and the remaining tasks have swap_cnt values >= 40
+ * together with pages of age >= 3. Then we need approx 20*40*2 = 1600 scans to get a
+ * free page.
+ * And now assume that the number of cached pages, buffers, and ipc pages is really low.
+ */
+ switch (priority) {
+ case 6: case 5: case 4: /* be friendly */
+ shfrv = 10;
+ break;
+ case 3: case 2: case 1: /* more intensive */
+ shfrv = 9;
+ break;
+ case 0: default: /* sorry we need a page */
+ shfrv = 8;
+ break;
+ }
+#endif
+
+ counter = ((PAGEOUT_WEIGHT * nr_tasks) >> shfrv) >> priority;
for(; counter >= 0; counter--) {
/*
* Check that swap_task is suitable for swapping. If not, look for
@@ -305,14 +349,19 @@
* Determine the number of pages to swap from this process.
*/
if (!p->swap_cnt) {
- /* Normalise the number of pages swapped by
- multiplying by (RSS / 1MB) */
+ /*
+ * Normalise the number of pages swapped by
+ * multiplying by (RSS / 1MB)
+ */
p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
}
if (!--p->swap_cnt)
swap_task++;
switch (swap_out_process(p, dma, wait)) {
case 0:
+ if (p->state == TASK_STOPPED)
+ /* Stopped tasks occupy unused RAM */
+ break;
if (p->swap_cnt)
swap_task++;
break;
@@ -322,6 +371,14 @@
break;
}
}
+#if MM_DEBUG
+ if (!priority) {
+ printk("swap_out: physical ram %6dkB, min pages %6dkB\n",
+ (int)(high_memory>>10), min_free_pages<<(PAGE_SHIFT-10));
+ printk("swap_out: free pages %6dkB, async pages %6dkB\n",
+ nr_free_pages<<(PAGE_SHIFT-10), nr_async_pages<<(PAGE_SHIFT-10));
+ }
+#endif
return 0;
}

@@ -402,6 +458,9 @@
printk ("Started kswapd v%.*s\n", i, s);

while (1) {
+ /* low on memory, we need to start swapping soon */
+ next_swap_jiffies = jiffies +
+ (last_wakeup_low ? swapout_interval >> 1 : swapout_interval);
kswapd_awake = 0;
current->signal = 0;
run_task_queue(&tq_disk);
@@ -410,7 +469,8 @@
swapstats.wakeups++;
/* Do the background pageout: */
for (i=0; i < kswapd_ctl.maxpages; i++)
- try_to_free_page(GFP_KERNEL, 0, 0);
+ try_to_free_page(GFP_KERNEL, 0,
+ (nr_free_pages < min_free_pages));
}
}

@@ -421,16 +481,15 @@
void swap_tick(void)
{
int want_wakeup = 0;
- static int last_wakeup_low = 0;

if ((nr_free_pages + nr_async_pages) < free_pages_low) {
if (last_wakeup_low)
- want_wakeup = jiffies >= next_swap_jiffies;
+ want_wakeup = (jiffies >= next_swap_jiffies);
else
last_wakeup_low = want_wakeup = 1;
}
else if (((nr_free_pages + nr_async_pages) < free_pages_high) &&
- jiffies >= next_swap_jiffies) {
+ (jiffies >= next_swap_jiffies)) {
last_wakeup_low = 0;
want_wakeup = 1;
}
@@ -440,7 +499,6 @@
wake_up(&kswapd_wait);
need_resched = 1;
}
- next_swap_jiffies = jiffies + swapout_interval;
}
timer_active |= (1<<SWAP_TIMER);
}
diff -urN -x *~1 linux-2.0.31-2-davem/net/ipv4/icmp.c linux/net/ipv4/icmp.c
--- linux-2.0.31-2-davem/net/ipv4/icmp.c Fri Jul 18 11:08:53 1997
+++ linux/net/ipv4/icmp.c Fri Jul 25 17:56:10 1997
@@ -829,9 +829,10 @@

#if !defined(CONFIG_IP_DUMB_ROUTER)
if (sysctl_ip_forward) {
- NETDEBUG(printk(KERN_INFO "icmp: ICMP redirect ignored. dest = %lX, "
- "orig gw = %lX, \"new\" gw = %lX, device = %s.\n", ntohl(ip),
- ntohl(source), ntohl(icmph->un.gateway), dev->name));
+ NETDEBUG(printk(KERN_INFO "icmp: ICMP redirect ignored. dest = %lX, "
+ "orig gw = %lX, \"new\" gw = %lX, device = %s.\n", ntohl(ip),
+ ntohl(source), ntohl(icmph->un.gateway), dev->name));
+ goto flush_it;
}
#else
switch(icmph->code & 7)
diff -urN -x *~1 linux-2.0.31-2-davem/net/ipv4/ip_fragment.c linux/net/ipv4/ip_fragment.c
--- linux-2.0.31-2-davem/net/ipv4/ip_fragment.c Fri Jul 18 11:08:53 1997
+++ linux/net/ipv4/ip_fragment.c Wed Jul 30 21:08:06 1997
@@ -388,6 +388,8 @@
fp = fp->next;
}

+ skb->pkt_type = qp->fragments->skb->pkt_type;
+ skb->protocol = qp->fragments->skb->protocol;
/* We glued together all fragments, so remove the queue entry. */
ip_free(qp);

diff -urN -x *~1 linux-2.0.31-2-davem/net/ipx/af_ipx.c linux/net/ipx/af_ipx.c
--- linux-2.0.31-2-davem/net/ipx/af_ipx.c Fri Jul 18 11:08:54 1997
+++ linux/net/ipx/af_ipx.c Fri Jul 18 11:11:53 1997
@@ -1776,6 +1776,7 @@
}
sk->rcvbuf=SK_RMEM_MAX;
sk->sndbuf=SK_WMEM_MAX;
+ sk->allocation=GFP_KERNEL;
sk->prot=NULL; /* So we use default free mechanisms */
skb_queue_head_init(&sk->receive_queue);
skb_queue_head_init(&sk->write_queue);