Re: [patch] my latest oom stuff

Andrea Arcangeli (andrea@e-mind.com)
Tue, 27 Oct 1998 01:08:17 +0100 (CET)


On Mon, 26 Oct 1998, Andrea Arcangeli wrote:

>Yes. I know that. Also Alpha could go out of tune. I am getting rid of the
>jiffies at all and instead I am making kswapd a real process that
>schedule() if need_resched is true. It will run all the time with a
>dynamic priority until there will be not enough memory and *_over_max()
>are true.

This new patch seems to work great here. (The patch to fork.c is
intentionally removed from this patch, since avoiding more than one
sigkill or sigbus is handled correctly by the down() and then by the
rechecking of the memory state.)

I can't comment on the patch in detail right now because I need to sleep a
bit before tomorrow ;-). Note that in get_free_pages() removing the
kswapd_wakeup() makes no difference, because by the time get_free_pages
has failed, kswapd is surely already running all the time in the background.
Note also that running try_to_free_pages() before going into the core of
__get_free_pages() causes the kernel to always leave some memory free
(200/300 kbyte here). If instead we run try_to_free_pages() only when we
have just failed the GFP core, we can use more memory, but we are closer to
a real OOM. I reproduced an OOM here (with a totally broken GFP) where
there were 0k of RAM and 0k of SWAP left, and there seemed to be no way to
recover from it, so I consider it safe to always waste some memory in order
to stay a bit away from that kind of total deadlock. One last thing: the
offending piece of get_free_pages was the check for
!(gfp_mask & (__GFP_HIGH | __GFP_MED)).

Here is the patch against pre-2.1.127-1 (the 2.1.126 version is here:
ftp://e-mind.com/pub/linux/kernel-patches/oom-17-...).

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.3 linux/mm/page_alloc.c:1.1.1.1.18.5
--- linux/mm/page_alloc.c:1.1.1.3 Sun Oct 25 01:28:52 1998
+++ linux/mm/page_alloc.c Tue Oct 27 00:51:28 1998
@@ -251,29 +251,14 @@
goto nopage;
}

- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
- }
+ if (freepages.min > nr_free_pages &&
+ !try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
+ goto nopage;
}
spin_lock_irqsave(&page_alloc_lock, flags);
RMQUEUE(order, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);

- /*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ and even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
- */
- kswapd_notify(gfp_mask);
nopage:
return 0;
}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.4 linux/mm/vmscan.c:1.1.1.2.4.20
--- linux/mm/vmscan.c:1.1.1.4 Sun Oct 25 01:28:52 1998
+++ linux/mm/vmscan.c Tue Oct 27 00:51:28 1998
@@ -442,39 +442,43 @@
static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
- int i=6;
- int stop;
+ int from_prio, to_prio;

/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);

/* We try harder if we are waiting .. */
- stop = 3;
if (gfp_mask & __GFP_WAIT)
- stop = 0;
+ {
+ from_prio = 3;
+ to_prio = 0;
+ } else {
+ from_prio = 6;
+ to_prio = 3;
+ }

if (buffer_over_borrow() || pgcache_over_borrow())
- shrink_mmap(i, gfp_mask);
+ state = 0;

switch (state) {
do {
case 0:
- if (shrink_mmap(i, gfp_mask))
+ if (shrink_mmap(from_prio, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, gfp_mask))
+ if (shm_swap(from_prio, gfp_mask))
return 1;
state = 2;
case 2:
- if (swap_out(i, gfp_mask))
+ if (swap_out(from_prio, gfp_mask))
return 1;
state = 3;
case 3:
- shrink_dcache_memory(i, gfp_mask);
+ shrink_dcache_memory(from_prio, gfp_mask);
state = 0;
- i--;
- } while ((i - stop) >= 0);
+ from_prio--;
+ } while (from_prio >= to_prio);
}
return 0;
}
@@ -498,6 +502,30 @@
printk ("Starting kswapd v%.*s\n", i, s);
}

+#define kswapd_renice(freemem) \
+ (kswapd_task->priority = kswapd_priority(freemem))
+
+#define kswapd_done(freemem) \
+ (freemem == 2 && buffer_under_max() && pgcache_under_max())
+
+#define kswapd_schedule() \
+ if (kswapd_task->need_resched) \
+ schedule();
+
+static void kswapd_engine(void)
+{
+ for (;;)
+ {
+ int free_memory;
+ do_try_to_free_page(0);
+ free_memory = free_memory_available();
+ if (kswapd_done(free_memory))
+ break;
+ kswapd_renice(free_memory);
+ kswapd_schedule();
+ }
+}
+
/*
* The background pageout daemon.
* Started as a kernel thread from the init process.
@@ -517,13 +545,6 @@
lock_kernel();

/*
- * Set the base priority to something smaller than a
- * regular process. We will scale up the priority
- * dynamically depending on how much memory we need.
- */
- current->priority = (DEF_PRIORITY * 2) / 3;
-
- /*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "try_to_free_pages()"). "kswapd" should
@@ -540,44 +561,16 @@
init_swap_timer();
kswapd_task = current;
while (1) {
- int tries;
-
- current->state = TASK_INTERRUPTIBLE;
+/* run_task_queue(&tq_disk); */
flush_signals(current);
- run_task_queue(&tq_disk);
- schedule();
- swapstats.wakeups++;
-
/*
- * Do the background pageout: be
- * more aggressive if we're really
- * low on free memory.
- *
- * We try page_daemon.tries_base times, divided by
- * an 'urgency factor'. In practice this will mean
- * a value of pager_daemon.tries_base / 8 or 4 = 64
- * or 128 pages at a time.
- * This gives us 64 (or 128) * 4k * 4 (times/sec) =
- * 1 (or 2) MB/s swapping bandwidth in low-priority
- * background paging. This number rises to 8 MB/s
- * when the priority is highest (but then we'll be
- * woken up more often and the rate will be even
- * higher).
+ * Remeber to enable up the swap tick before go to sleep.
*/
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
-
- do {
- do_try_to_free_page(0);
- /*
- * Syncing large chunks is faster than swapping
- * synchronously (less head movement). -- Rik.
- */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
- run_task_queue(&tq_disk);
- if (free_memory_available() > 1)
- break;
- } while (--tries > 0);
+ timer_active |= 1<<SWAP_TIMER;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ swapstats.wakeups++;
+ kswapd_engine();
}
/* As if we could ever get here - maybe we want to make this killable */
kswapd_task = NULL;
@@ -592,81 +585,53 @@
*
* The "PF_MEMALLOC" flag protects us against recursion:
* if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * will just silently return "fail" to tell the page
+ * allocator that we are OOM.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval = 1;
+ int retval = 0;

lock_kernel();
if (!(current->flags & PF_MEMALLOC)) {
+ static struct semaphore oom_sem = MUTEX;
current->flags |= PF_MEMALLOC;
- do {
+ down(&oom_sem);
+ /*
+ * We could have slept for a lot of time in down()
+ * so we check that we still need memory before
+ * risk to return OOM. -arca
+ */
+ if (freepages.min <= nr_free_pages)
+ retval = 1;
+ else while (count--)
+ {
retval = do_try_to_free_page(gfp_mask);
if (!retval)
break;
- count--;
- } while (count > 0);
+ }
+ up(&oom_sem);
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();
return retval;
}

-/*
- * Wake up kswapd according to the priority
- * 0 - no wakeup
- * 1 - wake up as a low-priority process
- * 2 - wake up as a normal process
- * 3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(int priority)
-{
- if (priority) {
- struct task_struct *p = kswapd_task;
- if (p) {
- p->counter = p->priority << priority;
- wake_up_process(p);
- }
- }
-}
-
/*
* The swap_tick function gets called on every clock tick.
*/
void swap_tick(void)
{
- unsigned int pages;
- int want_wakeup;
-
+ int free_memory = free_memory_available();
/*
* Schedule for wakeup if there isn't lots
* of free memory or if there is too much
* of it used for buffers or pgcache.
- *
- * "want_wakeup" is our priority: 0 means
- * not to wake anything up, while 3 means
- * that we'd better give kswapd a realtime
- * priority.
*/
- want_wakeup = 0;
- if (buffer_over_max() || pgcache_over_max())
- want_wakeup = 1;
- pages = nr_free_pages;
- if (pages < freepages.high)
- want_wakeup = 1;
- if (pages < freepages.low)
- want_wakeup = 2;
- if (pages < freepages.min)
- want_wakeup = 3;
-
- kswapd_wakeup(want_wakeup);
-
- timer_active |= (1<<SWAP_TIMER);
+ if (free_memory != 2 || buffer_over_max() || pgcache_over_max())
+ kswapd_wakeup(free_memory);
+ else
+ timer_active |= (1<<SWAP_TIMER);
}

/*
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.16.4
--- linux/include/linux/mm.h:1.1.1.3 Sun Oct 25 01:28:37 1998
+++ linux/include/linux/mm.h Tue Oct 27 00:51:35 1998
@@ -330,14 +330,30 @@
extern int free_memory_available(void);
extern struct task_struct * kswapd_task;

-extern inline void kswapd_notify(unsigned int gfp_mask)
+static inline long kswapd_priority(int free_memory)
{
- if (kswapd_task) {
- wake_up_process(kswapd_task);
- if (gfp_mask & __GFP_WAIT) {
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ long priority;
+ switch (free_memory)
+ {
+ case 0:
+ priority = DEF_PRIORITY << 1;
+ break;
+ case 2:
+ priority = DEF_PRIORITY >> 1;
+ break;
+ default:
+ priority = DEF_PRIORITY;
+ }
+ return priority;
+}
+
+static inline void kswapd_wakeup(int free_memory)
+{
+ struct task_struct *p = kswapd_task;
+ if (p)
+ {
+ p->priority = kswapd_priority(free_memory);
+ wake_up_process(p);
}
}

One thing I forgot to mention: to get a really clean compile you must
remove the `inline' keyword before wake_up_process() in kernel/sched.c. To
include that diff in the patch I would have to revert the
schedule_timeout() stuff first, and I am a bit lazy ;-).

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/