Hi Linus,
the following patch contains 2 fixes and one addition
to the VM layer:
1. Roger Larson's fix to make sure there is no
"1 page gap" between the point where __alloc_pages()
goes to sleep and kswapd() wakes up <== livelock fix
2. fix the calculation of freepages.{min,low,high} to better
reflect the reality of having per-zone tunable free
memory target <== balancing fix
3. add the out of memory killer, which has been tuned with
-test9 to be ran at exactly the right moment; process
selection: "principle of least surprise" <== OOM handling
regards,
Rik
-- "What you're running that piece of shit Gnome?!?!" -- Miguel de Icaza, UKUUG 2000http://www.conectiva.com/ http://www.surriel.com/
--- linux-2.4.0-test9/fs/buffer.c.orig Tue Oct 3 10:19:10 2000 +++ linux-2.4.0-test9/fs/buffer.c Tue Oct 3 12:25:23 2000 @@ -706,7 +706,7 @@ static void refill_freelist(int size) { if (!grow_buffers(size)) { - wakeup_bdflush(1); + wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */ current->policy |= SCHED_YIELD; schedule(); } --- linux-2.4.0-test9/mm/highmem.c.orig Tue Oct 3 10:20:41 2000 +++ linux-2.4.0-test9/mm/highmem.c Tue Oct 3 12:25:44 2000 @@ -310,7 +310,7 @@ repeat_bh: bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); if (!bh) { - wakeup_bdflush(1); + wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */ current->policy |= SCHED_YIELD; schedule(); goto repeat_bh; @@ -324,7 +324,7 @@ repeat_page: page = alloc_page(GFP_BUFFER); if (!page) { - wakeup_bdflush(1); + wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */ current->policy |= SCHED_YIELD; schedule(); goto repeat_page; --- linux-2.4.0-test9/mm/page_alloc.c.orig Tue Oct 3 10:20:41 2000 +++ linux-2.4.0-test9/mm/page_alloc.c Fri Oct 6 15:45:36 2000 @@ -268,7 +268,8 @@ water_mark = z->pages_high; } - if (z->free_pages + z->inactive_clean_pages > water_mark) { + /* Use >= to have one page overlap with free_shortage() !! */ + if (z->free_pages + z->inactive_clean_pages >= water_mark) { struct page *page = NULL; /* If possible, reclaim a page directly. */ if (direct_reclaim && z->free_pages < z->pages_min + 8) @@ -795,21 +796,6 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); - /* - * Select nr of pages we try to keep free for important stuff - * with a minimum of 10 pages and a maximum of 256 pages, so - * that we don't waste too much memory on large systems. - * This is fairly arbitrary, but based on some behaviour - * analysis. - */ - i = realtotalpages >> 7; - if (i < 10) - i = 10; - if (i > 256) - i = 256; - freepages.min += i; - freepages.low += i * 2; - freepages.high += i * 3; memlist_init(&active_list); memlist_init(&inactive_dirty_list); @@ -875,6 +861,20 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... + */ + freepages.min += mask; + freepages.low += mask*2; + freepages.high += mask*3; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; --- linux-2.4.0-test9/mm/vmscan.c.orig Tue Oct 3 10:20:41 2000 +++ linux-2.4.0-test9/mm/vmscan.c Fri Oct 6 15:46:14 2000 @@ -837,8 +837,9 @@ for(i = 0; i < MAX_NR_ZONES; i++) { zone_t *zone = pgdat->node_zones+ i; if (zone->size && (zone->inactive_clean_pages + - zone->free_pages < zone->pages_min)) { - sum += zone->pages_min; + zone->free_pages < zone->pages_min+1)) { + /* + 1 to have overlap with alloc_pages() !! */ + sum += zone->pages_min + 1; sum -= zone->free_pages; sum -= zone->inactive_clean_pages; } @@ -1095,12 +1096,20 @@ * We go to sleep for one second, but if it's needed * we'll be woken up earlier... */ - if (!free_shortage() || !inactive_shortage()) + if (!free_shortage() || !inactive_shortage()) { interruptible_sleep_on_timeout(&kswapd_wait, HZ); /* - * TODO: insert out of memory check & oom killer - * invocation in an else branch here. + * If we couldn't free enough memory, we see if it was + * due to the system just not having enough memory. + * If that is the case, the only solution is to kill + * a process (the alternative is enternal deadlock). + * + * If there still is enough memory around, we just loop + * and try free some more memory... */ + } else if (out_of_memory()) { + oom_kill(); + } } } --- linux-2.4.0-test9/mm/Makefile.orig Wed Oct 4 21:11:05 2000 +++ linux-2.4.0-test9/mm/Makefile Wed Oct 4 21:11:13 2000 @@ -10,7 +10,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ - page_alloc.o swap_state.o swapfile.o numa.o + page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o ifeq ($(CONFIG_HIGHMEM),y) O_OBJS += highmem.o --- linux-2.4.0-test9/mm/oom_kill.c.orig Wed Oct 4 21:12:51 2000 +++ linux-2.4.0-test9/mm/oom_kill.c Fri Oct 6 15:35:29 2000 @@ -0,0 +1,210 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from kswapd() + * in linux/mm/vmscan.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/timex.h> + +/* #define DEBUG */ + +/** + * int_sqrt - oom_kill.c internal function, rough approximation to sqrt + * @x: integer of which to calculate the sqrt + * + * A very rough approximation to the sqrt() function. + */ +static unsigned int int_sqrt(unsigned int x) +{ + unsigned int out = x; + while (x & ~(unsigned int)1) x >>=2, out >>=1; + if (x) out -= out >> 2; + return (out ? out : 1); +} + +/** + * oom_badness - calculate a numeric value for how bad this task has been + * @p: task struct of which task we should calculate + * + * The formula used is relatively simple and documented inline in the + * function. The main rationale is that we want to select a good task + * to kill when we run out of memory. + * + * Good in this context means that: + * 1) we lose the minimum amount of work done + * 2) we recover a large amount of memory + * 3) we don't kill anything innocent of eating tons of memory + * 4) we want to kill the minimum amount of processes (one) + * 5) we try to kill the process the user expects us to kill, this + * algorithm has been meticulously tuned to meet the priniciple + * of least surprise ... (be careful when you change it) + */ + +static int badness(struct task_struct *p) +{ + int points, cpu_time, run_time; + + if (!p->mm) + return 0; + /* + * The memory size of the process is the basis for the badness. + */ + points = p->mm->total_vm; + + /* + * CPU time is in seconds and run time is in minutes. There is no + * particular reason for this other than that it turned out to work + * very well in practice. This is not safe against jiffie wraps + * but we don't care _that_ much... + */ + cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3); + run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10); + + points /= int_sqrt(cpu_time); + points /= int_sqrt(int_sqrt(run_time)); + + /* + * Niced processes are most likely less important, so double + * their badness points. + */ + if (p->nice > 0) + points *= 2; + + /* + * Superuser processes are usually more important, so we make it + * less likely that we kill those. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || + p->uid == 0 || p->euid == 0) + points /= 4; + + /* + * We don't want to kill a process with direct hardware access. + * Not only could that mess up the hardware, but usually users + * tend to only have this flag set on applications they think + * of as important. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + points /= 4; +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + p->pid, p->comm, points); +#endif + return points; +} + +/* + * Simple selection loop. We chose the process with the highest + * number of 'points'. We need the locks to make sure that the + * list of task structs doesn't change while we look the other way. + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct * select_bad_process(void) +{ + int points = 0, maxpoints = 0; + struct task_struct *p = NULL; + struct task_struct *chosen = NULL; + + read_lock(&tasklist_lock); + for_each_task(p) + { + if (p->pid) + points = badness(p); + if (points > maxpoints) { + chosen = p; + maxpoints = points; + } + } + read_unlock(&tasklist_lock); + return chosen; +} + +/** + * oom_kill - kill the "best" process when we run out of memory + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + * + * We must be careful though to never send SIGKILL a process with + * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that + * we select a process with CAP_SYS_RAW_IO set). + */ +void oom_kill(void) +{ + + struct task_struct *p = select_bad_process(); + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (p == NULL) + panic("Out of memory and no killable processes...\n"); + + printk(KERN_ERR "Out of Memory: Killed process %d (%s).", p->pid, p->comm); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + p->counter = 5 * HZ; + p->flags |= PF_MEMALLOC; + + /* This process has hardware access, be more careful. */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) { + force_sig(SIGTERM, p); + } else { + force_sig(SIGKILL, p); + } + + /* + * Make kswapd go out of the way, so "p" has a good chance of + * killing itself before someone else gets the chance to ask + * for more memory. + */ + current->policy |= SCHED_YIELD; + schedule(); + return; +} + +/** + * out_of_memory - is the system out of memory? + * + * Returns 0 if there is still enough memory left, + * 1 when we are out of memory (otherwise). + */ +int out_of_memory(void) +{ + struct sysinfo swp_info; + + /* Enough free memory? Not OOM. */ + if (nr_free_pages() > freepages.min) + return 0; + + if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low) + return 0; + + /* Enough swap space left? Not OOM. */ + si_swapinfo(&swp_info); + if (swp_info.freeswap > 0) + return 0; + + /* Else... */ + return 1; +} --- linux-2.4.0-test9/include/linux/swap.h.orig Fri Oct 6 12:33:05 2000 +++ linux-2.4.0-test9/include/linux/swap.h Fri Oct 6 12:33:48 2000 @@ -126,6 +126,10 @@ extern struct page * read_swap_cache_async(swp_entry_t, int); #define read_swap_cache(entry) read_swap_cache_async(entry, 1); +/* linux/mm/oom_kill.c */ +extern int out_of_memory(void); +extern void oom_kill(void); + /* * Make these inline later once they are working properly. */
- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org Please read the FAQ at http://www.tux.org/lkml/
This archive was generated by hypermail 2b29 : Sat Oct 07 2000 - 21:00:19 EST