[PATCH] graceful out of memory handling

From: Bernd Paysan (bernd.paysan@gmx.de)
Date: Sun Feb 13 2000 - 12:28:20 EST

Next message: Ralf Baechle: "Re: Gigabit Linux Server Bottlenecks"
Previous message: Larry McVoy: "Re: Scheduled Transfer Protocol on Linux"
Next in thread: Rik van Riel: "Re: [PATCH] graceful out of memory handling"
Reply: Rik van Riel: "Re: [PATCH] graceful out of memory handling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Hi!

I've been using Rik van Riel's OOM patch for quite some time, and kept it up
to date with the 2.3.x kernels. I added a /proc/sys/vm entry to configure
it (out-of-memory-kill: limit-to-kill limit-to-term, in percent of
total memory, i.e. RAM plus swap). I also moved whatever possible into
linux/mm/oom_kill.c to keep the impact of other changes as small as
possible (mostly to reduce my work when the kernel changes again). Other
parts of the kernel can make use of the functions to select a process to
kill (e.g. under hard OOM conditions). I didn't put that in; it's
sufficient to let kswapd do the work.

I must say I need it. My programs go wild too often. This patch selects
the worst offender and kills it. There are two limits: one to send a
SIGTERM, one to send a SIGKILL. So far, it hasn't caused any false kills
for me. I tried to live without it, and the rest of the OOM handling in the
kernel just doesn't do it. Without this patch, an OOM situation still too
often results in a machine that needs a hard reset.

If you don't like it, switch it off with

echo 0 0 >/proc/sys/vm/out-of-memory-kill

as root.

Bernd Paysan
"If you want it done right, you have to do it yourself"
http://www.jwdt.com/~paysan/

--- linux-2.3.43/include/linux/sysctl.h.old Sun Feb 13 17:06:32 2000
+++ linux/include/linux/sysctl.h Sun Feb 13 17:08:53 2000
@@ -126,7 +126,8 @@
         VM_PAGECACHE=7, /* struct: Set cache memory thresholds */
         VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */
         VM_PGT_CACHE=9, /* struct: Set page table cache parameters */
- VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */
+ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */
+ VM_OOM_KILL=11 /* struct: Control oom_kill behaviour */
};

--- linux-2.3.43/kernel/sysctl.c.old Sun Feb 13 16:57:30 2000
+++ linux/kernel/sysctl.c Sun Feb 13 17:28:59 2000
@@ -38,7 +38,7 @@
/* External variables not in a header file. */
extern int panic_timeout;
extern int console_loglevel, C_A_D;
-extern int bdf_prm[], bdflush_min[], bdflush_max[];
+extern int bdf_prm[], bdflush_min[], bdflush_max[], vm_kill[];
extern int sysctl_overcommit_memory;
extern int max_threads;
extern int nr_queued_signals, max_queued_signals;
@@ -262,6 +262,8 @@
          &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec},
         {VM_PAGE_CLUSTER, "page-cluster",
          &page_cluster, sizeof(int), 0600, NULL, &proc_dointvec},
+ {VM_OOM_KILL, "out-of-memory-kill",
+ &vm_kill, 2*sizeof(int), 0644, NULL, &proc_dointvec},
         {0}
};

--- linux-2.3.43/mm/vmscan.c.old Sun Feb 13 17:25:15 2000
+++ linux/mm/vmscan.c Sun Feb 13 14:21:31 2000
@@ -470,6 +470,7 @@
int kswapd(void *unused)
{
         struct task_struct *tsk = current;
+ int state = 0;

         tsk->session = 1;
         tsk->pgrp = 1;
@@ -500,12 +501,15 @@
                  * up on a more timely basis.
                  */
                 do {
+ extern int oom_try_kill(int);
                         /* kswapd is critical to provide GFP_ATOMIC
                            allocations (not GFP_HIGHMEM ones). */
                         if (nr_free_pages() - nr_free_highpages() >= freepages.high)
                                 break;
                         if (!do_try_to_free_pages(GFP_KSWAPD, 0))
                                 break;
+ state = oom_try_kill(state);
+
                         run_task_queue(&tq_disk);
                 } while (!tsk->need_resched);

--- linux-2.3.43/mm/oom_kill.c.old Sun Feb 13 14:18:15 2000
+++ linux/mm/oom_kill.c Sun Feb 13 17:32:22 2000
@@ -0,0 +1,195 @@
+/*
+ * linux/mm/oom_kill.c
+ *
+ * Copyright (C) 1998 Rik van Riel
+ * Copyright (C) 2000 Bernd Paysan
+ * Thanks go out to Claus Fischer for some serious inspiration and
+ * for goading me into coding this file...
+ *
+ * The routines in this file are used to kill a process when
+ * we're seriously out of memory. This gets called from kswapd()
+ * in linux/mm/vmscan.c when we really run out of memory.
+ *
+ * Since we won't call these routines often (on a well-configured
+ * machine) this file will double as a 'coding guide' and a signpost
+ * for newbie kernel hackers. It features several pointers to major
+ * kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/timex.h>
+
+#undef DEBUG
+/*
+ * Wow, black magic :) [read closely, the TCP code is hairier]
+ */
+inline int int_sqrt(unsigned int x)
+{
+ unsigned int out = x;
+ while (x & ~(unsigned int)1) x >>=2, out >>=1;
+ if (x) out -= out >> 2;
+ return (out ? out : 1);
+}
+
+/*
+ * Basically, points = size / (sqrt(CPU_used) * sqrt(sqrt(time_running)))
+ * with some bonuses/penalties.
+ *
+ * The definition of the task_struct, the structure describing the state
+ * of each process, can be found in include/linux/sched.h. For
+ * capability info, you should read include/linux/capability.h.
+ */
+
+inline int badness(struct task_struct *p)
+{
+ int points = p->mm ? p->mm->total_vm : 0;
+ points /= int_sqrt((p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3));
+ points /= int_sqrt(int_sqrt((jiffies - p->start_time) >> (SHIFT_HZ + 10)));
+/*
+ * DEF_PRIORITY is the length of the standard process priority;
+ * see include/linux/sched.h for more info.
+ */
+ if (p->priority < DEF_PRIORITY)
+ points <<= 1;
+/*
+ * p->(e)uid is the process User ID, ID 0 is root, the super user. Since
+ * the super user can do anything, and does almost nothing (on a proper
+ * system), we have to assume that the process is trusted/good.
+ * Besides, the super user usually runs important system services, which
+ * we don't want to kill...
+ */
+ if (p->uid == 0 || p->euid == 0 || p->cap_effective & CAP_TO_MASK(CAP_SYS_ADMIN))
+ points >>= 2;
+/*
+ * NEVER, EVER kill a process with direct hardware access. Since
+ * they function almost as a device driver, killing one of those
+ * might hang the system -- which is something we need to prevent
+ * at all cost...
+ */
+ if (p->cap_effective & CAP_TO_MASK(CAP_SYS_RAWIO)
+ )
+ points = 0;
+#ifdef DEBUG
+ printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
+ p->pid, p->comm, points);
+#endif
+ return points;
+}
+
+inline struct task_struct * select_bad_process(void)
+{
+ int points = 0, maxpoints = 0;
+ struct task_struct *p;
+ struct task_struct *chosen = NULL;
+/*
+ * These locks are used to prevent modification of critical
+ * structures while we're working with them. Remember that
+ * Linux is a multitasking (and sometimes SMP) system.
+ * -- Luckily these nice macros are made available so we don't
+ * have to do cumbersome locking ourselves :)
+ */
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ {
+ if (p->pid)
+ points = badness(p);
+ if (points > maxpoints) {
+ chosen = p;
+ maxpoints = points;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return chosen;
+}
+
+/*
+ * Send signal 'sig' to the task chosen by select_bad_process().
+ * The SCHED_FIFO magic (giving the killed context absolute priority
+ * so a looping kswapd cannot interfere with the kill) is NOT done
+ * here: this function only calls force_sig(). For background, read
+ * kernel/sched.c::goodness() and kernel/sched.c::schedule().
+ */
+void oom_kill(int sig)
+{
+
+ struct task_struct *p = select_bad_process();
+ if (p == NULL)
+ return;
+ printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
+ force_sig(sig, p);
+ return;
+}
+
+/*
+ * These are percentages. Memory is low if the amount of free memory
+ * is less than said percentage of total space. I made the limits
+ * 2 percent (SIGKILL) and 4 percent (SIGTERM), which should be enough.
+ * You can configure it with /proc/sys/vm/out-of-memory-kill. -- Bernd.
+ */
+
+int vm_kill[2] = {2, 4};
+
+/*
+ * Are we out of memory?
+ *
+ * We ignore swap cache pages and simplify the situation a bit.
+ * This won't do any damage, because we're only called when kswapd
+ * is already failing to free pages and when that is happening we
+ * can assume that the swap cache is very small. See the test in
+ * mm/vmscan.c::kswapd() for more info.
+ */
+
+inline int oom_free(struct sysinfo *val)
+{
+ long free_vm;
+
+ free_vm = (val->freeram + val->bufferram + val->freeswap);
+ free_vm += atomic_read(&page_cache_size);
+
+ return free_vm;
+}
+
+inline int oom_quit_limit(int total)
+{
+ return vm_kill[1] * total;
+}
+
+inline int oom_kill_limit(int total)
+{
+ return vm_kill[0] * total;
+}
+
+int oom_try_kill(int state)
+{
+ struct sysinfo val;
+ int free, total, quit_limit, kill_limit, limit;
+
+ si_meminfo(&val);
+ si_swapinfo(&val);
+
+ free = oom_free(&val);
+ total = val.totalram + val.totalswap;
+ quit_limit = oom_quit_limit(total);
+ kill_limit = oom_kill_limit(total);
+ limit = state ? kill_limit : quit_limit;
+
+#ifdef DEBUG
+ printk(KERN_INFO "OOM: check for kill: %d %d %d %s\n",
+ free, quit_limit, kill_limit, state ? "hard" : "soft");
+#endif
+
+ if((((long long) free) * 100) < ((long long) limit)) {
+ printk(KERN_ERR "OOM: have to kill someone: %d %d %d %s\n",
+ free, quit_limit, kill_limit, state ? "hard" : "soft");
+ oom_kill(state ? SIGKILL : SIGTERM);
+ return 1;
+ } else if((((long long) free) * 100) > ((long long) quit_limit)) {
+ return 0;
+ }
+
+ return state;
+}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/

Next message: Ralf Baechle: "Re: Gigabit Linux Server Bottlenecks"
Previous message: Larry McVoy: "Re: Scheduled Transfer Protocol on Linux"
Next in thread: Rik van Riel: "Re: [PATCH] graceful out of memory handling"
Reply: Rik van Riel: "Re: [PATCH] graceful out of memory handling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This archive was generated by hypermail 2b29 : Tue Feb 15 2000 - 21:00:25 EST