[PATCH] oom: don't ignore temporary rss during execve

From: KOSAKI Motohiro
Date: Thu Aug 19 2010 - 07:40:20 EST


execve() creates a new mm struct and sets up the stack and the argv
vector in it. Unfortunately, no task points to this new mm yet, so the
OOM killer cannot see its memory usage and may therefore kill the
wrong task.

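This patch records the new mm in task_struct->in_exec_mm while execve() is in progress and makes the OOM killer add its rss and swap usage to the task's badness score.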

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
---
fs/compat.c | 8 ++++++--
fs/exec.c | 19 +++++++++++++++++--
include/linux/binfmts.h | 1 +
include/linux/sched.h | 1 +
mm/oom_kill.c | 36 +++++++++++++++++++++++++++---------
5 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 718c706..643140c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1527,6 +1527,7 @@ int compat_do_execve(char * filename,
retval = bprm_mm_init(bprm);
if (retval)
goto out_file;
+ set_exec_mm(bprm->mm);

bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
@@ -1560,6 +1561,7 @@ int compat_do_execve(char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ set_exec_mm(NULL);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1567,8 +1569,10 @@ int compat_do_execve(char * filename,
return retval;

out:
- if (bprm->mm)
- mmput(bprm->mm);
+ if (current->in_exec_mm) {
+ struct mm_struct *in_exec_mm = set_exec_mm(NULL);
+ mmput (in_exec_mm);
+ }

out_file:
if (bprm->file) {
diff --git a/fs/exec.c b/fs/exec.c
index 2d94552..85192e1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1314,6 +1314,17 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)

EXPORT_SYMBOL(search_binary_handler);

+/* Set current->in_exec_mm to @mm (NULL clears it); return the old value. */
+struct mm_struct *set_exec_mm(struct mm_struct *mm)
+{
+	struct mm_struct *prev = current->in_exec_mm;
+
+	task_lock(current);	/* OOM scoring reads this under task_lock */
+	current->in_exec_mm = mm;
+	task_unlock(current);
+	return prev;
+}
+
/*
* sys_execve() executes a new program.
*/
@@ -1361,6 +1372,7 @@ int do_execve(const char * filename,
retval = bprm_mm_init(bprm);
if (retval)
goto out_file;
+ set_exec_mm(bprm->mm);

bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
@@ -1395,6 +1407,7 @@ int do_execve(const char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ set_exec_mm(NULL);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1402,8 +1415,10 @@ int do_execve(const char * filename,
return retval;

out:
- if (bprm->mm)
- mmput (bprm->mm);
+ if (current->in_exec_mm) {
+ struct mm_struct *in_exec_mm = set_exec_mm(NULL);
+ mmput (in_exec_mm);
+ }

out_file:
if (bprm->file) {
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index a065612..8cf61eb 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -133,6 +133,7 @@ extern void install_exec_creds(struct linux_binprm *bprm);
extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
extern void set_binfmt(struct linux_binfmt *new);
extern void free_bprm(struct linux_binprm *);
+extern struct mm_struct* set_exec_mm(struct mm_struct *mm);

#endif /* __KERNEL__ */
#endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..d413757 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1217,6 +1217,7 @@ struct task_struct {
struct plist_node pushable_tasks;

struct mm_struct *mm, *active_mm;
+ struct mm_struct *in_exec_mm;
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 57c05f7..7fc6916 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -120,6 +120,30 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
return NULL;
}

+/* Sum rss + swap of @p's threads, plus any mm being built by execve(). */
+static unsigned long calculate_rss_swap(struct task_struct *p)
+{
+	struct task_struct *t = p;
+	int mm_accounted = 0;	/* threads share one mm: count it only once */
+	unsigned long points = 0;
+
+	do {
+		task_lock(t);	/* pins ->mm and ->in_exec_mm while we read */
+		if (!mm_accounted && t->mm) {
+			points += get_mm_rss(t->mm);
+			points += get_mm_counter(t->mm, MM_SWAPENTS);
+			mm_accounted = 1;
+		}
+		/* Cleared only after exec succeeds, so it may alias ->mm. */
+		if (t->in_exec_mm && t->in_exec_mm != t->mm) {
+			points += get_mm_rss(t->in_exec_mm);
+			points += get_mm_counter(t->in_exec_mm, MM_SWAPENTS);
+		}
+		task_unlock(t);
+	} while_each_thread(p, t);
+	return points;
+}
+
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
const nodemask_t *nodemask)
@@ -157,16 +181,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
if (oom_unkillable_task(p, mem, nodemask))
return 0;

- p = find_lock_task_mm(p);
- if (!p)
- return 0;
-
/*
* Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
* need to be executed for something that cannot be killed.
*/
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- task_unlock(p);
return 0;
}

@@ -175,7 +194,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
* priority for oom killing.
*/
if (p->flags & PF_OOM_ORIGIN) {
- task_unlock(p);
return 1000;
}

@@ -190,9 +208,9 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
* The baseline for the badness score is the proportion of RAM that each
* task's rss and swap space use.
*/
- points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
- totalpages;
- task_unlock(p);
+ points = calculate_rss_swap(p) * 1000 / totalpages;
+ if (!points)
+ return 0;

/*
* Root processes get 3% bonus, just like the __vm_enough_memory()
--
1.6.5.2





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/