[PATCH] memcg: fix livelock at oom.

From: KAMEZAWA Hiroyuki
Date: Wed Jul 13 2011 - 22:36:50 EST


867578cb "memcg: fix oom kill behavior" introduced the oom_lock counter,
which is incremented by mem_cgroup_oom_lock() when we are about to handle
a memcg OOM situation. mem_cgroup_handle_oom() falls back to a sleep if
oom_lock > 1, to prevent multiple OOM kills from running at the same time.
The counter is then decremented by mem_cgroup_oom_unlock(), called from
the same function.

This works correctly, but it can lead to serious starvation when many
processes trigger OOM at the same time.

Example:

Make a memcg hierarchy with a 300MB memory+swap limit:

%cgcreate -g memory:A
%cgset -r memory.use_hierarchy=1 A
%cgset -r memory.limit_in_bytes=300M A
%cgset -r memory.memsw.limit_in_bytes=300M A
%cgcreate -g memory:A/B
%cgcreate -g memory:A/C
%cgcreate -g memory:A/B/X
%cgcreate -g memory:A/B/Y

Then, run the following program under A/B/X:
%cgexec -g memory:A/B/X ./fork
==
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

int main(int argc, char *argv[])
{
        int i;
        int status;

        for (i = 0; i < 5000; i++) {
                if (fork() == 0) {
                        /* child: touch 1MB so the charge is real, then linger */
                        char *c;

                        c = malloc(1024 * 1024);
                        memset(c, 0, 1024 * 1024);
                        sleep(20);
                        fprintf(stderr, "[%d]\n", i);
                        exit(0);
                }
                printf("%d\n", i);
                waitpid(-1, &status, WNOHANG);  /* opportunistic reap */
        }
        /* reap the remaining children */
        while (1) {
                int ret;

                ret = waitpid(-1, &status, WNOHANG);
                if (ret == -1)
                        break;
                if (!ret)
                        sleep(1);
        }
        return 0;
}
==

This forks children, each of which malloc()s and touches 1MB. After
around 300 children have been forked, the memcg goes into OOM. The
expected behavior is that the oom-killer kills some process and progress
is made. Instead, the 300 children keep contending for oom_lock and the
OOM kill, and the program makes no visible progress.

The reason is that memcg invokes the OOM killer only when the oom_lock
counter is 0. When many processes are running, the counter never drops
back to 0: it is incremented by new contenders before all the processes
woken from the previous OOM handling have decremented it (see the model
below).
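
For illustration, here is a minimal user-space model of the pre-patch
accounting (hypothetical code, not the kernel's: the atomics and the
sleep/wakeup are collapsed into one deterministic loop in which a new
contender always increments before the previous sleeper decrements):
==
#include <stdio.h>

int main(void)
{
        int oom_lock = 0;       /* the pre-patch per-memcg counter */
        int kills = 0;
        int i;

        for (i = 0; i < 300; i++) {
                /* a new task hits the limit: atomic_inc_return() */
                if (++oom_lock == 1)
                        kills++;        /* lock_count == 1: run the killer */
                /*
                 * Otherwise the task sleeps and decrements later. Model
                 * one earlier sleeper waking up and decrementing -- but
                 * only after the newcomer incremented, so the counter
                 * never falls back to 0.
                 */
                if (oom_lock > 1)
                        oom_lock--;
        }
        printf("%d contenders, %d OOM kill(s), oom_lock stuck at %d\n",
               i, kills, oom_lock);
        return 0;
}
==
This prints "300 contenders, 1 OOM kill(s), oom_lock stuck at 1": only
the very first contender ever observes the counter at 1.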

This patch fixes the behavior: the OOM hierarchy now has a single lock
owner, and oom_lock becomes a plain 0/1 flag. For example, in the
following hierarchy,

    A
   /
  B
 / \
C   D

when C goes into OOM because of B's limit, B->oom_lock is set to 1.
After that, when A goes into OOM because of A's own limit,
B->oom_lock is cleared to 0 and A->oom_lock is set to 1.

At unlock, the walk up finds the ancestor whose ->oom_lock is 1 and
clears it.
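
As a sketch of that ownership walk, here is a simplified user-space
model (hypothetical: struct cg and clear_subtree() stand in for
struct mem_cgroup and for_each_mem_cgroup_tree(), and use_hierarchy
and the under_oom bookkeeping are omitted):
==
#include <stdbool.h>
#include <stdio.h>

/* stand-in for struct mem_cgroup; only the locking fields are modeled */
struct cg {
        struct cg *parent;
        struct cg *child;       /* first child */
        struct cg *sibling;     /* next sibling */
        int oom_lock;           /* 0/1 ownership flag, not a counter */
};

/* clear oom_lock in the whole subtree; remember if any node owned it */
static void clear_subtree(struct cg *cg, bool *was_locked)
{
        struct cg *c;

        if (cg->oom_lock) {
                cg->oom_lock = 0;
                *was_locked = true;
        }
        for (c = cg->child; c; c = c->sibling)
                clear_subtree(c, was_locked);
}

/* returns true if the caller should invoke the OOM killer */
static bool oom_lock(struct cg *cg)
{
        struct cg *iter;
        bool was_locked = false;

        /* an ancestor (or this memcg) already owns the lock: just wait */
        for (iter = cg; iter; iter = iter->parent)
                if (iter->oom_lock)
                        return false;

        /* pull any child's ownership up to this memcg */
        clear_subtree(cg, &was_locked);
        cg->oom_lock = 1;       /* single owner for the whole subtree */
        return !was_locked;     /* false if a killer was already running */
}

static void oom_unlock(struct cg *cg)
{
        struct cg *iter;

        /* find and clear whichever ancestor (or self) holds the lock */
        for (iter = cg; iter; iter = iter->parent)
                if (iter->oom_lock) {
                        iter->oom_lock = 0;
                        return;
                }
}

int main(void)
{
        /* the A/B/C/D hierarchy from the description above */
        struct cg A = {0}, B = {0}, C = {0}, D = {0};

        B.parent = &A; A.child = &B;
        C.parent = &B; B.child = &C;
        D.parent = &B; C.sibling = &D;

        printf("lock B (C hit B's limit): kill=%d\n", oom_lock(&B));
        printf("lock A (A's own limit):   kill=%d\n", oom_lock(&A));
        oom_unlock(&B); /* B's handler finishes: the walk clears A */
        printf("A.oom_lock=%d B.oom_lock=%d\n", A.oom_lock, B.oom_lock);
        return 0;
}
==
Running this prints kill=1 for the first locker and kill=0 for the
second (B's ownership moves up to A), and the unlock issued from B
clears A's flag via the parent walk.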

After this, the above program forks all 5000 processes, with 4000+
oom-killer invocations.

Reported-by: Michal Hocko <mhocko@xxxxxxx>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Changelog:
- fixed under_oom counter reset.
---
mm/memcontrol.c | 77 +++++++++++++++++++++++++++++++++++++-----------------
1 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e..5f9661b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -246,7 +246,8 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- atomic_t oom_lock;
+ int oom_lock;
+ atomic_t under_oom;
atomic_t refcnt;

unsigned int swappiness;
@@ -1801,36 +1802,63 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
}

/*
- * Check OOM-Killer is already running under our hierarchy.
+ * Check whether OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
-static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
- int x, lock_count = 0;
struct mem_cgroup *iter;
+ bool ret;

- for_each_mem_cgroup_tree(iter, mem) {
- x = atomic_inc_return(&iter->oom_lock);
- lock_count = max(x, lock_count);
+ /*
+ * If an ancestor (including this memcg) is the owner of the OOM
+ * lock, return false.
+ */
+ for (iter = memcg; iter != NULL; iter = parent_mem_cgroup(iter)) {
+ if (iter->oom_lock)
+ break;
+ if (!iter->use_hierarchy) {
+ iter = NULL;
+ break;
+ }
}

- if (lock_count == 1)
- return true;
- return false;
+ if (iter)
+ return false;
+ /*
+ * Make the highest ancestor of the OOM hierarchy the owner of the
+ * lock. IOW, move a child's OOM ownership up to this memcg if a
+ * child owns it. In that case an OOM killer is already running and
+ * we return false, but this memcg still becomes the owner.
+ */
+ ret = true;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter->oom_lock) {
+ iter->oom_lock = 0;
+ ret = false;
+ }
+ atomic_set(&iter->under_oom, 1);
+ }
+ /* Make this memcg the owner of the OOM lock. */
+ memcg->oom_lock = 1;
+ return ret;
}

-static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
- struct mem_cgroup *iter;
+ struct mem_cgroup *iter, *iter2;

- /*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. We have to use
- * atomic_add_unless() here.
- */
- for_each_mem_cgroup_tree(iter, mem)
- atomic_add_unless(&iter->oom_lock, -1, 0);
- return 0;
+ for (iter = memcg; iter != NULL; iter = parent_mem_cgroup(iter)) {
+ if (iter->oom_lock) {
+ iter->oom_lock = 0;
+ break;
+ }
+ }
+ BUG_ON(!iter);
+
+ for_each_mem_cgroup_tree(iter2, iter)
+ atomic_set(&iter2->under_oom, 0);
+ return;
}


@@ -1875,7 +1903,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)

static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (mem && atomic_read(&mem->oom_lock))
+ if (mem && atomic_read(&mem->under_oom))
memcg_wakeup_oom(mem);
}

@@ -1916,7 +1944,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
finish_wait(&memcg_oom_waitq, &owait.wait);
}
mutex_lock(&memcg_oom_mutex);
- mem_cgroup_oom_unlock(mem);
+ if (locked)
+ mem_cgroup_oom_unlock(mem);
memcg_wakeup_oom(mem);
mutex_unlock(&memcg_oom_mutex);

@@ -4584,7 +4613,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
list_add(&event->list, &memcg->oom_notify);

/* already in OOM ? */
- if (atomic_read(&memcg->oom_lock))
+ if (atomic_read(&memcg->under_oom))
eventfd_signal(eventfd, 1);
mutex_unlock(&memcg_oom_mutex);

@@ -4619,7 +4648,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,

cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);

- if (atomic_read(&mem->oom_lock))
+ if (atomic_read(&mem->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
--
1.7.4.1


