Re: [PATCH v13 3/7] mm, oom: cgroup-aware OOM killer

From: Roman Gushchin
Date: Thu Dec 07 2017 - 08:40:38 EST


On Wed, Dec 06, 2017 at 05:24:13PM -0800, Andrew Morton wrote:
>
> As a result of the "stalled MM patches" discussion I've dropped these
> three patches:
>
> mm,oom: move last second allocation to inside the OOM killer
> mm,oom: use ALLOC_OOM for OOM victim's last second allocation
> mm,oom: remove oom_lock serialization from the OOM reaper
>
> and I had to rework this patch as a result. Please carefully check (and
> preferable test) my handiwork in out_of_memory()?

Hi, Andrew!

Reviewed and tested, looks good to me. Thank you!

A couple of small nits below.

>
>
>
> From: Roman Gushchin <guro@xxxxxx>
> Subject: mm, oom: cgroup-aware OOM killer
>
> Traditionally, the OOM killer is operating on a process level. Under oom
> conditions, it finds a process with the highest oom score and kills it.
>
> This behavior doesn't suit well the system with many running containers:
>
> 1) There is no fairness between containers. A small container with few
> large processes will be chosen over a large one with huge number of
> small processes.
>
> 2) Containers often do not expect that some random process inside will
> be killed. In many cases much safer behavior is to kill all tasks in
> the container. Traditionally, this was implemented in userspace, but
> doing it in the kernel has some advantages, especially in a case of a
> system-wide OOM.
>
> To address these issues, the cgroup-aware OOM killer is introduced.
>
> This patch introduces the core functionality: an ability to select a
> memory cgroup as an OOM victim. Under OOM conditions the OOM killer looks
> for the biggest leaf memory cgroup and kills the biggest task belonging to
> it.
>
> The following patches will extend this functionality to consider non-leaf
> memory cgroups as OOM victims, and also provide an ability to kill all
> tasks belonging to the victim cgroup.
>
> The root cgroup is treated as a leaf memory cgroup, so it's score is
> compared with other leaf memory cgroups. Due to memcg statistics
> implementation a special approximation is used for estimating oom_score of
> root memory cgroup: we sum oom_score of the belonging processes (or, to be
> more precise, tasks owning their mm structures).
>
> Link: http://lkml.kernel.org/r/20171130152824.1591-4-guro@xxxxxx
> Signed-off-by: Roman Gushchin <guro@xxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
> Cc: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> ---
>
> include/linux/memcontrol.h | 17 +++
> include/linux/oom.h | 12 ++
> mm/memcontrol.c | 181 +++++++++++++++++++++++++++++++++++
> mm/oom_kill.c | 72 ++++++++++---
> 4 files changed, 262 insertions(+), 20 deletions(-)
>
> diff -puN include/linux/memcontrol.h~mm-oom-cgroup-aware-oom-killer include/linux/memcontrol.h
> --- a/include/linux/memcontrol.h~mm-oom-cgroup-aware-oom-killer
> +++ a/include/linux/memcontrol.h
> @@ -35,6 +35,7 @@ struct mem_cgroup;
> struct page;
> struct mm_struct;
> struct kmem_cache;
> +struct oom_control;
>
> /* Cgroup-specific page state, on top of universal node page state */
> enum memcg_stat_item {
> @@ -344,6 +345,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
> return css ? container_of(css, struct mem_cgroup, css) : NULL;
> }
>
> +static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> +{
> + css_put(&memcg->css);
> +}
> +
> #define mem_cgroup_from_counter(counter, member) \
> container_of(counter, struct mem_cgroup, member)
>
> @@ -482,6 +488,8 @@ static inline bool task_in_memcg_oom(str
>
> bool mem_cgroup_oom_synchronize(bool wait);
>
> +bool mem_cgroup_select_oom_victim(struct oom_control *oc);
> +
> #ifdef CONFIG_MEMCG_SWAP
> extern int do_swap_account;
> #endif
> @@ -781,6 +789,10 @@ static inline bool task_in_mem_cgroup(st
> return true;
> }
>
> +static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> +{
> +}
> +
> static inline struct mem_cgroup *
> mem_cgroup_iter(struct mem_cgroup *root,
> struct mem_cgroup *prev,
> @@ -973,6 +985,11 @@ static inline
> void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
> {
> }
> +
> +static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)
> +{
> + return false;
> +}
> #endif /* CONFIG_MEMCG */
>
> /* idx can be of type enum memcg_stat_item or node_stat_item */
> diff -puN include/linux/oom.h~mm-oom-cgroup-aware-oom-killer include/linux/oom.h
> --- a/include/linux/oom.h~mm-oom-cgroup-aware-oom-killer
> +++ a/include/linux/oom.h
> @@ -10,6 +10,13 @@
> #include <linux/sched/coredump.h> /* MMF_* */
> #include <linux/mm.h> /* VM_FAULT* */
>
> +
> +/*
> + * Special value returned by victim selection functions to indicate
> + * that are inflight OOM victims.
> + */
> +#define INFLIGHT_VICTIM ((void *)-1UL)
> +
> struct zonelist;
> struct notifier_block;
> struct mem_cgroup;
> @@ -40,7 +47,8 @@ struct oom_control {
>
> /* Used by oom implementation, do not set */
> unsigned long totalpages;
> - struct task_struct *chosen;
> + struct task_struct *chosen_task;
> + struct mem_cgroup *chosen_memcg;
> unsigned long chosen_points;
> };
>
> @@ -102,6 +110,8 @@ extern void oom_killer_enable(void);
>
> extern struct task_struct *find_lock_task_mm(struct task_struct *p);
>
> +extern int oom_evaluate_task(struct task_struct *task, void *arg);
> +
> /* sysctls */
> extern int sysctl_oom_dump_tasks;
> extern int sysctl_oom_kill_allocating_task;
> diff -puN mm/memcontrol.c~mm-oom-cgroup-aware-oom-killer mm/memcontrol.c
> --- a/mm/memcontrol.c~mm-oom-cgroup-aware-oom-killer
> +++ a/mm/memcontrol.c
> @@ -2664,6 +2664,187 @@ static inline bool memcg_has_children(st
> return ret;
> }
>
> +static long memcg_oom_badness(struct mem_cgroup *memcg,
> + const nodemask_t *nodemask,
> + unsigned long totalpages)
> +{
> + long points = 0;
> + int nid;
> + pg_data_t *pgdat;
> +
> + for_each_node_state(nid, N_MEMORY) {
> + if (nodemask && !node_isset(nid, *nodemask))
> + continue;
> +
> + points += mem_cgroup_node_nr_lru_pages(memcg, nid,
> + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
> +
> + pgdat = NODE_DATA(nid);
> + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
> + NR_SLAB_UNRECLAIMABLE);
> + }
> +
> + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
> + (PAGE_SIZE / 1024);
> + points += memcg_page_state(memcg, MEMCG_SOCK);
> + points += memcg_page_state(memcg, MEMCG_SWAP);
> +
> + return points;
> +}
> +
> +/*
> + * Checks if the given memcg is a valid OOM victim and returns a number,
> + * which means the folowing:
> + * -1: there are inflight OOM victim tasks, belonging to the memcg
> + * 0: memcg is not eligible, e.g. all belonging tasks are protected
> + * by oom_score_adj set to OOM_SCORE_ADJ_MIN
> + * >0: memcg is eligible, and the returned value is an estimation
> + * of the memory footprint
> + */
> +static long oom_evaluate_memcg(struct mem_cgroup *memcg,
> + const nodemask_t *nodemask,
> + unsigned long totalpages)
> +{
> + struct css_task_iter it;
> + struct task_struct *task;
> + int eligible = 0;
> +
> + /*
> + * Root memory cgroup is a special case:
> + * we don't have necessary stats to evaluate it exactly as
> + * leaf memory cgroups, so we approximate it's oom_score
> + * by summing oom_score of all belonging tasks, which are
> + * owners of their mm structs.
> + *
> + * If there are inflight OOM victim tasks inside
> + * the root memcg, we return -1.
> + */
> + if (memcg == root_mem_cgroup) {
> + struct css_task_iter it;
> + struct task_struct *task;
> + long score = 0;
> +
> + css_task_iter_start(&memcg->css, 0, &it);
> + while ((task = css_task_iter_next(&it))) {
> + if (tsk_is_oom_victim(task) &&
> + !test_bit(MMF_OOM_SKIP,
> + &task->signal->oom_mm->flags)) {
> + score = -1;
> + break;
> + }
> +
> + task_lock(task);
> + if (!task->mm || task->mm->owner != task) {
> + task_unlock(task);
> + continue;
> + }
> + task_unlock(task);
> +
> + score += oom_badness(task, memcg, nodemask,
> + totalpages);
> + }
> + css_task_iter_end(&it);
> +
> + return score;
> + }
> +
> + /*
> + * Memcg is OOM eligible if there are OOM killable tasks inside.
> + *
> + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN
> + * as unkillable.
> + *
> + * If there are inflight OOM victim tasks inside the memcg,
> + * we return -1.
> + */
> + css_task_iter_start(&memcg->css, 0, &it);
> + while ((task = css_task_iter_next(&it))) {
> + if (!eligible &&
> + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
> + eligible = 1;
> +
> + if (tsk_is_oom_victim(task) &&
> + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
> + eligible = -1;
> + break;
> + }
> + }
> + css_task_iter_end(&it);
> +
> + if (eligible <= 0)
> + return eligible;
> +
> + return memcg_oom_badness(memcg, nodemask, totalpages);
> +}
> +
> +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
> +{
> + struct mem_cgroup *iter;
> +
> + oc->chosen_memcg = NULL;
> + oc->chosen_points = 0;
> +
> + /*
> + * The oom_score is calculated for leaf memory cgroups (including
> + * the root memcg).
> + */
> + rcu_read_lock();
> + for_each_mem_cgroup_tree(iter, root) {
> + long score;
> +
> + if (memcg_has_children(iter) && iter != root_mem_cgroup)
> + continue;
> +
> + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages);
> +
> + /*
> + * Ignore empty and non-eligible memory cgroups.
> + */
> + if (score == 0)
> + continue;
> +
> + /*
> + * If there are inflight OOM victims, we don't need
> + * to look further for new victims.
> + */
> + if (score == -1) {
> + oc->chosen_memcg = INFLIGHT_VICTIM;
> + mem_cgroup_iter_break(root, iter);
> + break;
> + }
> +
> + if (score > oc->chosen_points) {
> + oc->chosen_points = score;
> + oc->chosen_memcg = iter;
> + }
> + }
> +
> + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)
> + css_get(&oc->chosen_memcg->css);
> +
> + rcu_read_unlock();
> +}
> +
> +bool mem_cgroup_select_oom_victim(struct oom_control *oc)
> +{
> + struct mem_cgroup *root;
> +
> + if (mem_cgroup_disabled())
> + return false;
> +
> + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> + return false;
> +
> + if (oc->memcg)
> + root = oc->memcg;
> + else
> + root = root_mem_cgroup;
> +
> + select_victim_memcg(root, oc);
> +
> + return oc->chosen_memcg;
> +}
> +
> /*
> * Reclaims as many pages from the given memcg as possible.
> *
> diff -puN mm/oom_kill.c~mm-oom-cgroup-aware-oom-killer mm/oom_kill.c
> --- a/mm/oom_kill.c~mm-oom-cgroup-aware-oom-killer
> +++ a/mm/oom_kill.c
> @@ -309,7 +309,7 @@ static enum oom_constraint constrained_a
> return CONSTRAINT_NONE;
> }
>
> -static int oom_evaluate_task(struct task_struct *task, void *arg)
> +int oom_evaluate_task(struct task_struct *task, void *arg)
> {
> struct oom_control *oc = arg;
> unsigned long points;
> @@ -343,26 +343,26 @@ static int oom_evaluate_task(struct task
> goto next;
>
> /* Prefer thread group leaders for display purposes */
> - if (points == oc->chosen_points && thread_group_leader(oc->chosen))
> + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
> goto next;
> select:
> - if (oc->chosen)
> - put_task_struct(oc->chosen);
> + if (oc->chosen_task)
> + put_task_struct(oc->chosen_task);
> get_task_struct(task);
> - oc->chosen = task;
> + oc->chosen_task = task;
> oc->chosen_points = points;
> next:
> return 0;
> abort:
> - if (oc->chosen)
> - put_task_struct(oc->chosen);
> - oc->chosen = (void *)-1UL;
> + if (oc->chosen_task)
> + put_task_struct(oc->chosen_task);
> + oc->chosen_task = INFLIGHT_VICTIM;
> return 1;
> }
>
> /*
> * Simple selection loop. We choose the process with the highest number of
> - * 'points'. In case scan was aborted, oc->chosen is set to -1.
> + * 'points'. In case scan was aborted, oc->chosen_task is set to -1.
> */
> static void select_bad_process(struct oom_control *oc)
> {
> @@ -912,7 +912,7 @@ static void __oom_kill_process(struct ta
>
> static void oom_kill_process(struct oom_control *oc, const char *message)
> {
> - struct task_struct *p = oc->chosen;
> + struct task_struct *p = oc->chosen_task;
> unsigned int points = oc->chosen_points;
> struct task_struct *victim = p;
> struct task_struct *child;
> @@ -973,6 +973,27 @@ static void oom_kill_process(struct oom_
> __oom_kill_process(victim);
> }
>
> +static bool oom_kill_memcg_victim(struct oom_control *oc)
> +{
> +
> + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)
> + return oc->chosen_memcg;
> +
> + /* Kill a task in the chosen memcg with the biggest memory footprint */
> + oc->chosen_points = 0;
> + oc->chosen_task = NULL;
> + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
> +
> + if (oc->chosen_task == NULL || oc->chosen_task == INFLIGHT_VICTIM)
> + goto out;
> +
> + __oom_kill_process(oc->chosen_task);
> +
> +out:
> + mem_cgroup_put(oc->chosen_memcg);
> + return oc->chosen_task;
> +}
> +
> /*
> * Determines whether the kernel must panic because of the panic_on_oom sysctl.
> */
> @@ -1025,6 +1046,7 @@ bool out_of_memory(struct oom_control *o
> {
> unsigned long freed = 0;
> enum oom_constraint constraint = CONSTRAINT_NONE;
> + bool delay = false; /* if set, delay next allocation attempt */
>
> if (oom_killer_disabled)
> return false;
> @@ -1069,27 +1091,39 @@ bool out_of_memory(struct oom_control *o
> current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
> current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
> get_task_struct(current);
> - oc->chosen = current;
> + oc->chosen_task = current;
> oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
> return true;
> }
>
> + if (mem_cgroup_select_oom_victim(oc)) {
> + if (oom_kill_memcg_victim(oc)) {
> + delay = true;
> + goto out;
> + }
> + }

It's probably better to join two conditions with &&:
if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
delay = true;
goto out;
}


> +
> select_bad_process(oc);
> /* Found nothing?!?! Either we hang forever, or we panic. */
> - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> dump_header(oc, NULL);
> panic("Out of memory and no killable processes...\n");
> }
> - if (oc->chosen && oc->chosen != (void *)-1UL) {
> + if (oc->chosen_task && oc->chosen_task != (void *)-1UL) {

It's better to replace "(void *)-1UL" here with "INFLIGHT_VICTIM"
to be consistent:
if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) ...

> oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
> "Memory cgroup out of memory");

--

Also, I've sent a couple of small fixes, requested by Michal:

1) https://lkml.org/lkml/2017/12/1/569
(mm, oom, docs: document groupoom mount option)

2) https://lkml.org/lkml/2017/12/1/568
(mm, oom: return error on access to memory.oom_group if groupoom is disabled)

Can, you, please, pull them?


Thank you!


Roman