[v7 3/5] mm, oom: introduce oom_priority for memory cgroups

From: Roman Gushchin
Date: Mon Sep 04 2017 - 10:22:13 EST


Introduce a per-memory-cgroup oom_priority setting: an integer number,
which defines the order in which the OOM killer selects victim memory
cgroups.

The OOM killer prefers memory cgroups with a larger oom_priority,
provided they are populated with eligible tasks.

The oom_priority value is compared only among sibling cgroups.

If two or more sibling cgroups have the same oom_priority,
the decision is based on their memory footprint: the cgroup with
the larger oom_score is selected.

The root cgroup always has an oom_priority of 0, which cannot be changed:
the oom_priority control file is not created for the root cgroup.

Signed-off-by: Roman Gushchin <guro@xxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
Cc: kernel-team@xxxxxx
Cc: cgroups@xxxxxxxxxxxxxxx
Cc: linux-doc@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx
---
include/linux/memcontrol.h | 3 +++
mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5b5c2b89968e..73a0291948fd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,9 @@ struct mem_cgroup {
/* cached OOM score */
long oom_score;

+ /* OOM killer priority */
+ int oom_priority;
+
/* handle for "memory.events" */
struct cgroup_file events_file;

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 97813c56163b..d7dd293897ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2757,6 +2757,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
for (;;) {
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg = NULL;
+ int prio = INT_MIN;
long score = LONG_MIN;

css_for_each_child(css, &root->css) {
@@ -2768,7 +2769,12 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
if (iter->oom_score == 0)
continue;

- if (iter->oom_score > score) {
+ if (iter->oom_priority > prio) {
+ memcg = iter;
+ prio = iter->oom_priority;
+ score = iter->oom_score;
+ } else if (iter->oom_priority == prio &&
+ iter->oom_score > score) {
memcg = iter;
score = iter->oom_score;
}
@@ -2838,7 +2844,15 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
* For system-wide OOMs we should consider tasks in the root cgroup
* with oom_score larger than oc->chosen_points.
*/
- if (!oc->memcg) {
+ if (!oc->memcg && !(oc->chosen_memcg &&
+ oc->chosen_memcg->oom_priority > 0)) {
+ /*
+ * Root memcg has priority 0, so if chosen memcg has lower
+ * priority, any task in root cgroup is preferable.
+ */
+ if (oc->chosen_memcg && oc->chosen_memcg->oom_priority < 0)
+ oc->chosen_points = 0;
+
select_victim_root_cgroup_task(oc);

if (oc->chosen_task && oc->chosen_memcg) {
@@ -5480,6 +5494,31 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}

+static int memory_oom_priority_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); /* memcg resolved from the seq_file's css */
+
+ seq_printf(m, "%d\n", memcg->oom_priority); /* signed decimal followed by a newline */
+
+ return 0; /* this handler has no failure path */
+}
+
+static ssize_t memory_oom_priority_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); /* memcg resolved from the open file's css */
+ int oom_priority;
+ int err;
+
+ err = kstrtoint(strstrip(buf), 0, &oom_priority); /* surrounding whitespace stripped; base 0 = auto-detected base */
+ if (err)
+ return err; /* parse failure is returned to the writer unchanged */
+
+ memcg->oom_priority = oom_priority; /* any value that fits in an int is stored; no range check here */
+
+ return nbytes; /* the whole write is reported as consumed */
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5606,6 +5645,12 @@ static struct cftype memory_files[] = {
.write = memory_oom_group_write,
},
{
+ .name = "oom_priority",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_oom_priority_show,
+ .write = memory_oom_priority_write,
+ },
+ {
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_file),
--
2.13.5