[RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.

From: Shawn Landden
Date: Mon Nov 20 2017 - 23:50:12 EST


See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

v2:
- switch to prctl, add memcg support

v3:
- use <linux/wait.h>
- run the idle "death row" check in out_of_memory() after constraint checking
---
fs/eventpoll.c | 27 ++++++++++++++++++++
fs/proc/array.c | 7 ++++++
include/linux/memcontrol.h | 3 +++
include/linux/oom.h | 4 +++
include/linux/sched.h | 1 +
include/uapi/linux/prctl.h | 4 +++
kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/exit.c | 1 +
kernel/sys.c | 9 +++++++
mm/memcontrol.c | 2 ++
mm/oom_kill.c | 47 +++++++++++++++++++++++++++++++++++
11 files changed, 166 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..745662f9a7e1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,8 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>
+#include <linux/memcontrol.h>
+#include <linux/oom.h>

/*
* LOCKING:
@@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+ DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
+ DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback);
+
+ if (current->oom_target) {
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *mcg;
+
+ mcg = mem_cgroup_from_task(current);
+ if (mcg)
+ add_wait_queue(&mcg->oom_target, &oom_target_wait_mcg);
+#endif
+ add_wait_queue(oom_target_get_wait(), &oom_target_wait);
+ }

if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;

+ if (current->oom_target) {
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *mcg;
+
+ mcg = mem_cgroup_from_task(current);
+ if (mcg)
+ remove_wait_queue(&mcg->oom_target,
+ &oom_target_wait_mcg);
+#endif
+ remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
+ }
+
return res;
}

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9390032a11e1..1954ae87cb88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_putc(m, '\n');
}

+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+ seq_put_decimal_ull(m, "Idle:\t", p->oom_target); /* "Idle:" field: p's oom_target bit, as stored by prctl(PR_SET_IDLE) */
+ seq_putc(m, '\n'); /* terminate the "Idle:" line */
+}
+
static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
{
@@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_sig(m, task);
task_cap(m, task);
task_seccomp(m, task);
+ task_idle(m, task);
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..02eb92e7eff5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
+#include <linux/wait.h>

struct mem_cgroup;
struct page;
@@ -261,6 +262,8 @@ struct mem_cgroup {
struct list_head event_list;
spinlock_t event_list_lock;

+ wait_queue_head_t oom_target;
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..88acea9e0a59 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -102,6 +102,10 @@ extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

+extern void exit_oom_target(void);
+struct wait_queue_head *oom_target_get_wait(void);
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..51b0e5987e8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -652,6 +652,7 @@ struct task_struct {
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif
+ unsigned oom_target:1;

unsigned long atomic_flags; /* Flags requiring atomic access. */

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b640071421f7..94868317c6f2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,4 +198,8 @@ struct prctl_mm_map {
# define PR_CAP_AMBIENT_LOWER 3
# define PR_CAP_AMBIENT_CLEAR_ALL 4

+#define PR_SET_IDLE 48 /* arg2 must be 0 or PR_IDLE_MODE_KILLME, otherwise -EINVAL */
+#define PR_GET_IDLE 49 /* returns the calling task's current idle mode (0 or 1) */
+# define PR_IDLE_MODE_KILLME 1 /* task may be sent SIGKILL from the OOM path while it waits in ep_poll() */
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..081bcd84a8d0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,8 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <net/sock.h>
+#include <linux/oom.h>
+#include <linux/memcontrol.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
@@ -756,6 +758,9 @@ static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
bool use_mg_tasks)
{
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *mcg;
+#endif
lockdep_assert_held(&css_set_lock);

if (to_cset && !css_set_populated(to_cset))
@@ -779,6 +784,35 @@ static void css_set_move_task(struct task_struct *task,
css_task_iter_advance(it);

list_del_init(&task->cg_list);
+#ifdef CONFIG_MEMCG
+ /* dequeue from memcg->oom_target
+ * TODO: this is O(n), add rb-tree to make it O(logn)
+ */
+ mcg = mem_cgroup_from_task(task);
+ if (mcg) {
+ struct wait_queue_entry *wait;
+
+ spin_lock(&mcg->oom_target.lock);
+ if (!waitqueue_active(&mcg->oom_target))
+ goto empty_from;
+ wait = list_first_entry(&mcg->oom_target.head,
+ wait_queue_entry_t, entry);
+ do {
+ struct list_head *list;
+
+ if (wait->private == task)
+ __remove_wait_queue(&mcg->oom_target,
+ wait);
+ list = wait->entry.next;
+ if (list_is_last(list, &mcg->oom_target.head))
+ break;
+ wait = list_entry(list,
+ struct wait_queue_entry, entry);
+ } while (1);
+empty_from:
+ spin_unlock(&mcg->oom_target.lock);
+ }
+#endif
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
} else {
@@ -797,6 +831,33 @@ static void css_set_move_task(struct task_struct *task,
rcu_assign_pointer(task->cgroups, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
+#ifdef CONFIG_MEMCG
+ /* enqueue on the destination memcg's oom_target */
+ mcg = mem_cgroup_from_task(task);
+ if (mcg) {
+ struct wait_queue_entry *wait;
+
+ spin_lock(&mcg->oom_target.lock);
+ if (!waitqueue_active(&mcg->oom_target))
+ goto empty_to;
+ wait = list_first_entry(&mcg->oom_target.head,
+ wait_queue_entry_t, entry);
+ do {
+ struct list_head *list;
+
+ if (wait->private == task)
+ __add_wait_queue(&mcg->oom_target,
+ wait);
+ list = wait->entry.next;
+ if (list_is_last(list, &mcg->oom_target.head))
+ break;
+ wait = list_entry(list,
+ struct wait_queue_entry, entry);
+ } while (1);
+empty_to:
+ spin_unlock(&mcg->oom_target.lock);
+ }
+#endif
}
}

diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..2788fbdae267 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
+#include <linux/eventpoll.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..e1eb049a85e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SET_IDLE:
+ if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
+ return -EINVAL;
+ me->oom_target = arg2;
+ error = 0;
+ break;
+ case PR_GET_IDLE:
+ error = me->oom_target;
+ break;
default:
error = -EINVAL;
break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..a4e3b93aeccd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4300,6 +4300,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memory_cgrp_subsys.broken_hierarchy = true;
}

+ init_waitqueue_head(&memcg->oom_target);
+
/* The following stuff does not apply to the root */
if (!parent) {
root_mem_cgroup = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..c5d8f5a716bc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,9 @@
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
+#include <linux/wait.h>
+#include <linux/memcontrol.h>

#include <asm/tlb.h>
#include "internal.h"
@@ -54,6 +57,23 @@ int sysctl_oom_dump_tasks = 1;

DEFINE_MUTEX(oom_lock);

+static DECLARE_WAIT_QUEUE_HEAD(oom_target);
+
+/* Cleanup hook for an exiting PR_IDLE_MODE_KILLME task (global oom_target queue only).
+ * NOTE(review): no call site is visible in this patch; kernel/exit.c only gains an #include.
+ */
+void exit_oom_target(void)
+{
+ DECLARE_WAITQUEUE(wait, current); /* fresh on-stack entry whose ->private is current */
+
+ remove_wait_queue(&oom_target, &wait); /* NOTE(review): 'wait' was never queued in this function - confirm removing it is safe */
+}
+
+struct wait_queue_head *oom_target_get_wait(void)
+{
+	return &oom_target;	/* global (non-memcg) wait queue of PR_IDLE_MODE_KILLME tasks */
+}
+
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
@@ -994,6 +1014,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

+/* Wait-queue callback for the oom_target queues: SIGKILL the task in wait->private; returns 0.
+ * SIGKILL is used instead of the oom killer so as to cleanly interrupt ep_poll().
+ */
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+	struct task_struct *victim = wait->private;
+
+	pr_info("Killing pid %d from prctl(PR_SET_IDLE) death row.\n", victim->pid);
+	send_sig(SIGKILL, victim, 1);
+	return 0;
+}
+
/**
* out_of_memory - kill the "best" process when we run out of memory
* @oc: pointer to struct oom_control
@@ -1007,6 +1039,7 @@ bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
+ wait_queue_head_t *w;

if (oom_killer_disabled)
return false;
@@ -1056,6 +1089,20 @@ bool out_of_memory(struct oom_control *oc)
return true;
}

+ /*
+ * Check death row for current memcg or global.
+ */
+#ifdef CONFIG_MEMCG
+ if (is_memcg_oom(oc))
+ w = &oc->memcg->oom_target;
+ else
+#endif
+ w = oom_target_get_wait();
+ if (waitqueue_active(w)) {
+ wake_up(w);
+ return true;
+ }
+
select_bad_process(oc);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
--
2.14.1