Re: [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
From: Shawn Landden
Date: Mon Nov 20 2017 - 23:56:50 EST
On Mon, Nov 20, 2017 at 8:49 PM, Shawn Landden <slandden@xxxxxxxxx> wrote:
> See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
>
> Android uses this memory model for all programs, and having it in the
> kernel will enable integration with the page cache (not in this
> series).
>
> v2
> switch to prctl, memcg support
>
> v3
> use <linux/wait.h>
> put OOM after constraint checking
> ---
> fs/eventpoll.c | 27 ++++++++++++++++++++
> fs/proc/array.c | 7 ++++++
> include/linux/memcontrol.h | 3 +++
> include/linux/oom.h | 4 +++
> include/linux/sched.h | 1 +
> include/uapi/linux/prctl.h | 4 +++
> kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
> kernel/exit.c | 1 +
> kernel/sys.c | 9 +++++++
> mm/memcontrol.c | 2 ++
> mm/oom_kill.c | 47 +++++++++++++++++++++++++++++++++++
> 11 files changed, 166 insertions(+)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 2fabd19cdeea..745662f9a7e1 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -43,6 +43,8 @@
> #include <linux/compat.h>
> #include <linux/rculist.h>
> #include <net/busy_poll.h>
> +#include <linux/memcontrol.h>
> +#include <linux/oom.h>
>
> /*
> * LOCKING:
> @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> u64 slack = 0;
> wait_queue_entry_t wait;
> ktime_t expires, *to = NULL;
> + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
> + DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback);
> +
> + if (current->oom_target) {
> +#ifdef CONFIG_MEMCG
> + struct mem_cgroup *mcg;
> +
> + mcg = mem_cgroup_from_task(current);
> + if (mcg)
> + add_wait_queue(&mcg->oom_target, &oom_target_wait_mcg);
> +#endif
> + add_wait_queue(oom_target_get_wait(), &oom_target_wait);
> + }
>
> if (timeout > 0) {
> struct timespec64 end_time = ep_set_mstimeout(timeout);
> @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
> goto fetch_events;
>
> + if (current->oom_target) {
> +#ifdef CONFIG_MEMCG
> + struct mem_cgroup *mcg;
> +
> + mcg = mem_cgroup_from_task(current);
> + if (mcg)
> + remove_wait_queue(&mcg->oom_target,
> + &oom_target_wait_mcg);
> +#endif
> + remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
> + }
> +
> return res;
> }
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 9390032a11e1..1954ae87cb88 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
> seq_putc(m, '\n');
> }
>
> +static inline void task_idle(struct seq_file *m, struct task_struct *p)
> +{
> + seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
> + seq_putc(m, '\n');
> +}
> +
> static inline void task_context_switch_counts(struct seq_file *m,
> struct task_struct *p)
> {
> @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
> task_sig(m, task);
> task_cap(m, task);
> task_seccomp(m, task);
> + task_idle(m, task);
> task_cpus_allowed(m, task);
> cpuset_task_status_allowed(m, task);
> task_context_switch_counts(m, task);
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 69966c461d1c..02eb92e7eff5 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -30,6 +30,7 @@
> #include <linux/vmstat.h>
> #include <linux/writeback.h>
> #include <linux/page-flags.h>
> +#include <linux/wait.h>
>
> struct mem_cgroup;
> struct page;
> @@ -261,6 +262,8 @@ struct mem_cgroup {
> struct list_head event_list;
> spinlock_t event_list_lock;
>
> + wait_queue_head_t oom_target;
> +
> struct mem_cgroup_per_node *nodeinfo[0];
> /* WARNING: nodeinfo must be the last member here */
> };
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 01c91d874a57..88acea9e0a59 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
>
> extern struct task_struct *find_lock_task_mm(struct task_struct *p);
>
> +extern void exit_oom_target(void);
> +struct wait_queue_head *oom_target_get_wait(void);
> +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
> +
> /* sysctls */
> extern int sysctl_oom_dump_tasks;
> extern int sysctl_oom_kill_allocating_task;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fdf74f27acf1..51b0e5987e8c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -652,6 +652,7 @@ struct task_struct {
> /* disallow userland-initiated cgroup migration */
> unsigned no_cgroup_migration:1;
> #endif
> + unsigned oom_target:1;
>
> unsigned long atomic_flags; /* Flags requiring atomic access. */
>
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index b640071421f7..94868317c6f2 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -198,4 +198,8 @@ struct prctl_mm_map {
> # define PR_CAP_AMBIENT_LOWER 3
> # define PR_CAP_AMBIENT_CLEAR_ALL 4
>
> +#define PR_SET_IDLE 48
> +#define PR_GET_IDLE 49
> +# define PR_IDLE_MODE_KILLME 1
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 44857278eb8a..081bcd84a8d0 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -55,6 +55,8 @@
> #include <linux/nsproxy.h>
> #include <linux/file.h>
> #include <net/sock.h>
> +#include <linux/oom.h>
> +#include <linux/memcontrol.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/cgroup.h>
> @@ -756,6 +758,9 @@ static void css_set_move_task(struct task_struct *task,
> struct css_set *from_cset, struct css_set *to_cset,
> bool use_mg_tasks)
> {
> +#ifdef CONFIG_MEMCG
> + struct mem_cgroup *mcg;
> +#endif
> lockdep_assert_held(&css_set_lock);
>
> if (to_cset && !css_set_populated(to_cset))
> @@ -779,6 +784,35 @@ static void css_set_move_task(struct task_struct *task,
> css_task_iter_advance(it);
>
> list_del_init(&task->cg_list);
> +#ifdef CONFIG_MEMCG
> + /* dequeue from memcg->oom_target
Ahh, this part is all broken. Sorry for the noise.
> + * TODO: this is O(n), add rb-tree to make it O(logn)
> + */
> + mcg = mem_cgroup_from_task(task);
> + if (mcg) {
> + struct wait_queue_entry *wait;
> +
> + spin_lock(&mcg->oom_target.lock);
> + if (!waitqueue_active(&mcg->oom_target))
> + goto empty_from;
> + wait = list_first_entry(&mcg->oom_target.head,
> + wait_queue_entry_t, entry);
> + do {
> + struct list_head *list;
> +
> + if (wait->private == task)
> + __remove_wait_queue(&mcg->oom_target,
> + wait);
> + list = wait->entry.next;
> + if (list_is_last(list, &mcg->oom_target.head))
> + break;
> + wait = list_entry(list,
> + struct wait_queue_entry, entry);
> + } while (1);
> +empty_from:
> + spin_unlock(&mcg->oom_target.lock);
> + }
> +#endif
> if (!css_set_populated(from_cset))
> css_set_update_populated(from_cset, false);
> } else {
> @@ -797,6 +831,33 @@ static void css_set_move_task(struct task_struct *task,
> rcu_assign_pointer(task->cgroups, to_cset);
> list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
> &to_cset->tasks);
> +#ifdef CONFIG_MEMCG
> + /* dequeue from memcg->oom_target */
> + mcg = mem_cgroup_from_task(task);
> + if (mcg) {
> + struct wait_queue_entry *wait;
> +
> + spin_lock(&mcg->oom_target.lock);
> + if (!waitqueue_active(&mcg->oom_target))
> + goto empty_to;
> + wait = list_first_entry(&mcg->oom_target.head,
> + wait_queue_entry_t, entry);
> + do {
> + struct list_head *list;
> +
> + if (wait->private == task)
> + __add_wait_queue(&mcg->oom_target,
> + wait);
> + list = wait->entry.next;
> + if (list_is_last(list, &mcg->oom_target.head))
> + break;
> + wait = list_entry(list,
> + struct wait_queue_entry, entry);
> + } while (1);
> +empty_to:
> + spin_unlock(&mcg->oom_target.lock);
> + }
> +#endif
> }
> }
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f6cad39f35df..2788fbdae267 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -62,6 +62,7 @@
> #include <linux/random.h>
> #include <linux/rcuwait.h>
> #include <linux/compat.h>
> +#include <linux/eventpoll.h>
>
> #include <linux/uaccess.h>
> #include <asm/unistd.h>
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 524a4cb9bbe2..e1eb049a85e6 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> case PR_GET_FP_MODE:
> error = GET_FP_MODE(me);
> break;
> + case PR_SET_IDLE:
> + if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
> + return -EINVAL;
> + me->oom_target = arg2;
> + error = 0;
> + break;
> + case PR_GET_IDLE:
> + error = me->oom_target;
> + break;
> default:
> error = -EINVAL;
> break;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 661f046ad318..a4e3b93aeccd 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4300,6 +4300,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> memory_cgrp_subsys.broken_hierarchy = true;
> }
>
> + init_waitqueue_head(&memcg->oom_target);
> +
> /* The following stuff does not apply to the root */
> if (!parent) {
> root_mem_cgroup = memcg;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index dee0f75c3013..c5d8f5a716bc 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -41,6 +41,9 @@
> #include <linux/kthread.h>
> #include <linux/init.h>
> #include <linux/mmu_notifier.h>
> +#include <linux/eventpoll.h>
> +#include <linux/wait.h>
> +#include <linux/memcontrol.h>
>
> #include <asm/tlb.h>
> #include "internal.h"
> @@ -54,6 +57,23 @@ int sysctl_oom_dump_tasks = 1;
>
> DEFINE_MUTEX(oom_lock);
>
> +static DECLARE_WAIT_QUEUE_HEAD(oom_target);
> +
> +/* Clean up after a EPOLL_KILLME process quits.
> + * Called by kernel/exit.c.
> + */
> +void exit_oom_target(void)
> +{
> + DECLARE_WAITQUEUE(wait, current);
> +
> + remove_wait_queue(&oom_target, &wait);
> +}
> +
> +inline struct wait_queue_head *oom_target_get_wait()
> +{
> + return &oom_target;
> +}
> +
> #ifdef CONFIG_NUMA
> /**
> * has_intersects_mems_allowed() - check task eligiblity for kill
> @@ -994,6 +1014,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
> }
> EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>
> +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> + struct task_struct *ts = wait->private;
> +
> + /* We use SIGKILL instead of the oom killer
> + * so as to cleanly interrupt ep_poll()
> + */
> + pr_info("Killing pid %u from prctl(PR_SET_IDLE) death row.\n", ts->pid);
> + send_sig(SIGKILL, ts, 1);
> + return 0;
> +}
> +
> /**
> * out_of_memory - kill the "best" process when we run out of memory
> * @oc: pointer to struct oom_control
> @@ -1007,6 +1039,7 @@ bool out_of_memory(struct oom_control *oc)
> {
> unsigned long freed = 0;
> enum oom_constraint constraint = CONSTRAINT_NONE;
> + wait_queue_head_t *w;
>
> if (oom_killer_disabled)
> return false;
> @@ -1056,6 +1089,20 @@ bool out_of_memory(struct oom_control *oc)
> return true;
> }
>
> + /*
> + * Check death row for current memcg or global.
> + */
> +#ifdef CONFIG_MEMCG
> + if (is_memcg_oom(oc))
> + w = &oc->memcg->oom_target;
> + else
> +#endif
> + w = oom_target_get_wait();
> + if (waitqueue_active(w)) {
> + wake_up(w);
> + return true;
> + }
> +
> select_bad_process(oc);
> /* Found nothing?!?! Either we hang forever, or we panic. */
> if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> --
> 2.14.1
>