[RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)

From: Shawn Landden
Date: Wed Nov 01 2017 - 01:32:56 EST


It is common for services to be stateless around their main event loop.
If a process passes the EPOLL_KILLME flag to epoll_wait5() then it
signals to the kernel that epoll_wait5() may not complete, and the kernel
may send SIGKILL if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/killme

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/eventpoll.c | 74 +++++++++++++++++++++++++++++++++-
include/linux/eventpoll.h | 2 +
include/linux/sched.h | 3 ++
include/uapi/asm-generic/unistd.h | 5 ++-
include/uapi/linux/eventpoll.h | 3 ++
kernel/exit.c | 2 +
mm/oom_kill.c | 17 ++++++++
9 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..040e5d02bdcc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+385 i386 epoll_wait5 sys_epoll_wait5
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..c72802e8cf65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 common epoll_wait5 sys_epoll_wait5

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..76d1c91d940b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,14 @@ static LIST_HEAD(visited_list);
*/
static LIST_HEAD(tfile_check_list);

+static LIST_HEAD(deathrow_q);
+static long deathrow_len __read_mostly;
+
+/* TODO: Can this lock be removed by using atomic instructions to update
+ * queue?
+ */
+static DEFINE_MUTEX(deathrow_mutex);
+
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>
@@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = {
.extra1 = &zero,
.extra2 = &long_max,
},
+ {
+ .procname = "deathrow_size",
+ .data = &deathrow_len,
+ .maxlen = sizeof(deathrow_len),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &long_max,
+ },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
+ *
+ * A flags argument cannot be added to epoll_pwait cause it already has
+ * the maximum number of arguments (6). Can this be fixed?
*/
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout)
+SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout, int, flags)
{
int error;
struct fd f;
@@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
*/
ep = f.file->private_data;

+ /* Check the EPOLL_* constants for conflicts. */
+ BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC);
+
+ if (flags & ~EPOLL_KILLME)
+ return -EINVAL;
+
+ if (flags & EPOLL_KILLME) {
+ /* Put process on death row. */
+ mutex_lock(&deathrow_mutex);
+ deathrow_len++;
+ list_add(&current->se.deathrow, &deathrow_q);
+ current->se.on_deathrow = 1;
+ mutex_unlock(&deathrow_mutex);
+ }
+
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);

+ if (flags & EPOLL_KILLME) {
+ /* Remove process from death row. */
+ mutex_lock(&deathrow_mutex);
+ current->se.on_deathrow = 0;
+ list_del(&current->se.deathrow);
+ deathrow_len--;
+ mutex_unlock(&deathrow_mutex);
+ }
+
error_fput:
fdput(f);
return error;
}

+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout)
+{
+ return sys_epoll_wait5(epfd, events, maxevents, timeout, 0);
+}
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
@@ -2297,6 +2347,26 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
}
#endif

+/* Clean up after a EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+int exit_killme(void)
+{
+ if (current->se.on_deathrow) {
+ mutex_lock(&deathrow_mutex);
+ current->se.on_deathrow = 0;
+ list_del(&current->se.deathrow);
+ mutex_unlock(&deathrow_mutex);
+ }
+
+ return 0;
+}
+
+struct list_head *eventpoll_deathrow_list(void)
+{
+ return &deathrow_q;
+}
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 2f14ac73d01d..f1e28d468de5 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -20,6 +20,8 @@
/* Forward declarations to avoid compiler errors */
struct file;

+int exit_killme(void);
+struct list_head *eventpoll_deathrow_list(void);

#ifdef CONFIG_EPOLL

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..66462bf27a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
struct list_head group_node;
unsigned int on_rq;

+ unsigned on_deathrow:1;
+ struct list_head deathrow;
+
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a5eb51..843553a39388 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -893,8 +893,11 @@ __SYSCALL(__NR_fork, sys_fork)
__SYSCALL(__NR_fork, sys_ni_syscall)
#endif /* CONFIG_MMU */

+#define __NR_epoll_wait5 1080
+__SYSCALL(__NR_epoll_wait5, sys_epoll_wait5)
+
#undef __NR_syscalls
-#define __NR_syscalls (__NR_fork+1)
+#define __NR_syscalls (__NR_fork+2)

#endif /* __ARCH_WANT_SYSCALL_DEPRECATED */

diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index f4d5c998cc2b..ce150a3e7248 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,9 @@
/* Flags for epoll_create1. */
#define EPOLL_CLOEXEC O_CLOEXEC

+/* Flags for epoll_wait5. */
+#define EPOLL_KILLME 0x00000001
+
/* Valid opcodes to issue to sys_epoll_ctl() */
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..cd089bdc5b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
+#include <linux/eventpoll.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
exit_tasks_rcu_finish();
+ exit_killme();

lockdep_free_task(tsk);
do_task_dead();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..d6252772d593 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>

#include <asm/tlb.h>
#include "internal.h"
@@ -1029,6 +1030,22 @@ bool out_of_memory(struct oom_control *oc)
return true;
}

+ /*
+ * Check death row.
+ */
+ if (!list_empty(eventpoll_deathrow_list())) {
+ struct list_head *l = eventpoll_deathrow_list();
+ struct task_struct *ts = list_first_entry(l,
+ struct task_struct, se.deathrow);
+
+ pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+ ts->pid);
+
+ /* We use SIGKILL so as to cleanly interrupt ep_poll() */
+ kill_pid(task_pid(ts), SIGKILL, 1);
+ return true;
+ }
+
/*
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
--
2.15.0.rc2