[PATCH 2/5] pidfd: add pidfd_wait()

From: Christian Brauner
Date: Wed Jul 24 2019 - 10:47:42 EST


This adds the pidfd_wait() syscall.

One of the last remaining bits for the pidfd API is to make it possible
to wait on pidfds. With this syscall implemented, parts of userspace that
want to use this API can finally switch to managing processes completely
through pidfds if they so desire (cf. [1]).

The pidfd_wait() syscall does not allow scoping of the process
identified by the pidfd, i.e. it explicitly does not try to mirror the
behavior of: wait4(-1), wait4(0), waitid(P_ALL), waitid(P_PGID) etc. It
only allows for semantics equivalent to wait4(pid), waitid(P_PID). Users
that need scoping should rely on pid-based wait*() syscalls for now.

pidfd_wait() allows the caller to specify which state changes to wait
for. The states can be or-ed together and are passed in the states argument:
WEXITED Wait for children that have terminated.
WSTOPPED Wait for children that have been stopped by
delivery of a signal.
WCONTINUED Wait for (previously stopped) children that have
been resumed by delivery of SIGCONT.
WUNTRACED Return if a child has stopped.

The behavior of pidfd_wait() can be further modified by specifying the
following or-able options in the flags argument:
__WCLONE Only wait for a process that delivers no signal
or a different signal than SIGCHLD to the parent
on termination.
__WALL Wait for all children independent of whether or
not they deliver no signal or another signal
than SIGCHLD to the parent on termination.
__WNOTHREAD Do not wait for children of other threads in the
same thread-group.
WNOHANG Return immediately if no child has exited.
WNOWAIT Leave the child in a waitable state.

pidfd_wait() takes an additional siginfo_t argument. If it is non-NULL,
pidfd_wait() will fill in si_pid, si_uid, si_signo, si_status, and
si_code. The si_code field will be set to one of CLD_EXITED, CLD_KILLED,
CLD_DUMPED, CLD_STOPPED, CLD_TRAPPED, or CLD_CONTINUED.
Information about resource usage of the process in question is returned
in the struct rusage argument of pidfd_wait().

On success, pidfd_wait() will return the pid of the process the pidfd
referred to. On failure, a negative error code will be returned.

/* Prior approach */
The first implementation was based on a flag WPIDFD which got added to
the wait*() system calls. However, that involved passing the pidfd
through the pid_t pid argument and doing in-kernel type switching based
on the flag, which feels like a really unclean solution and overall like
a mishmash of two APIs. This is something we luckily have avoided so far
and I think we're better off in the long run if we keep it that way.

/* References */
[1]: https://github.com/systemd/systemd/issues/13101

Signed-off-by: Christian Brauner <christian@xxxxxxxxxx>
Cc: Arnd Bergmann <arnd@xxxxxxxx>
Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: David Howells <dhowells@xxxxxxxxxx>
Cc: Jann Horn <jannh@xxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: Aleksa Sarai <cyphar@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: linux-api@xxxxxxxxxxxxxxx
---
include/linux/pid.h | 5 +++
kernel/exit.c | 87 +++++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 8 +++++
kernel/signal.c | 7 ++--
4 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 2a83e434db9d..443cd4108943 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -72,6 +72,11 @@ extern struct pid init_struct_pid;

extern const struct file_operations pidfd_fops;

+struct file; /* forward declaration: only a pointer to struct file is used below */
+
+extern struct pid *pidfd_pid(const struct file *file); /* defined in kernel/fork.c by this patch */
+
+
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
diff --git a/kernel/exit.c b/kernel/exit.c
index 73392a455b72..8086c76e1959 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1738,3 +1738,90 @@ __weak void abort(void)
panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);
+
+static int copy_rusage_to_user_any(struct rusage *kru, struct rusage __user *ru)
+{ /* Copy *kru out to the user pointer @ru; pidfd_wait() maps any nonzero return to -EFAULT. */
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall()) /* compat syscall path: @ru is handled as a struct compat_rusage */
+ return put_compat_rusage(kru, (struct compat_rusage __user *)ru);
+#endif
+ return copy_to_user(ru, kru, sizeof(*kru)); /* native path: struct rusage copied as-is */
+}
+
+static int copy_siginfo_to_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info)
+{ /* Copy *kinfo out to the user pointer @info; pidfd_wait() maps any nonzero return to -EFAULT. */
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall()) /* compat syscall path: @info is handled as a struct compat_siginfo */
+ return copy_siginfo_to_user32(
+ (struct compat_siginfo __user *)info, kinfo);
+#endif
+ return copy_siginfo_to_user(info, kinfo); /* native path */
+}
+
+SYSCALL_DEFINE6(pidfd_wait, int, pidfd, int __user *, stat_addr, /* wait for a state change of the process @pidfd refers to */
+ siginfo_t __user *, info, struct rusage __user *, ru,
+ unsigned int, states, unsigned int, flags)
+{
+ long ret;
+ struct fd f;
+ struct pid *pid;
+ struct wait_opts wo;
+ struct rusage kru = {};
+ kernel_siginfo_t kinfo = {
+ .si_signo = 0,
+ };
+
+ if (pidfd < 0)
+ return -EINVAL;
+
+ if (states & ~(WEXITED | WSTOPPED | WCONTINUED | WUNTRACED)) /* reject unknown state bits */
+ return -EINVAL;
+
+ if (!(states & (WEXITED | WSTOPPED | WCONTINUED | WUNTRACED))) /* at least one state must be requested */
+ return -EINVAL;
+
+ if (flags & ~(__WNOTHREAD | __WCLONE | __WALL | WNOWAIT | WNOHANG)) /* reject unknown flag bits */
+ return -EINVAL;
+
+ f = fdget(pidfd);
+ if (!f.file)
+ return -EBADF;
+
+ pid = pidfd_pid(f.file); /* yields ERR_PTR(-EBADF) when the fd is not a pidfd */
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto out_fdput;
+ }
+
+ wo = (struct wait_opts){
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = pid,
+ .wo_flags = states | flags, /* states and flags are merged into a single option word */
+ .wo_info = info ? &kinfo : NULL, /* kernel-side buffers are only wired up when the */
+ .wo_rusage = ru ? &kru : NULL, /* caller passed the matching user pointer */
+ };
+
+ ret = do_wait(&wo);
+ if (ret > 0) { /* positive return: handled as the success case */
+ kinfo.si_signo = SIGCHLD;
+
+ if (stat_addr && put_user(wo.wo_stat, stat_addr)) {
+ ret = -EFAULT;
+ goto out_fdput; /* note: this skips the siginfo copy-out below */
+ }
+
+ if (ru && copy_rusage_to_user_any(&kru, ru)) {
+ ret = -EFAULT;
+ goto out_fdput; /* note: this skips the siginfo copy-out below */
+ }
+ } else {
+ kinfo.si_signo = 0; /* same value the initializer above already set */
+ }
+
+ if (info && copy_siginfo_to_user_any(&kinfo, info)) /* also reached when ret <= 0, with si_signo == 0 */
+ ret = -EFAULT;
+
+out_fdput:
+ fdput(f); /* pairs with the fdget() above */
+ return ret;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index d8ae0f1b4148..baaff6570517 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1743,6 +1743,14 @@ const struct file_operations pidfd_fops = {
#endif
};

+struct pid *pidfd_pid(const struct file *file)
+{ /* Return the struct pid stored in a pidfd file, or ERR_PTR(-EBADF) for any other file. */
+ if (file->f_op == &pidfd_fops) /* a file counts as a pidfd only if its f_op is pidfd_fops */
+ return file->private_data; /* pointer is returned as-is; no get_pid() is done here */
+
+ return ERR_PTR(-EBADF);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
diff --git a/kernel/signal.c b/kernel/signal.c
index 91b789dd6e72..2e567f64812f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3672,8 +3672,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)

static struct pid *pidfd_to_pid(const struct file *file)
{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
+ struct pid *pid;
+
+ pid = pidfd_pid(file); /* same f_op check as the removed lines, now shared via pidfd_pid() */
+ if (!IS_ERR(pid)) /* on error (not a pidfd) fall through to tgid_pidfd_to_pid() */
+ return pid;

return tgid_pidfd_to_pid(file);
}
--
2.22.0