Re: [PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()

From: Christian Brauner
Date: Thu Feb 01 2024 - 12:25:22 EST


On Wed, Jan 31, 2024 at 02:26:02PM +0100, Oleg Nesterov wrote:
> With this flag:
>
> - pidfd_open() doesn't require that the target task must be
> a thread-group leader
>
> - pidfd_poll() succeeds when the task exits and becomes a
> zombie (iow, passes exit_notify()), even if it is a leader
> and thread-group is not empty.
>
> This means that the behaviour of pidfd_poll(PIDFD_THREAD,
> pid-of-group-leader) is not well defined if it races with
> exec() from its sub-thread; pidfd_poll() can succeed or not
> depending on whether pidfd_task_exited() is called before
> or after exchange_tids().
>
> Perhaps we can improve this behaviour later, pidfd_poll()
> can probably take sig->group_exec_task into account. But
> this doesn't really differ from the case when the leader
> exits before other threads (so pidfd_poll() succeeds) and
> then another thread execs and pidfd_poll() will block again.
>
> thread_group_exited() is no longer used, perhaps it can die.
>
> Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
> Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
> ---
> fs/exec.c | 6 +++++-
> include/linux/pid.h | 3 ++-
> include/uapi/linux/pidfd.h | 3 ++-
> kernel/exit.c | 7 +++++++
> kernel/fork.c | 38 +++++++++++++++++++++++++++++++-------
> kernel/pid.c | 14 +++-----------
> kernel/signal.c | 6 ++++--
> 7 files changed, 54 insertions(+), 23 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 73e4045df271..0fd7e668c477 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
>
> BUG_ON(leader->exit_state != EXIT_ZOMBIE);
> leader->exit_state = EXIT_DEAD;
> -
> + /*
> + * leader and tsk exhanged their pids, the old pid dies,
> + * wake up the PIDFD_THREAD waiters.
> + */
> + do_notify_pidfd(leader);
> /*
> * We are going to release_task()->ptrace_unlink() silently,
> * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index e6a041cb8bac..8124d57752b9 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
>
> struct file;
>
> -extern struct pid *pidfd_pid(const struct file *file);
> +struct pid *pidfd_pid(const struct file *file);
> struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
> struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
> +void do_notify_pidfd(struct task_struct *task);
>
> static inline struct pid *get_pid(struct pid *pid)
> {
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 5406fbc13074..2e6461459877 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -7,6 +7,7 @@
> #include <linux/fcntl.h>
>
> /* Flags for pidfd_open(). */
> -#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_THREAD O_EXCL
>
> #endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index dfb963d2f862..493647fd7c07 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> kill_orphaned_pgrp(tsk->group_leader, NULL);
>
> tsk->exit_state = EXIT_ZOMBIE;
> + /*
> + * sub-thread or delay_group_leader(), wake up the
> + * PIDFD_THREAD waiters.
> + */
> + if (!thread_group_empty(tsk))
> + do_notify_pidfd(tsk);
> +
> if (unlikely(tsk->ptrace)) {
> int sig = thread_group_leader(tsk) &&
> thread_group_empty(tsk) &&
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 347641398f9d..bea32071fff2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -101,6 +101,7 @@
> #include <linux/user_events.h>
> #include <linux/iommu.h>
> #include <linux/rseq.h>
> +#include <uapi/linux/pidfd.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>
> seq_put_decimal_ll(m, "Pid:\t", nr);
>
> + /* TODO: report PIDFD_THREAD */

One more thing that came to my mind. We also support waitid(P_PIDFD,
pidfd). And I just looked through the code and I think it does the right
thing when we pass it a PIDFD_THREAD pidfd because wait_consider_task()
rejects task->exit_signal != SIGCHLD in eligible_child() and
CLONE_THREAD implies task->exit_signal == -1.