[RFC PATCH] pidfs: ensure consistent ENOENT/ESRCH reporting

From: Christian Brauner
Date: Wed Apr 09 2025 - 14:19:33 EST


In a prior patch series we tried to cleanly differentiate between:

(1) The task has already been reaped.
(2) The caller requested a pidfd for a thread-group leader but the pid
actually references a struct pid that isn't used as a thread-group
leader.

as this was causing issues for non-threaded workloads.

But there's cases where the current simple logic is wrong. Specifically,
if the pid was a leader pid and the check races with __unhash_process().

Stabilize this by using a sequence counter associated with tasklist_lock
and retry while we're in __unhash_process(). The seqcounter might be
useful independent of pidfs.

Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
include/linux/pid.h | 1 +
kernel/exit.c | 11 +++++++++++
kernel/fork.c | 22 ++++++++++++----------
kernel/pid.c | 1 +
4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 311ecebd7d56..b54a4c1ef602 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -65,6 +65,7 @@ struct pid
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
+ seqcount_rwlock_t pid_seq;
struct rcu_head rcu;
struct upid numbers[];
};
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..8050572fe682 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,17 +133,28 @@ struct release_task_post {
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
bool group_dead)
{
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
nr_threads--;
+
+ pid = task_pid(p);
+ raw_write_seqcount_begin(&pid->pid_seq);
detach_pid(post->pids, p, PIDTYPE_PID);
if (group_dead) {
detach_pid(post->pids, p, PIDTYPE_TGID);
detach_pid(post->pids, p, PIDTYPE_PGID);
detach_pid(post->pids, p, PIDTYPE_SID);
+ }
+ raw_write_seqcount_end(&pid->pid_seq);

+ if (group_dead) {
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
+
list_del_rcu(&p->thread_node);
}

diff --git a/kernel/fork.c b/kernel/fork.c
index 4a2080b968c8..1480bf6f5f38 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2109,24 +2109,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
int err = 0;
+ unsigned int seq;

- if (!(flags & PIDFD_THREAD)) {
+ do {
+ seq = raw_seqcount_begin(&pid->pid_seq);
/*
* If this is struct pid isn't used as a thread-group
* leader pid but the caller requested to create a
* thread-group leader pidfd then report ENOENT to the
* caller as a hint.
*/
- if (!pid_has_task(pid, PIDTYPE_TGID))
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
err = -ENOENT;
- }
-
- /*
- * If this wasn't a thread-group leader struct pid or the task
- * got reaped in the meantime report -ESRCH to userspace.
- */
- if (!pid_has_task(pid, PIDTYPE_PID))
- err = -ESRCH;
+ /*
+ * If this wasn't a thread-group leader struct pid or
+ * the task got reaped in the meantime report -ESRCH to
+ * userspace.
+ */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ err = -ESRCH;
+ } while (read_seqcount_retry(&pid->pid_seq, seq));
if (err)
return err;

diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817f..bbca61f62faa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -271,6 +271,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
+ seqcount_rwlock_init(&pid->pid_seq, &tasklist_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
pidfs_add_pid(pid);
--
2.47.2