[PATCH RFC v3 2/4] pidfd: add CLONE_PIDFD_AUTOKILL

From: Christian Brauner

Date: Tue Feb 17 2026 - 17:38:19 EST


Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
lifetime to the pidfd returned from clone3(). When the last reference to
the struct file created by clone3() is closed the kernel sends SIGKILL
to the child. A pidfd obtained via pidfd_open() for the same process
does not keep the child alive and does not trigger autokill - only the
specific struct file from clone3() has this property.

This is useful for container runtimes, service managers, and sandboxed
subprocess execution - any scenario where the child must die if the
parent crashes or abandons the pidfd.

CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
one to reap it would become a zombie). CLONE_THREAD is rejected because
autokill targets a process not a thread.

The clone3 pidfd is identified by storing a pointer to the struct file in
signal_struct.autokill_pidfd. The pidfs .release handler compares the
file being closed against this pointer and sends SIGKILL via
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only on match. Files
from pidfd_open() or open_by_handle_at() are distinct struct files and
will never match. dup()/fork() share the same struct file so they extend
the child's lifetime until the last reference drops.

Unlike pdeath_signal, autokill is not disarmed on exec or on credential
changes that cross privilege boundaries. Disarming it in those cases
would defeat the purpose of this whole endeavour.

Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
fs/pidfs.c | 16 ++++++++++++++++
include/linux/sched/signal.h | 3 +++
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 16 ++++++++++++++--
4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..b3891b2097eb 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -8,6 +8,8 @@
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
+#include <linux/sched/signal.h>
+#include <linux/signal.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
@@ -637,7 +639,21 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return open_namespace(ns_common);
}

+static int pidfs_file_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = inode->i_private; /* struct pid kept in the pidfs inode */
+ struct task_struct *task; /* PIDTYPE_TGID task for pid; NULL if none */
+ /* Autokill: SIGKILL only if @file is the pidfd recorded in autokill_pidfd. */
+ guard(rcu)(); /* NOTE(review): presumably keeps RCU held until return */
+ task = pid_task(pid, PIDTYPE_TGID);
+ if (task && READ_ONCE(task->signal->autokill_pidfd) == file)
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
+
+ return 0; /* always 0; the do_send_sig_info() result is not propagated */
+}
+
static const struct file_operations pidfs_file_operations = {
+ .release = pidfs_file_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index f842c86b806f..85a3de5c4030 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -134,6 +134,9 @@ struct signal_struct {
unsigned int has_child_subreaper:1;
unsigned int autoreap:1;

+ /* pidfd that triggers SIGKILL on close, or NULL */
+ const struct file *autokill_pidfd;
+
#ifdef CONFIG_POSIX_TIMERS

/* POSIX.1b Interval Timers */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 8a22ea640817..b1aea8a86e2f 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,6 +37,7 @@
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL /* Kill child when clone pidfd closes. */

/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
diff --git a/kernel/fork.c b/kernel/fork.c
index bc27dc10c309..7bcdba54c9a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2035,6 +2035,15 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}

+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);

- if (pidfile)
+ if (pidfile) {
+ if (clone_flags & CLONE_PIDFD_AUTOKILL)
+ p->signal->autokill_pidfd = pidfile;
fd_install(pidfd, pidfile);
+ }

proc_fork_connector(p);
sched_post_fork(p);
@@ -2909,7 +2921,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
- CLONE_AUTOREAP))
+ CLONE_AUTOREAP | CLONE_PIDFD_AUTOKILL))
return false;

/*

--
2.47.3