[PATCH v4 3/6] seccomp: add kernel-installed pinned-memfd redirect

From: Cong Wang

Date: Fri Jun 26 2026 - 21:23:18 EST


From: Cong Wang <cwang@xxxxxxxxxxxxxx>

Add SECCOMP_IOCTL_NOTIF_SEND_REDIRECT, which resumes a trapped syscall
(like SECCOMP_USER_NOTIF_FLAG_CONTINUE) with selected argument registers
substituted to point into a pin installed by
SECCOMP_IOCTL_NOTIF_PIN_INSTALL. This closes the user-notification TOCTOU
for fork+execve sandboxes: the data the kernel acts on lives in an
immutable, supervisor-controlled sealed mapping rather than in memory a
CLONE_VM peer can rewrite after the check.

Installing immutable mappings into and rewriting the argument registers of
another task is security-sensitive, so the feature is gated behind a new
SECCOMP_FILTER_FLAG_REDIRECT that a listener must declare at creation time
(it requires SECCOMP_FILTER_FLAG_NEW_LISTENER). Both
SECCOMP_IOCTL_NOTIF_PIN_INSTALL and SECCOMP_IOCTL_NOTIF_SEND_REDIRECT
require it. At most one redirect-capable filter may exist in a task's
filter chain (-EBUSY otherwise), so a redirected syscall has a single,
unambiguous one-deep register fixup.

The supervisor supplies an args_mask (which arg registers to replace), a
ptr_mask (which of those are pointers) and replacement values. Each
pointer substitution is validated by seccomp_pin_check(), which re-derives
authorization from the live mapping: the access
[args[i], args[i] + ptr_len[i]) must lie within a single VM_SEALED,
read-only VMA still backed by the named memfd. The kernel keeps no
bookkeeping of its own; after an execve or exit the VMA is gone and
validation simply fails with -EFAULT.

The kernel saves the trapped task's original arg registers into a small
heap record, writes the substituted values via syscall_set_arguments(),
and queues a task_work that restores the originals at user-mode return.
This keeps the syscall ABI guarantee that argument registers are unchanged
on return, for callers that expect register contents to survive across a
syscall. The restore is skipped after a successful execve, whose fresh
register frame must not be clobbered.

The task_work uses TWA_RESUME, not TWA_SIGNAL. TWA_SIGNAL sets
TIF_NOTIFY_SIGNAL, which would make signal_pending() true for the whole
redirected syscall (the work is queued before the target resumes), so an
interruptible syscall would bail out with -ERESTARTSYS before doing any
work, restart, re-trap and be redirected again -- a livelock. TWA_RESUME
does not feed signal_pending(), yet the restore still runs before signal
delivery and before any restart: get_signal() runs task_work_run() at its
top, before it dequeues a signal and before arch_do_signal_or_restart()
rewinds the instruction pointer. So on a restartable syscall
(-ERESTARTSYS, -ERESTARTNOINTR, -ERESTARTNOHAND) the original arguments
are back in pt_regs before the rewind; the syscall re-executes with the
original arguments, re-traps seccomp and is redirected again. The
supervisor therefore sees the syscall once per interruption and must
answer each notification the same way.

rt_sigreturn is refused (-EOPNOTSUPP): it restores the entire register
frame from the user signal stack, which the restore task_work would then
corrupt, and it takes no arguments to substitute anyway.

The redirect grants no new kernel-side capability: the syscall does only
what the trapped task could already have done with its own arguments, had
no peer been able to corrupt them. No per-syscall kernel-mode entrypoints
are added; the substituted syscall runs in the trapped task's context
against sealed pages whose contents are supervisor-controlled.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang@xxxxxxxxxxxxxx>
---
include/linux/seccomp.h | 7 +-
include/uapi/linux/seccomp.h | 74 +++++++++-
kernel/seccomp.c | 254 +++++++++++++++++++++++++++++++++++
3 files changed, 333 insertions(+), 2 deletions(-)

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index a91d1fc8a2b8..5d53f8fce508 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -10,7 +10,8 @@
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
- SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV | \
+ SECCOMP_FILTER_FLAG_REDIRECT)

/* sizeof() the first published struct seccomp_notif_addfd */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
@@ -21,6 +22,10 @@
#define SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER1 40 /* adds @offset */
#define SECCOMP_NOTIFY_PIN_INSTALL_SIZE_LATEST SECCOMP_NOTIFY_PIN_INSTALL_SIZE_VER1

+/* sizeof() the first published struct seccomp_notif_resp_redirect */
+#define SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_VER0 120
+#define SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_LATEST SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_VER0
+
#ifdef CONFIG_SECCOMP

#include <linux/thread_info.h>
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index cc34188f8aeb..d6888691633c 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -25,6 +25,12 @@
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
/* Received notifications wait in killable state (only respond to fatal signals) */
#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
+/*
+ * Declares that this listener's notifier may issue
+ * SECCOMP_IOCTL_NOTIF_PIN_INSTALL / SECCOMP_IOCTL_NOTIF_SEND_REDIRECT. At most
+ * one such filter may exist in a task's filter chain. Requires NEW_LISTENER.
+ */
+#define SECCOMP_FILTER_FLAG_REDIRECT (1UL << 6)

/*
* All BPF programs must return a 32-bit value.
@@ -139,7 +145,9 @@ struct seccomp_notif_addfd {

/**
* struct seccomp_notif_pin_install - have the kernel install a sealed
- * MAP_SHARED mapping of @memfd into the trapped task's mm at @target_addr.
+ * MAP_SHARED mapping of @memfd into the trapped task's mm at @target_addr,
+ * which SECCOMP_IOCTL_NOTIF_SEND_REDIRECT can then use as a target for
+ * substituted pointer arguments.
*
* The supervisor owns @memfd. The kernel installs the mapping into
* the trapped task's address space without target-side cooperation
@@ -198,6 +206,61 @@ struct seccomp_notif_pin_install {

#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)

+/*
+ * Valid flags for struct seccomp_notif_resp_redirect.
+ *
+ * SECCOMP_REDIRECT_FLAG_CONTINUE is the only flag defined and it is
+ * mandatory: SECCOMP_IOCTL_NOTIF_SEND_REDIRECT returns -EINVAL when it is
+ * missing or when any other flag bit is set.
+ */
+#define SECCOMP_REDIRECT_FLAG_CONTINUE (1UL << 0)
+
+/*
+ * Number of syscall argument registers a redirect response may
+ * substitute (matches struct seccomp_data::args[]). It sizes the args[] and
+ * ptr_len[] arrays of struct seccomp_notif_resp_redirect and bounds the bits
+ * accepted in its args_mask.
+ */
+#define SECCOMP_REDIRECT_ARGS 6
+
+/**
+ * struct seccomp_notif_resp_redirect - resume the trapped syscall with
+ * substituted arg-register values, optionally pointing into previously
+ * installed pinned-memfd regions.
+ *
+ * Like SECCOMP_USER_NOTIF_FLAG_CONTINUE the syscall actually runs, but the
+ * kernel first rewrites the arg registers selected by @args_mask. Each
+ * pointer substitution (@ptr_mask) is validated against the trapped task's
+ * current address space: the whole access [args[i], args[i] + ptr_len[i])
+ * must lie inside a single VM_SEALED, read-only mapping of @memfd. No per-pin
+ * bookkeeping is kept; authorization is re-derived from the live mapping, so
+ * a target that has exited or execve()d (its mapping gone) simply fails
+ * validation. Original registers are saved and restored when the task
+ * returns to user mode after the syscall, for ABI compliance - except after
+ * a successful execve, whose new register file
+ * is left untouched (the redirect still applies, as execve copies the
+ * pathname from the immutable pin before the old mm is gone, closing that
+ * TOCTOU too).
+ *
+ * @id: The ID of the seccomp notification this response consumes.
+ * @flags: SECCOMP_REDIRECT_FLAG_*. CONTINUE must be set and no other bit
+ *         may be set.
+ * @args_mask: Bit i set means args[i] replaces the trapped task's
+ *             corresponding arg register before the syscall runs. Must be
+ *             non-zero; only bits 0..SECCOMP_REDIRECT_ARGS-1 may be set.
+ * @ptr_mask: Subset of @args_mask. Bit i set means args[i] is a pointer and
+ *            the access [args[i], args[i] + ptr_len[i]) is validated to lie
+ *            entirely inside a single VM_SEALED, read-only mapping of @memfd.
+ *            Scalar replacements (in @args_mask but not @ptr_mask) are
+ *            written verbatim.
+ * @memfd: Supervisor-side fd for the backing memfd whose sealed mapping the
+ *         pointer substitutions must fall within. Consulted only when
+ *         @ptr_mask is non-zero.
+ * @args: Replacement values for the arg registers.
+ * @ptr_len: For each bit set in @ptr_mask, ptr_len[i] is the byte length of
+ *           the access starting at args[i]; it must be non-zero and args[i] +
+ *           ptr_len[i] must not overflow. For every i whose bit is clear in
+ *           @ptr_mask it must be 0.
+ *
+ * SECCOMP_IOCTL_NOTIF_SEND_REDIRECT fails with:
+ *   -EPERM       the listener's filter lacks SECCOMP_FILTER_FLAG_REDIRECT;
+ *   -EINVAL      bad size, @flags, @args_mask, @ptr_mask or @ptr_len;
+ *   -EBADF       @ptr_mask is non-zero and @memfd is not an open fd;
+ *   -ENOENT      no notification with @id exists;
+ *   -EINPROGRESS the notification is not in the "sent" state;
+ *   -EOPNOTSUPP  the trapped syscall is sigreturn;
+ *   -EFAULT      a pointer substitution failed validation;
+ *   -ENOMEM      the register-restore record could not be allocated.
+ */
+struct seccomp_notif_resp_redirect {
+ __u64 id;
+ __u32 flags;
+ __u32 args_mask;
+ __u32 ptr_mask;
+ __u32 memfd;
+ __u64 args[SECCOMP_REDIRECT_ARGS];
+ __u64 ptr_len[SECCOMP_REDIRECT_ARGS];
+};
+
/*
* Install a sealed memfd-backed pin in the trapped task's mm without
* target-side cooperation. The supervisor owns the backing memfd;
@@ -208,4 +271,13 @@ struct seccomp_notif_pin_install {
#define SECCOMP_IOCTL_NOTIF_PIN_INSTALL SECCOMP_IOWR(5, \
struct seccomp_notif_pin_install)

+/*
+ * Resume the trapped syscall with substituted arg-register values,
+ * optionally pointing into an installed pin. The kernel saves the
+ * original registers and restores them when the task returns to user
+ * mode, so the caller observes ABI-correct register preservation.
+ *
+ * Only accepted on a listener whose filter was installed with
+ * SECCOMP_FILTER_FLAG_REDIRECT; otherwise the ioctl returns -EPERM.
+ */
+#define SECCOMP_IOCTL_NOTIF_SEND_REDIRECT SECCOMP_IOW(6, \
+ struct seccomp_notif_resp_redirect)
+
#endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index fa0fb3c960a8..7d499e422ba1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -233,6 +233,7 @@ struct seccomp_filter {
refcount_t users;
bool log;
bool wait_killable_recv;
+ bool redirect_capable;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
@@ -953,6 +954,13 @@ static long seccomp_attach_filter(unsigned int flags,
}
}

+ if (flags & SECCOMP_FILTER_FLAG_REDIRECT) {
+ for (walker = current->seccomp.filter; walker;
+ walker = walker->prev)
+ if (walker->redirect_capable)
+ return -EBUSY;
+ }
+
/* Set log flag, if present. */
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
@@ -961,6 +969,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
filter->wait_killable_recv = true;

+ /* Set redirect-capable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_REDIRECT)
+ filter->redirect_capable = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -1937,6 +1949,237 @@ static long seccomp_notify_pin_install(struct seccomp_filter *filter,
return ret;
}

+/*
+ * seccomp_pin_check - validate one substituted pointer argument.
+ * @target:     task whose address space is inspected
+ * @memfd_file: file the covering mapping must be backed by (compared by inode)
+ * @ptr:        start of the access
+ * @len:        length of the access in bytes
+ *
+ * Returns true only if [@ptr, @ptr + @len) is non-empty, does not wrap, and
+ * lies inside one VMA of @target's mm that is VM_SEALED, not VM_WRITE, and
+ * file-backed by the same inode as @memfd_file. Returns false in every other
+ * case, including when @target has no mm.
+ */
+static bool seccomp_pin_check(struct task_struct *target,
+			      struct file *memfd_file, u64 ptr, u64 len)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	bool ok = false;
+	u64 end = ptr + len;
+
+	/* Reject empty accesses and ranges that wrap around. */
+	if (!len || end < ptr)
+		return false;
+
+	mm = get_task_mm(target);
+	if (!mm)
+		return false;
+
+	/*
+	 * The access must lie in a single sealed, read-only, memfd-backed VMA.
+	 * Read-only so no CLONE_VM peer can rewrite the bytes the kernel is
+	 * about to read; VM_SEALED keeps the mapping itself immutable.
+	 */
+	mmap_read_lock(mm);
+	vma = vma_lookup(mm, ptr);
+	if (!vma || end > vma->vm_end)
+		goto unlock;
+	if (!(vma->vm_flags & VM_SEALED) || (vma->vm_flags & VM_WRITE))
+		goto unlock;
+	if (!vma->vm_file)
+		goto unlock;
+	ok = file_inode(vma->vm_file) == file_inode(memfd_file);
+unlock:
+	mmap_read_unlock(mm);
+
+	mmput(mm);
+	return ok;
+}
+
+/**
+ * struct seccomp_redirect_restore - saved argument registers of a redirected
+ * syscall, handed to the trapped task as task_work.
+ * @twork:        task_work node; its callback is
+ *                seccomp_redirect_restore_cb(), which frees the record.
+ * @orig_args:    all SECCOMP_REDIRECT_ARGS argument values as read from the
+ *                trapped task before any substitution.
+ * @args_mask:    bit i set means arg i was substituted and must be restored.
+ * @self_exec_id: snapshot of the task's self_exec_id, used to detect an
+ *                intervening execve.
+ */
+struct seccomp_redirect_restore {
+	struct callback_head twork;
+	unsigned long orig_args[SECCOMP_REDIRECT_ARGS];
+	u32 args_mask;
+	u64 self_exec_id;
+};
+
+/*
+ * task_work callback that undoes a SEND_REDIRECT substitution.
+ *
+ * Runs on the redirected task (it operates on current). For every bit set
+ * in the record's args_mask the saved original value is written back into
+ * the task's syscall argument registers; the other argument slots are
+ * rewritten with the values just read. If self_exec_id no longer matches
+ * the snapshot, an execve happened in between and the registers are left
+ * alone. The record is freed in both cases.
+ *
+ * NOTE(review): all six argument slots are passed back through
+ * syscall_set_arguments(). On architectures where the first argument
+ * register is also the syscall return register, confirm this does not
+ * overwrite the return value of the syscall that just ran.
+ */
+static void seccomp_redirect_restore_cb(struct callback_head *cb)
+{
+	struct seccomp_redirect_restore *saved =
+		container_of(cb, struct seccomp_redirect_restore, twork);
+	unsigned long cur[SECCOMP_REDIRECT_ARGS];
+	struct pt_regs *regs;
+	int idx;
+
+	/* A changed exec id means a fresh register frame: do not touch it. */
+	if (READ_ONCE(current->self_exec_id) != saved->self_exec_id)
+		goto out;
+
+	regs = current_pt_regs();
+	syscall_get_arguments(current, regs, cur);
+	for (idx = 0; idx < SECCOMP_REDIRECT_ARGS; idx++) {
+		if (saved->args_mask & (1U << idx))
+			cur[idx] = saved->orig_args[idx];
+	}
+	syscall_set_arguments(current, regs, cur);
+out:
+	kfree(saved);
+}
+
+/*
+ * Report whether the trapped syscall is the sigreturn number seccomp
+ * associates with the task's ABI.
+ *
+ * rt_sigreturn restores the entire register frame from the user signal
+ * stack; the SEND_REDIRECT register-restore (run from task_work at user-mode
+ * return) would corrupt that frame, and the syscall takes no arguments to
+ * substitute anyway. Refuse to redirect it, including the compat variant.
+ *
+ * NOTE(review): only the one number behind __NR_seccomp_sigreturn (or
+ * __NR_seccomp_sigreturn_32 for a compat caller) is matched. If an ABI
+ * offers both sigreturn and rt_sigreturn, confirm the flavour not named by
+ * the macro does not need the same refusal.
+ */
+static bool seccomp_redirect_is_sigreturn(const struct seccomp_data *sd)
+{
+	int sigreturn_nr = __NR_seccomp_sigreturn;
+
+#ifdef SECCOMP_ARCH_COMPAT
+	if (sd->arch == SECCOMP_ARCH_COMPAT)
+		sigreturn_nr = __NR_seccomp_sigreturn_32;
+#endif
+	return sd->nr == sigreturn_nr;
+}
+
+/*
+ * Stateless sanity checks on a SEND_REDIRECT response, run before anything
+ * is allocated or locked. Returns 0 when the fields are self-consistent and
+ * -EINVAL otherwise.
+ */
+static int
+seccomp_redirect_validate_resp(const struct seccomp_notif_resp_redirect *resp)
+{
+	int i;
+
+	/* CONTINUE is mandatory and no other flag is defined. */
+	if (!(resp->flags & SECCOMP_REDIRECT_FLAG_CONTINUE))
+		return -EINVAL;
+	if (resp->flags & ~SECCOMP_REDIRECT_FLAG_CONTINUE)
+		return -EINVAL;
+	/* Only the low SECCOMP_REDIRECT_ARGS bits name an argument. */
+	if (resp->args_mask & ~((1U << SECCOMP_REDIRECT_ARGS) - 1))
+		return -EINVAL;
+	/* Every pointer substitution must also be a substitution. */
+	if (resp->ptr_mask & ~resp->args_mask)
+		return -EINVAL;
+	/* A redirect that substitutes nothing is rejected. */
+	if (!resp->args_mask)
+		return -EINVAL;
+
+	/* ptr_len[i] must be non-zero exactly for the bits set in ptr_mask. */
+	for (i = 0; i < SECCOMP_REDIRECT_ARGS; i++) {
+		bool is_ptr = resp->ptr_mask & (1U << i);
+
+		if (is_ptr != !!resp->ptr_len[i])
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * seccomp_notify_send_redirect - SECCOMP_IOCTL_NOTIF_SEND_REDIRECT handler.
+ * @filter: filter that owns the listener the ioctl was issued on
+ * @uresp:  user pointer to a struct seccomp_notif_resp_redirect
+ * @size:   structure size encoded in the ioctl command
+ *
+ * Validates the response, checks every pointer substitution against the
+ * trapped task's live mappings, saves the task's original syscall
+ * arguments, writes the substituted ones, queues the task_work that
+ * restores the originals, and finally marks the notification replied with
+ * SECCOMP_USER_NOTIF_FLAG_CONTINUE and wakes the task.
+ *
+ * The restore record is owned by this function until task_work_add()
+ * succeeds; after that the callback frees it.
+ *
+ * Returns 0 on success, or:
+ *   -EPERM       @filter was not created redirect-capable;
+ *   -EINVAL      bad @size or inconsistent response fields;
+ *   -ENOMEM      no memory for the restore record;
+ *   -EBADF       ptr_mask is set and memfd is not an open fd;
+ *   -ENOENT      no notification with the given id;
+ *   -EINPROGRESS the notification is not in SECCOMP_NOTIFY_SENT state;
+ *   -EOPNOTSUPP  the trapped syscall is sigreturn;
+ *   -EFAULT      a pointer substitution failed seccomp_pin_check();
+ * or the error returned by copy_struct_from_user(),
+ * mutex_lock_interruptible() or task_work_add().
+ */
+static long seccomp_notify_send_redirect(struct seccomp_filter *filter,
+					 struct seccomp_notif_resp_redirect __user *uresp,
+					 unsigned int size)
+{
+	struct seccomp_notif_resp_redirect resp;
+	struct seccomp_knotif *knotif;
+	struct seccomp_redirect_restore *restore;
+	struct file *memfd_file = NULL;
+	struct pt_regs *target_regs;
+	unsigned long args[SECCOMP_REDIRECT_ARGS];
+	long ret;
+	int i;
+
+	BUILD_BUG_ON(sizeof(resp) < SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(resp) != SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_LATEST);
+
+	if (!filter->redirect_capable)
+		return -EPERM;
+
+	if (size < SECCOMP_NOTIFY_RESP_REDIRECT_SIZE_VER0 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(&resp, sizeof(resp), uresp, size);
+	if (ret)
+		return ret;
+
+	ret = seccomp_redirect_validate_resp(&resp);
+	if (ret)
+		return ret;
+
+	restore = kzalloc_obj(*restore, GFP_KERNEL_ACCOUNT);
+	if (!restore)
+		return -ENOMEM;
+	init_task_work(&restore->twork, seccomp_redirect_restore_cb);
+
+	/* The backing memfd is only consulted to validate pointer args. */
+	if (resp.ptr_mask) {
+		memfd_file = fget(resp.memfd);
+		if (!memfd_file) {
+			ret = -EBADF;
+			goto out_free;
+		}
+	}
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		goto out_free;
+
+	knotif = find_notification(filter, resp.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out_unlock_free;
+	}
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out_unlock_free;
+	}
+
+	if (seccomp_redirect_is_sigreturn(knotif->data)) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock_free;
+	}
+
+	/* Every pointer substitution must land inside a valid pin. */
+	for (i = 0; i < SECCOMP_REDIRECT_ARGS; i++) {
+		if (!(resp.ptr_mask & (1U << i)))
+			continue;
+		if (!seccomp_pin_check(knotif->task, memfd_file,
+				       resp.args[i], resp.ptr_len[i])) {
+			ret = -EFAULT;
+			goto out_unlock_free;
+		}
+	}
+
+	/*
+	 * Save original pt_regs args (target is parked in
+	 * seccomp_do_user_notification, so its pt_regs is stable) and
+	 * write substituted values. The trapped task's task_work fires
+	 * at user-mode return, restoring originals for ABI compliance.
+	 */
+	target_regs = task_pt_regs(knotif->task);
+	syscall_get_arguments(knotif->task, target_regs, args);
+	for (i = 0; i < SECCOMP_REDIRECT_ARGS; i++)
+		restore->orig_args[i] = args[i];
+	restore->args_mask = resp.args_mask;
+	restore->self_exec_id = READ_ONCE(knotif->task->self_exec_id);
+
+	for (i = 0; i < SECCOMP_REDIRECT_ARGS; i++)
+		if (resp.args_mask & (1U << i))
+			args[i] = resp.args[i];
+	syscall_set_arguments(knotif->task, target_regs, args);
+
+	/*
+	 * Use TWA_RESUME, not TWA_SIGNAL. TWA_SIGNAL sets TIF_NOTIFY_SIGNAL,
+	 * which makes signal_pending() true for the entire redirected syscall
+	 * (the work is queued here, before the target resumes and runs it).
+	 * An interruptible syscall would then bail out with -ERESTARTSYS before
+	 * doing any work, restart, re-trap and get redirected again -- a
+	 * livelock. TWA_RESUME does not feed signal_pending(), and the restore
+	 * still runs before signal delivery: get_signal() runs task_work_run()
+	 * before it dequeues a signal, so the original args are back in pt_regs
+	 * before handle_signal() builds the sigframe or the -ERESTART* path
+	 * rewinds for restart.
+	 */
+	ret = task_work_add(knotif->task, &restore->twork, TWA_RESUME);
+	if (ret) {
+		/* Queueing failed: undo the substitution before bailing out. */
+		for (i = 0; i < SECCOMP_REDIRECT_ARGS; i++)
+			args[i] = restore->orig_args[i];
+		syscall_set_arguments(knotif->task, target_regs, args);
+		goto out_unlock_free;
+	}
+
+	/*
+	 * Mark REPLIED with FLAG_CONTINUE so the wait-loop exit path
+	 * runs the syscall normally. From here on the restore record
+	 * belongs to the queued task_work and must not be freed here.
+	 */
+	knotif->state = SECCOMP_NOTIFY_REPLIED;
+	knotif->error = 0;
+	knotif->val = 0;
+	knotif->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		complete_on_current_cpu(&knotif->ready);
+	else
+		complete(&knotif->ready);
+
+	mutex_unlock(&filter->notify_lock);
+	if (memfd_file)
+		fput(memfd_file);
+	return 0;
+
+out_unlock_free:
+	mutex_unlock(&filter->notify_lock);
+out_free:
+	if (memfd_file)
+		fput(memfd_file);
+	kfree(restore);
+	return ret;
+}
+
static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -1964,6 +2207,9 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
case EA_IOCTL(SECCOMP_IOCTL_NOTIF_PIN_INSTALL):
return seccomp_notify_pin_install(filter, buf,
_IOC_SIZE(cmd));
+ case EA_IOCTL(SECCOMP_IOCTL_NOTIF_SEND_REDIRECT):
+ return seccomp_notify_send_redirect(filter, buf,
+ _IOC_SIZE(cmd));
default:
return -EINVAL;
}
@@ -2103,6 +2349,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
return -EINVAL;

+ /*
+ * SECCOMP_FILTER_FLAG_REDIRECT declares intent to redirect via the
+ * listener notifier, so it requires a listener.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_REDIRECT) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
--
2.43.0