[PATCH v4 5/6] selftests/seccomp: cover non-cooperative pinned-memfd install

From: Cong Wang

Date: Fri Jun 26 2026 - 21:24:12 EST


From: Cong Wang <cwang@xxxxxxxxxxxxxx>

Add coverage for SECCOMP_IOCTL_NOTIF_PIN_INSTALL and
SECCOMP_IOCTL_NOTIF_SEND_REDIRECT, where the kernel installs a sealed
PROT_READ MAP_SHARED mapping of the supervisor's memfd into the trapped
task's mm via vm_mmap_seal_remote() and the supervisor redirects a
pointer argument into it.

Six tests:

- pinned_memfd_remote: a forked child that does no setup of its own
traps on openat; the supervisor PIN_INSTALLs and SEND_REDIRECTs the
path argument into the pin, and the child reads the
supervisor-controlled safe path from the sealed pages. Also covers
the rejection paths: an unsealed memfd (-EINVAL), a pointer one byte
past the pin, and a base inside the pin whose ptr_len extent runs
past its end (both -EFAULT).

- pinned_memfd_execve_scm: the full fork+execve flow with a detached
supervisor that receives the listener fd over SCM_RIGHTS. The child
traps pre-exec, then execve()s a real binary; the supervisor
reinstalls the pin into the freshly replaced mm and redirects again,
exercising the per-mm rebind (a pin from the old mm is no longer
accepted).

- pinned_memfd_churn: stateless validation across many short-lived
targets over one listener; PIN_INSTALL/SEND_REDIRECT keep working
with no per-target state accumulating (run the loop under
kmemleak/KASAN to confirm that nothing leaks).

- redirect_outer_refilter: a redirect is re-checked against the outer
filters in the stack, so an inner notifier cannot smuggle a syscall
past an outer filter's policy.

- pinned_memfd_abi: the trapped task's original argument registers are
restored at syscall exit after a redirect.

- redirect_signal_abi: regression test that the deferred arg-register
restore runs before a signal frame is built, not after.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang@xxxxxxxxxxxxxx>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 1027 +++++++++++++++++
1 file changed, 1027 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 358b6c65e120..1b1eec505198 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -217,6 +217,10 @@ struct seccomp_metadata {
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif

+#ifndef SECCOMP_FILTER_FLAG_REDIRECT /* fallback when the installed uapi headers lack the flag */
+#define SECCOMP_FILTER_FLAG_REDIRECT (1UL << 6)
+#endif
+
#ifndef SECCOMP_RET_USER_NOTIF
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U

@@ -295,6 +299,35 @@ struct seccomp_notif_addfd_big {
#define PTRACE_EVENTMSG_SYSCALL_EXIT 2
#endif

+#ifndef SECCOMP_IOCTL_NOTIF_PIN_INSTALL /* fallback definitions for older uapi headers */
+struct seccomp_notif_pin_install {
+ __u64 id; /* notification id, taken from seccomp_notif.id */
+ __u32 flags; /* left zero by the tests in this patch */
+ __u32 memfd; /* supervisor-side fd of the write-sealed memfd to map */
+ __u64 target_addr; /* in: 0 lets the kernel choose; out: address of the installed mapping */
+ __u64 size; /* length of the mapping in bytes */
+ __u64 offset; /* left zero by the tests; presumably the offset into the memfd -- confirm against the uapi header */
+};
+#define SECCOMP_IOCTL_NOTIF_PIN_INSTALL SECCOMP_IOWR(5, \
+ struct seccomp_notif_pin_install)
+#endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_SEND_REDIRECT /* fallback definitions for older uapi headers */
+#define SECCOMP_REDIRECT_FLAG_CONTINUE (1UL << 0)
+#define SECCOMP_REDIRECT_ARGS 6
+struct seccomp_notif_resp_redirect {
+ __u64 id; /* notification id, taken from seccomp_notif.id */
+ __u32 flags; /* the tests always pass SECCOMP_REDIRECT_FLAG_CONTINUE */
+ __u32 args_mask; /* bit i set: args[i] replaces syscall argument i */
+ __u32 ptr_mask; /* bit i set: args[i] is a pointer that must lie inside an installed pin */
+ __u32 memfd; /* supervisor-side fd of the memfd backing the pin; left zero for scalar-only redirects */
+ __u64 args[SECCOMP_REDIRECT_ARGS]; /* replacement argument values, indexed by argument number */
+ __u64 ptr_len[SECCOMP_REDIRECT_ARGS]; /* for pointer args: number of bytes at args[i] that must fit inside the pin */
+};
+#define SECCOMP_IOCTL_NOTIF_SEND_REDIRECT SECCOMP_IOW(6, \
+ struct seccomp_notif_resp_redirect)
+#endif
+
#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
#endif
@@ -4368,6 +4401,1000 @@ TEST(user_notification_addfd_rlimit)
close(memfd);
}

+/*
+ * Create a write-sealed memfd of @size bytes for PIN_INSTALL and map a
+ * supervisor-writable view of it, primed with the NUL-terminated @content.
+ *
+ * The writable mapping is created before F_SEAL_FUTURE_WRITE is applied:
+ * the seal keeps this pre-seal mapping writable (so the test can still
+ * stage content) while barring any other writable reference, as
+ * PIN_INSTALL requires.
+ *
+ * @content, including its terminating NUL, must fit in @size; this is
+ * asserted up front so a caller mistake cannot write past the mapping.
+ *
+ * The view is returned through @sup_view. The caller owns both the view
+ * (munmap) and the returned memfd (close).
+ */
+static int make_pin_memfd(struct __test_metadata *_metadata, const char *name,
+			  size_t size, char **sup_view, const char *content)
+{
+	size_t content_len = strlen(content) + 1;
+	int memfd;
+
+	ASSERT_LE(content_len, size);
+
+	memfd = memfd_create(name, MFD_ALLOW_SEALING);
+	ASSERT_GE(memfd, 0);
+	ASSERT_EQ(0, ftruncate(memfd, size));
+	ASSERT_EQ(0, fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW));
+
+	/* Map first, seal second: only mappings made before the seal stay writable. */
+	*sup_view = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			 memfd, 0);
+	ASSERT_NE(MAP_FAILED, *sup_view);
+	ASSERT_EQ(0, fcntl(memfd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE));
+
+	memcpy(*sup_view, content, content_len);
+	return memfd;
+}
+
+/*
+ * Non-cooperative pinned-memfd: the kernel installs a sealed PROT_READ
+ * MAP_SHARED mapping of the supervisor's memfd directly into the trapped
+ * task's mm. The target runs no mmap or mseal code itself -- this
+ * exercises the same kernel path that a fork+execve sandbox supervisor
+ * would use to install a pin in the new image's fresh post-exec mm.
+ *
+ * The target child does nothing but call openat() on a bait path. The
+ * supervisor catches the trap, calls PIN_INSTALL (the kernel does the
+ * mmap + seal in the target's mm via vm_mmap_seal_remote()) and
+ * SEND_REDIRECTs args[1] into the freshly installed pin. The backing
+ * memfd was already primed with the safe path by make_pin_memfd(). The
+ * child's openat resumes, reads from the sealed pin, and returns an fd
+ * to the safe path.
+ *
+ * Before the happy path, three rejections are checked against the same
+ * notification: PIN_INSTALL of an unsealed memfd (EINVAL), a redirect
+ * pointer just past the pin (EFAULT), and a pointer inside the pin whose
+ * ptr_len extent runs past its end (EFAULT).
+ *
+ * Whenever the test gives up on the notification (SKIP, or a failed final
+ * redirect) the child is killed, so it is neither left unreaped nor able
+ * to block the waitpid() that follows.
+ */
+TEST(user_notification_pinned_memfd_remote)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, memfd, unsealed;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_pin_install pin = {};
+	struct seccomp_notif_pin_install unsealed_pin = {};
+	struct seccomp_notif_resp_redirect redir = {};
+	char *sup_view;
+	const size_t PIN_SIZE = 4096;
+	const char *safe_path = "/dev/null";
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	memfd = make_pin_memfd(_metadata, "pinned-remote", PIN_SIZE,
+			       &sup_view, safe_path);
+
+	listener = user_notif_syscall(__NR_openat,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+
+		/*
+		 * Target performs no setup. Just trap on openat. Kernel
+		 * (driven by the supervisor) will install the pin in this
+		 * process's mm at a kernel-chosen address behind our back,
+		 * and our openat will be redirected to read from there.
+		 */
+		fd = syscall(__NR_openat, AT_FDCWD,
+			     "/this/should/never/be/touched", O_RDONLY, 0);
+		if (fd < 0)
+			_exit(11);
+		_exit(0);
+	}
+
+	ASSERT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req));
+	EXPECT_EQ(req.data.nr, __NR_openat);
+
+	pin.id = req.id;
+	pin.memfd = memfd;
+	pin.target_addr = 0;
+	pin.size = PIN_SIZE;
+	EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL, &pin)) {
+		if (errno == EINVAL) {
+			/* Reap the trapped child before skipping. */
+			kill(pid, SIGKILL);
+			waitpid(pid, &status, 0);
+			SKIP(goto cleanup,
+			     "Kernel does not support pinned-memfd remote install");
+		}
+		TH_LOG("PIN_INSTALL failed: errno=%d", errno);
+	}
+
+	/* The kernel wrote a non-zero, page-aligned address back to us. */
+	EXPECT_NE(0, pin.target_addr);
+	EXPECT_EQ(0, pin.target_addr & (PIN_SIZE - 1));
+
+	/* Reject: the backing memfd must be write-sealed. */
+	unsealed = memfd_create("unsealed", MFD_ALLOW_SEALING);
+	ASSERT_GE(unsealed, 0);
+	ASSERT_EQ(0, ftruncate(unsealed, PIN_SIZE));
+	unsealed_pin.id = req.id;
+	unsealed_pin.memfd = unsealed;
+	unsealed_pin.size = PIN_SIZE;
+	EXPECT_EQ(-1, ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL,
+			    &unsealed_pin));
+	EXPECT_EQ(EINVAL, errno);
+	close(unsealed);
+
+	/* Reject: redirect outside any installed pin. */
+	redir.id = req.id;
+	redir.flags = SECCOMP_REDIRECT_FLAG_CONTINUE;
+	redir.args_mask = 1U << 1;
+	redir.ptr_mask = 1U << 1;
+	redir.memfd = memfd;
+	redir.ptr_len[1] = strlen(safe_path) + 1;
+	redir.args[1] = pin.target_addr + PIN_SIZE; /* one byte past */
+	EXPECT_EQ(-1, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+			    &redir));
+	EXPECT_EQ(EFAULT, errno);
+
+	/* Reject: base is inside the pin but the extent runs past its end. */
+	redir.args[1] = pin.target_addr;
+	redir.ptr_len[1] = PIN_SIZE + 1;
+	EXPECT_EQ(-1, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+			    &redir));
+	EXPECT_EQ(EFAULT, errno);
+
+	/* Happy path: redirect into the kernel-installed pin. */
+	redir.args[1] = pin.target_addr;
+	redir.ptr_len[1] = strlen(safe_path) + 1;
+	EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+			   &redir)) {
+		/*
+		 * No response reached the child, so it is still blocked
+		 * in the notification. Kill it, or the waitpid() below
+		 * would hang until the harness timeout.
+		 */
+		kill(pid, SIGKILL);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status)) {
+		TH_LOG("child exit %d (11=openat fail)", WEXITSTATUS(status));
+	}
+
+cleanup:
+	munmap(sup_view, PIN_SIZE);
+	close(memfd);
+	close(listener);
+}
+
+/*
+ * Helper for the execve test: copy up to @max bytes from @addr in @pid's
+ * address space into @out with a single process_vm_readv() call, then
+ * look for a NUL terminator among the bytes that were actually read.
+ *
+ * Returns the string length (excluding the NUL), or -1 if the read
+ * failed, returned nothing, or contained no NUL.
+ */
+static ssize_t read_remote_string(pid_t pid, unsigned long addr,
+				  char *out, size_t max)
+{
+	struct iovec dst = { .iov_base = out, .iov_len = max };
+	struct iovec src = { .iov_base = (void *)addr, .iov_len = max };
+	const char *nul;
+	ssize_t got;
+
+	got = process_vm_readv(pid, &dst, 1, &src, 1, 0);
+	if (got <= 0)
+		return -1;
+
+	/* Only the first @got bytes of @out are valid; never scan past them. */
+	nul = memchr(out, '\0', (size_t)got);
+	return nul ? (ssize_t)(nul - out) : -1;
+}
+
+/*
+ * Send a file descriptor over a connected UNIX socket via SCM_RIGHTS.
+ * Used by the execve_scm test so the target child can hand its
+ * SECCOMP_FILTER_FLAG_NEW_LISTENER fd to the supervising parent
+ * without the parent having to inherit the seccomp filter itself.
+ *
+ * One dummy payload byte accompanies the control message. Returns 0 on
+ * success, -1 if sendmsg() fails.
+ */
+static int send_fd(int sock, int fd)
+{
+	/*
+	 * The control buffer is accessed through struct cmsghdr, so it
+	 * must be aligned for it; a bare char array gives no such
+	 * guarantee (see cmsg(3)).
+	 */
+	union {
+		char buf[CMSG_SPACE(sizeof(int))];
+		struct cmsghdr align;
+	} ctl = {};
+	char data = 'x';
+	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
+	struct msghdr msg = {
+		.msg_iov = &iov, .msg_iovlen = 1,
+		.msg_control = ctl.buf, .msg_controllen = sizeof(ctl.buf),
+	};
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
+	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
+}
+
+/*
+ * Counterpart of send_fd(): receive one file descriptor sent as a
+ * SCM_RIGHTS control message over @sock.
+ *
+ * Returns the received fd, or -1 if recvmsg() fails or the message does
+ * not carry exactly one SCM_RIGHTS descriptor (this includes the peer
+ * closing the socket without sending anything).
+ */
+static int recv_fd(int sock)
+{
+	/*
+	 * The control buffer is accessed through struct cmsghdr, so it
+	 * must be aligned for it; a bare char array gives no such
+	 * guarantee (see cmsg(3)).
+	 */
+	union {
+		char buf[CMSG_SPACE(sizeof(int))];
+		struct cmsghdr align;
+	} ctl = {};
+	char data;
+	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
+	struct msghdr msg = {
+		.msg_iov = &iov, .msg_iovlen = 1,
+		.msg_control = ctl.buf, .msg_controllen = sizeof(ctl.buf),
+	};
+	struct cmsghdr *cmsg;
+	int fd;
+
+	if (recvmsg(sock, &msg, 0) < 0)
+		return -1;
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+	    cmsg->cmsg_type != SCM_RIGHTS ||
+	    cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+		return -1;
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+	return fd;
+}
+
+struct addr_range {
+ unsigned long start, end; /* half-open [start, end): callers test start <= ip < end */
+};
+
+/*
+ * Scan /proc/<pid>/maps for the first executable mapping whose path looks
+ * like a dynamic linker (glibc ld-linux-*.so, musl ld-musl-*.so, ld.so,
+ * ...) and report its address range in @out.
+ *
+ * A trapped task whose instruction_pointer falls inside this range is
+ * issuing a loader-bootstrap syscall. That lets the supervisor allow it
+ * without inspecting argument content through the racy
+ * process_vm_readv path.
+ *
+ * The caller must not itself be subject to the seccomp filter, because
+ * fopen() internally calls openat(). The execve_scm test satisfies that:
+ * the child installs the filter and passes the listener fd to the parent
+ * via SCM_RIGHTS.
+ *
+ * Returns 0 with @out filled in, or -1 if the maps file cannot be opened
+ * or no matching mapping exists (@out is then left untouched).
+ */
+static int find_loader_text_range(pid_t pid, struct addr_range *out)
+{
+	char maps_path[64];
+	char line[512];
+	FILE *maps;
+	int rc = -1;
+
+	snprintf(maps_path, sizeof(maps_path), "/proc/%d/maps", pid);
+	maps = fopen(maps_path, "r");
+	if (!maps)
+		return -1;
+
+	/* Stop at the first hit; rc flips to 0 exactly once. */
+	while (rc != 0 && fgets(line, sizeof(line), maps)) {
+		unsigned long lo, hi;
+		char perms[8];
+		const char *file;
+
+		if (sscanf(line, "%lx-%lx %7s", &lo, &hi, perms) != 3)
+			continue;
+		/* Only executable mappings can contain the trapped IP. */
+		if (!strchr(perms, 'x'))
+			continue;
+		/* The first '/' on the line starts the backing path, if any. */
+		file = strchr(line, '/');
+		if (!file)
+			continue;
+		/*
+		 * Common dynamic-linker basenames: ld-linux-*.so (glibc),
+		 * ld-musl-*.so (musl), ld-*.so (older glibc), ld.so.
+		 */
+		if (!strstr(file, "/ld-") && !strstr(file, "/ld.so"))
+			continue;
+
+		out->start = lo;
+		out->end = hi;
+		rc = 0;
+	}
+
+	fclose(maps);
+	return rc;
+}
+
+/*
+ * Answer notification @id on @listener with
+ * SECCOMP_USER_NOTIF_FLAG_CONTINUE so the trapped syscall runs
+ * unmodified. Best effort: the ioctl result is deliberately ignored.
+ */
+static void pin_scm_notif_continue(int listener, __u64 id)
+{
+	struct seccomp_notif_resp resp = {
+		.id = id,
+		.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE,
+	};
+
+	ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+}
+
+/*
+ * SEND_REDIRECT argument 1 (the openat path pointer) of notification @id
+ * to @addr inside the pin backed by @memfd, validating @len bytes.
+ * Returns the ioctl result; errno is left as the ioctl set it.
+ */
+static int pin_scm_redirect_path(int listener, __u64 id, int memfd,
+				 __u64 addr, size_t len)
+{
+	struct seccomp_notif_resp_redirect redir = {
+		.id = id,
+		.flags = SECCOMP_REDIRECT_FLAG_CONTINUE,
+		.args_mask = 1U << 1,
+		.ptr_mask = 1U << 1,
+		.memfd = memfd,
+	};
+
+	redir.args[1] = addr;
+	redir.ptr_len[1] = len;
+	return ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT, &redir);
+}
+
+/*
+ * Non-cooperative pinned-memfd across a real execve, using the proper
+ * supervisor-isolation pattern: the child (target) installs the seccomp
+ * filter on itself and sends its listener fd to the parent (supervisor)
+ * via SCM_RIGHTS over a socketpair. The parent therefore does not carry
+ * the seccomp filter and can freely call openat() -- which is what makes
+ * the race-free, kernel-truth loader detection (req.data.instruction_pointer
+ * + /proc/<pid>/maps) actually usable.
+ *
+ * Phase 1: child does a pre-execve openat; the supervisor PIN_INSTALLs and
+ * SEND_REDIRECTs. Phase 2: child execve's, so the pre-execve pin VMA dies
+ * with the old mm. Phase 3: in the fresh post-execve mm the supervisor
+ * PIN_INSTALLs again and SEND_REDIRECTs, proving the full redirect
+ * mechanism survives an mm replacement, not just the install side.
+ *
+ * Post-exec openat traps that come from the dynamic loader, or that do
+ * not name the bait path, are answered with CONTINUE.
+ *
+ * A pre-exec PIN_INSTALL or SEND_REDIRECT failure is recorded as a test
+ * failure (EINVAL from PIN_INSTALL is treated as "feature missing" and
+ * skips instead). In every case the child is reaped before returning.
+ */
+TEST(user_notification_pinned_memfd_execve_scm)
+{
+	pid_t pid;
+	int status, listener, memfd, sv[2];
+	struct seccomp_notif req = {};
+	struct seccomp_notif_pin_install pin = {};
+	char *sup_view;
+	const size_t PIN_SIZE = 4096;
+	const char *safe_path = "/dev/null";
+	const char *bait = "/seccomp_pinned_memfd_test_bait_scm";
+	bool post_exec_install_ok = false;
+	bool post_exec_redirect_done = false;
+	bool loader_known = false;
+	bool loader_check_attempted = false;
+	struct addr_range loader_range = {};
+	int phase = 0;
+	int trap_count = 0;
+	const int trap_limit = 200;
+
+	if (access("/bin/cat", X_OK) != 0)
+		SKIP(return, "/bin/cat not present");
+
+	memfd = make_pin_memfd(_metadata, "pin-execve-scm", PIN_SIZE,
+			       &sup_view, safe_path);
+
+	ASSERT_EQ(0, socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv));
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct sock_filter filter[] = {
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+				 offsetof(struct seccomp_data, nr)),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_openat,
+				 0, 1),
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+		};
+		struct sock_fprog prog = {
+			.len = (unsigned short)ARRAY_SIZE(filter),
+			.filter = filter,
+		};
+		int my_listener;
+		int fd;
+
+		close(sv[0]);
+		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
+			_exit(20);
+		my_listener = seccomp(SECCOMP_SET_MODE_FILTER,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT,
+				      &prog);
+		if (my_listener < 0)
+			_exit(21);
+		if (send_fd(sv[1], my_listener) < 0)
+			_exit(22);
+		close(my_listener);
+		close(sv[1]);
+
+		/* Pre-execve trap. */
+		fd = syscall(__NR_openat, AT_FDCWD,
+			     "/this/should/never/be/touched", O_RDONLY, 0);
+		if (fd < 0)
+			_exit(11);
+
+		execl("/bin/cat", "cat", bait, (char *)NULL);
+		_exit(12);
+	}
+
+	close(sv[1]);
+	listener = recv_fd(sv[0]);
+	close(sv[0]);
+	ASSERT_GE(listener, 0);
+
+	/*
+	 * Parent has the listener fd and does NOT have the seccomp
+	 * filter. fopen(/proc/<pid>/maps) below works without
+	 * deadlocking on the parent's own openat.
+	 */
+	for (;;) {
+		struct pollfd pfd = { .fd = listener, .events = POLLIN };
+		int pret = poll(&pfd, 1, 500);
+		char path[PATH_MAX];
+		bool ip_in_loader;
+		ssize_t n;
+		int ret, err;
+
+		if (pret < 0)
+			break;
+		if (pret == 0 || !(pfd.revents & POLLIN)) {
+			/* Nothing to service: stop once the child is gone. */
+			if (waitpid(pid, &status, WNOHANG) == pid)
+				break;
+			if (pfd.revents & (POLLHUP | POLLERR))
+				break;
+			continue;
+		}
+
+		memset(&req, 0, sizeof(req));
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0) {
+			TH_LOG("NOTIF_RECV failed: errno=%d", errno);
+			break;
+		}
+		if (++trap_count > trap_limit) {
+			TH_LOG("trap_limit (%d) exceeded", trap_limit);
+			break;
+		}
+
+		if (phase == 0) {
+			pin.id = req.id;
+			pin.memfd = memfd;
+			pin.target_addr = 0;
+			pin.size = PIN_SIZE;
+			ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL,
+				    &pin);
+			err = errno; /* capture before logging can disturb it */
+			if (ret != 0 && err == EINVAL) {
+				/* Reap the trapped child before skipping. */
+				kill(pid, SIGKILL);
+				waitpid(pid, &status, 0);
+				SKIP(goto cleanup_scm,
+				     "Kernel lacks pinned-memfd remote");
+			}
+			EXPECT_EQ(0, ret) {
+				TH_LOG("pre-exec PIN_INSTALL failed: errno=%d",
+				       err);
+			}
+			if (ret != 0)
+				break; /* the child is reaped after the loop */
+
+			ret = pin_scm_redirect_path(listener, req.id, memfd,
+						    pin.target_addr,
+						    strlen(safe_path) + 1);
+			err = errno;
+			EXPECT_EQ(0, ret) {
+				TH_LOG("pre-exec SEND_REDIRECT failed: errno=%d",
+				       err);
+			}
+			if (ret != 0)
+				break;
+			phase = 1;
+			continue;
+		}
+
+		/*
+		 * Post-execve. Lazily resolve the loader range, once. The
+		 * supervisor's own openat (fopen on /proc/<pid>/maps)
+		 * doesn't trap because the filter lives on the child,
+		 * not on us.
+		 */
+		if (!loader_check_attempted) {
+			loader_known = find_loader_text_range(req.pid,
+							      &loader_range) == 0;
+			loader_check_attempted = true;
+		}
+
+		ip_in_loader = loader_known &&
+			req.data.instruction_pointer >= loader_range.start &&
+			req.data.instruction_pointer < loader_range.end;
+
+		if (ip_in_loader) {
+			pin_scm_notif_continue(listener, req.id);
+			continue;
+		}
+
+		/* Program code: inspect the path to identify the bait. */
+		n = read_remote_string(req.pid, req.data.args[1],
+				       path, sizeof(path));
+		if (n < 0 || strcmp(path, bait) != 0) {
+			pin_scm_notif_continue(listener, req.id);
+			continue;
+		}
+
+		pin.id = req.id;
+		pin.memfd = memfd;
+		pin.target_addr = 0;
+		pin.size = PIN_SIZE;
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL,
+			  &pin) != 0) {
+			TH_LOG("post-exec PIN_INSTALL failed: errno=%d",
+			       errno);
+			pin_scm_notif_continue(listener, req.id);
+			continue;
+		}
+		post_exec_install_ok = true;
+
+		if (pin_scm_redirect_path(listener, req.id, memfd,
+					  pin.target_addr,
+					  strlen(safe_path) + 1) != 0) {
+			TH_LOG("post-exec SEND_REDIRECT failed: errno=%d",
+			       errno);
+			pin_scm_notif_continue(listener, req.id);
+			continue;
+		}
+		post_exec_redirect_done = true;
+	}
+
+	/* Reap the child, killing it first if it is still running. */
+	if (waitpid(pid, &status, WNOHANG) == 0) {
+		kill(pid, SIGKILL);
+		waitpid(pid, &status, 0);
+	}
+	EXPECT_EQ(true, loader_known) {
+		TH_LOG("find_loader_text_range never resolved");
+	}
+	EXPECT_EQ(true, post_exec_install_ok);
+	EXPECT_EQ(true, post_exec_redirect_done);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+cleanup_scm:
+	munmap(sup_view, PIN_SIZE);
+	close(memfd);
+	close(listener);
+}
+
+/*
+ * Stateless redirect validation must hold up across many short-lived
+ * targets over one listener, and must not accumulate per-target state.
+ *
+ * PIN_INSTALL records nothing: the installed VM_SEALED VMA is the only
+ * record, and SEND_REDIRECT re-validates the pointer against the live
+ * mapping (sealed, read-only, backed by the supervisor's memfd inode).
+ * So a supervisor servicing a long churn of targets keeps working with
+ * no bookkeeping to leak. Each iteration lets the kernel choose the pin
+ * address in the fresh target mm; every install/redirect must succeed, and
+ * kmemleak/KASAN over the loop confirms nothing accumulates.
+ *
+ * If an iteration's redirect fails, that child is killed so the
+ * waitpid() for it cannot block.
+ */
+TEST(user_notification_pinned_memfd_churn)
+{
+	const size_t PIN_SIZE = 4096;
+	const char *safe_path = "/dev/null";
+	const int iters = 16;
+	int listener, memfd, i;
+	char *sup_view;
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	memfd = make_pin_memfd(_metadata, "pinned-reap", PIN_SIZE,
+			       &sup_view, safe_path);
+
+	listener = user_notif_syscall(__NR_openat,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT);
+	ASSERT_GE(listener, 0);
+
+	for (i = 0; i < iters; i++) {
+		struct seccomp_notif req = {};
+		struct seccomp_notif_pin_install pin = {};
+		struct seccomp_notif_resp_redirect redir = {};
+		int status;
+		pid_t pid;
+
+		pid = fork();
+		ASSERT_GE(pid, 0);
+		if (pid == 0) {
+			int fd = syscall(__NR_openat, AT_FDCWD,
+					 "/never/touched", O_RDONLY, 0);
+			_exit(fd < 0 ? 11 : 0);
+		}
+
+		ASSERT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req));
+		EXPECT_EQ(req.data.nr, __NR_openat);
+
+		pin.id = req.id;
+		pin.memfd = memfd;
+		pin.target_addr = 0;
+		pin.size = PIN_SIZE;
+		EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL,
+				   &pin)) {
+			if (errno == EINVAL) {
+				kill(pid, SIGKILL);
+				waitpid(pid, &status, 0);
+				SKIP(goto cleanup,
+				     "Kernel lacks pinned-memfd remote install");
+			}
+			TH_LOG("iter %d PIN_INSTALL failed: errno=%d", i, errno);
+		}
+
+		redir.id = req.id;
+		redir.flags = SECCOMP_REDIRECT_FLAG_CONTINUE;
+		redir.args_mask = 1U << 1;
+		redir.ptr_mask = 1U << 1;
+		redir.memfd = memfd;
+		redir.ptr_len[1] = strlen(safe_path) + 1;
+		redir.args[1] = pin.target_addr;
+		EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+				   &redir)) {
+			/*
+			 * The child never got a response and is still
+			 * blocked in the notification. Kill it, or the
+			 * waitpid() below would hang.
+			 */
+			kill(pid, SIGKILL);
+		}
+
+		EXPECT_EQ(waitpid(pid, &status, 0), pid);
+		EXPECT_EQ(true, WIFEXITED(status));
+		EXPECT_EQ(0, WEXITSTATUS(status)) {
+			TH_LOG("iter %d child exit %d (11=openat fail)",
+			       i, WEXITSTATUS(status));
+		}
+		/*
+		 * Target is dead now, and its mm -- including the pin VMA
+		 * at the kernel-chosen address -- went with it. Since
+		 * that VMA was the only record of the pin, nothing is
+		 * left over for the next iteration to clean up.
+		 */
+	}
+
+cleanup:
+	munmap(sup_view, PIN_SIZE);
+	close(memfd);
+	close(listener);
+}
+
+#ifdef __NR_socket
+/*
+ * A redirect must not let an inner (more recently installed) filter's
+ * notifier smuggle a syscall past an outer filter. Two filters are
+ * stacked on the target:
+ *
+ * outer (installed first): socket(AF_INET, ...) -> RET_ERRNO(EACCES),
+ * everything else ALLOW.
+ * inner (installed second): socket -> RET_USER_NOTIF.
+ *
+ * The child calls socket(AF_UNIX, ...), which the outer filter allows, so
+ * the inner notifier wins and fires. The supervisor SEND_REDIRECTs arg0
+ * to AF_INET. The kernel must then re-run the outer filter against the
+ * rewritten registers and block it with EACCES; without the outer-suffix
+ * re-validation the inner filter would have bypassed the outer policy.
+ *
+ * Child exit codes: 0 = socket() failed with EACCES as required,
+ * 12 = socket() succeeded (outer filter bypassed), 13 = other errno.
+ */
+TEST(user_notification_redirect_outer_refilter)
+{
+	struct sock_filter outer_filter[] = {
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		/* Not socket(): skip the next three insns, to ALLOW. */
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_socket, 0, 3),
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, syscall_arg(0)),
+		/* AF_INET falls through to ERRNO; any other family to ALLOW. */
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 0, 1),
+		BPF_STMT(BPF_RET | BPF_K,
+			 SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog outer_prog = {
+		.len = (unsigned short)ARRAY_SIZE(outer_filter),
+		.filter = outer_filter,
+	};
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp_redirect redir = {};
+	int status, listener;
+	pid_t pid;
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	/* Outer filter first => it becomes the outer/root of the stack. */
+	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &outer_prog));
+
+	/* Inner USER_NOTIF filter second (innermost); returns the listener. */
+	listener = user_notif_syscall(__NR_socket,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd = syscall(__NR_socket, AF_UNIX, SOCK_STREAM, 0);
+
+		if (fd >= 0)
+			_exit(12); /* bypass: outer filter was skipped */
+		if (errno != EACCES)
+			_exit(13); /* unexpected errno */
+		_exit(0);
+	}
+
+	ASSERT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req));
+	EXPECT_EQ(req.data.nr, __NR_socket);
+	EXPECT_EQ(req.data.args[0], AF_UNIX);
+
+	/* Scalar redirect of arg0 (no pin needed): AF_UNIX -> AF_INET. */
+	redir.id = req.id;
+	redir.flags = SECCOMP_REDIRECT_FLAG_CONTINUE;
+	redir.args_mask = 1U << 0;
+	redir.args[0] = AF_INET;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT, &redir);
+	if (ret < 0 && errno == EINVAL) {
+		kill(pid, SIGKILL);
+		waitpid(pid, &status, 0);
+		close(listener);
+		SKIP(return, "Kernel lacks SECCOMP_IOCTL_NOTIF_SEND_REDIRECT");
+	}
+	EXPECT_EQ(0, ret) {
+		/*
+		 * The child never got a response and is still blocked in
+		 * the notification. Kill it, or the waitpid() below would
+		 * hang.
+		 */
+		kill(pid, SIGKILL);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status)) {
+		switch (WEXITSTATUS(status)) {
+		case 12:
+			TH_LOG("child exit 12: redirect bypassed the outer filter");
+			break;
+		case 13:
+			TH_LOG("child exit 13: socket failed with unexpected errno");
+			break;
+		default:
+			TH_LOG("child exit %d (unexpected)", WEXITSTATUS(status));
+		}
+	}
+
+	close(listener);
+}
+#endif /* __NR_socket */
+
+#ifdef __x86_64__
+/*
+ * Load-bearing ABI check: after SEND_REDIRECT, the trapped task's
+ * redirected arg register must be restored to its original value
+ * before user-mode code resumes. The kernel's restore mechanism
+ * (task_work_add(TWA_SIGNAL) -> seccomp_redirect_restore_cb) is
+ * what guarantees this; without a test the property is just an
+ * assertion. Bypass libc's syscall() wrapper (which caller-saves
+ * arg values and would mask a restore bug) and read the actual
+ * arg register as it stands right after the SYSCALL instruction.
+ *
+ * The child issues openat with RSI = sentinel_path. The supervisor
+ * SEND_REDIRECTs args[1] (RSI) to point into the pin. The kernel:
+ * - saves the original RSI into the knotif
+ * - writes the pin address into RSI via syscall_set_arguments()
+ * - runs the syscall (kernel reads path from the pin)
+ * - on syscall_exit_to_user_mode, fires task_work which calls
+ * syscall_set_arguments() again with the saved original
+ * - returns to user mode
+ *
+ * If task_work fires correctly, the child observes RSI == sentinel.
+ * If broken, RSI holds the pin address (the redirected value the
+ * kernel left in pt_regs).
+ *
+ * RSI is an in/out ("+S") operand of the inline asm. Declaring it
+ * input-only would let the compiler assume the register still holds
+ * sentinel_path afterwards. It could then compare the captured value
+ * against the live register and hide the very bug this test targets.
+ */
+TEST(user_notification_pinned_memfd_abi)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, memfd;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_pin_install pin = {};
+	struct seccomp_notif_resp_redirect redir = {};
+	char *sup_view;
+	const size_t PIN_SIZE = 4096;
+	const char *safe_path = "/dev/null";
+	/*
+	 * The "sentinel" is a real string the child can also pass as
+	 * the openat path. Its address goes in as RSI; RSI must hold
+	 * the same address after the syscall.
+	 */
+	static const char sentinel_path[] = "/seccomp_abi_sentinel";
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	memfd = make_pin_memfd(_metadata, "pin-abi", PIN_SIZE,
+			       &sup_view, safe_path);
+
+	listener = user_notif_syscall(__NR_openat,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		register long r10_val asm("r10") = 0;
+		unsigned long rsi_val = (unsigned long)sentinel_path;
+		long fd;
+
+		asm volatile(
+			"syscall"
+			: "=a"(fd), "+S"(rsi_val)
+			: "0"((long)__NR_openat),
+			  "D"((long)AT_FDCWD),
+			  "d"((long)O_RDONLY),
+			  "r"(r10_val)
+			: "rcx", "r11", "memory"
+		);
+
+		if (fd < 0)
+			_exit(11);
+		/*
+		 * Load-bearing check: RSI immediately post-SYSCALL must
+		 * still be the sentinel pointer the child passed in. The
+		 * kernel's REDIRECT-then-restore mechanism is the only
+		 * thing that guarantees this; a broken restore would leave
+		 * the pin address in RSI.
+		 */
+		if (rsi_val != (unsigned long)sentinel_path)
+			_exit(12);
+		_exit(0);
+	}
+
+	ASSERT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req));
+	EXPECT_EQ(req.data.nr, __NR_openat);
+	EXPECT_EQ(req.data.args[1], (unsigned long)sentinel_path);
+
+	pin.id = req.id;
+	pin.memfd = memfd;
+	pin.target_addr = 0;
+	pin.size = PIN_SIZE;
+	EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_PIN_INSTALL, &pin)) {
+		if (errno == EINVAL) {
+			/* Reap the trapped child before skipping. */
+			kill(pid, SIGKILL);
+			waitpid(pid, &status, 0);
+			SKIP(goto cleanup,
+			     "Kernel lacks pinned-memfd remote install");
+		}
+	}
+
+	redir.id = req.id;
+	redir.flags = SECCOMP_REDIRECT_FLAG_CONTINUE;
+	redir.args_mask = 1U << 1;
+	redir.ptr_mask = 1U << 1;
+	redir.memfd = memfd;
+	redir.ptr_len[1] = strlen(safe_path) + 1;
+	redir.args[1] = pin.target_addr;
+	EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+			   &redir)) {
+		/*
+		 * The child never got a response and is still blocked in
+		 * the notification. Kill it, or the waitpid() below would
+		 * hang.
+		 */
+		kill(pid, SIGKILL);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status)) {
+		switch (WEXITSTATUS(status)) {
+		case 11:
+			TH_LOG("child exit 11: openat returned -errno");
+			break;
+		case 12:
+			TH_LOG("child exit 12: ABI violation -- RSI not restored after redirect");
+			break;
+		default:
+			TH_LOG("child exit %d (unexpected)", WEXITSTATUS(status));
+		}
+	}
+
+cleanup:
+	munmap(sup_view, PIN_SIZE);
+	close(memfd);
+	close(listener);
+}
+
+static void redir_sigusr1_handler(int signo)
+{
+	/*
+	 * The expected signal needs no action: returning lets the
+	 * interrupted syscall complete. Any other signo means the signal
+	 * frame was clobbered, so bail out with a distinct exit code
+	 * (_exit() is async-signal-safe).
+	 */
+	if (signo == SIGUSR1)
+		return;
+	_exit(12);
+}
+
+/*
+ * Regression test: a redirect's deferred arg-register restore must run
+ * before a signal frame is built, not after.
+ *
+ * The restore was queued as a TWA_RESUME task_work, which runs in
+ * exit_to_user_mode_loop() *after* arch_do_signal_or_restart() has
+ * already set up the handler frame (regs->di = signo, regs->si =
+ * &siginfo, regs->dx = &ucontext). The restore then overwrote those
+ * registers with the trapped syscall's original argument values, so the
+ * handler was entered with a corrupted signal number. Queuing the
+ * restore with TWA_SIGNAL makes it run at the top of get_signal(),
+ * before the frame is built (and before any syscall-restart rewind).
+ *
+ * The child traps on pause(), the supervisor redirects arg0 (RDI), and
+ * then interrupts it with SIGUSR1. The handler must observe
+ * signo == SIGUSR1, not the leaked original RDI sentinel.
+ *
+ * If the redirect itself fails, the child is killed so the waitpid()
+ * at the end cannot block on a task still parked in the notification.
+ */
+TEST(user_notification_redirect_signal_abi)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp_redirect redir = {};
+	/* A recognizable original RDI the broken restore would leak in. */
+	const unsigned long RDI_SENTINEL = 0x5a5a5a5aUL;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_notif_syscall(__NR_pause,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER |
+				      SECCOMP_FILTER_FLAG_REDIRECT);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct sigaction sa = {
+			.sa_handler = redir_sigusr1_handler,
+		};
+		unsigned long rdi_val = RDI_SENTINEL;
+		long rc;
+
+		if (sigaction(SIGUSR1, &sa, NULL))
+			_exit(10);
+
+		/*
+		 * Raw pause() carrying a controlled RDI sentinel. RDI is
+		 * an in/out operand because the redirect rewrites it in
+		 * the kernel; the compiler must not assume it survives.
+		 */
+		asm volatile(
+			"syscall"
+			: "=a"(rc), "+D"(rdi_val)
+			: "0"((long)__NR_pause)
+			: "rcx", "r11", "memory");
+
+		/* pause() returns -EINTR once the handler has run. */
+		if (rc != -EINTR)
+			_exit(11);
+		_exit(0);
+	}
+
+	ASSERT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req));
+	EXPECT_EQ(req.data.nr, __NR_pause);
+	EXPECT_EQ(req.data.args[0], RDI_SENTINEL);
+
+	/* Redirect arg0 (non-pointer); this arms the original-RDI restore. */
+	redir.id = req.id;
+	redir.flags = SECCOMP_REDIRECT_FLAG_CONTINUE;
+	redir.args_mask = 1U << 0;
+	redir.args[0] = 0;
+	EXPECT_EQ(0, ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND_REDIRECT,
+			   &redir)) {
+		if (errno == EINVAL) {
+			kill(pid, SIGKILL);
+			waitpid(pid, &status, 0);
+			SKIP(goto cleanup,
+			     "Kernel lacks SECCOMP_IOCTL_NOTIF_SEND_REDIRECT");
+		}
+		/*
+		 * Any other failure leaves the child parked in the
+		 * notification. Kill it, or the waitpid() below would
+		 * hang.
+		 */
+		kill(pid, SIGKILL);
+	}
+
+	/* Let the child reach pause(), then interrupt it with a handler. */
+	usleep(100000);
+	EXPECT_EQ(0, kill(pid, SIGUSR1));
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status)) {
+		switch (WEXITSTATUS(status)) {
+		case 10:
+			TH_LOG("child exit 10: sigaction failed");
+			break;
+		case 11:
+			TH_LOG("child exit 11: pause() did not return -EINTR");
+			break;
+		case 12:
+			TH_LOG("child exit 12: handler saw wrong signo (frame clobbered)");
+			break;
+		default:
+			TH_LOG("child exit %d (unexpected)", WEXITSTATUS(status));
+		}
+	}
+
+cleanup:
+	close(listener);
+}
+#endif /* __x86_64__ */
+
#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
--
2.43.0