[PATCH v4 4/6] seccomp: re-validate a redirected syscall against outer filters

From: Cong Wang

Date: Fri Jun 26 2026 - 21:23:44 EST


From: Cong Wang <cwang@xxxxxxxxxxxxxx>

Stacked seccomp filters are composed by seccomp_run_filters(), which runs
every filter once against the same seccomp_data and takes the minimum of
the results: every filter votes on the same registers, and the most
restrictive verdict wins. That model assumes the syscall the filters
voted on is the syscall that actually runs.

SECCOMP_IOCTL_NOTIF_SEND_REDIRECT breaks that assumption. A USER_NOTIF
verdict only wins the min when every other filter -- in particular every
outer, earlier-installed one -- returned something at least as permissive
for the *original* registers. The supervisor then rewrites the argument
registers and the syscall resumes via FLAG_CONTINUE without the filter
stack being consulted again. So the notifier of an inner,
container-installed filter can redirect a syscall into a form that an
outer filter would have blocked.

Close the hole by resuming evaluation after a redirect instead of
re-running the min. A redirect turns one syscall into a different one, so
the outer filters cannot simply re-vote on a single seccomp_data; each
must judge the substituted syscall in turn, exactly as if the target had
issued it directly. Starting at the filter immediately outer to the one
that notified (match->prev), seccomp_run_filters_seq() walks strictly
toward the root, evaluating one filter at a time and stopping at the
first one that does not allow the call; ALLOW and LOG (which only logs)
are non-blocking and continue to the next filter. A BPF filter can only
read the seccomp_data built from the registers, so re-evaluating on a
seccomp_data repopulated from the substituted registers is complete.

An outer filter may legitimately ERRNO/KILL/TRAP/TRACE or run its own
USER_NOTIF on the redirected syscall; if an outer notifier redirects
again, evaluation continues from its own ->prev. Walking strictly toward
the root is monotonic: the notifying filter and everything inner to it
are never reconsulted, so a notifier cannot loop by re-notifying on its
own redirect, and the walk is iterative (goto, not recursion) so a deep
stack of redirecting filters cannot exhaust the kernel stack.

Only a redirect triggers the continuation. The first pass is the
unchanged min over the full stack, and the allow-cache is consulted only
there, so nothing changes for existing, non-redirect users.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang@xxxxxxxxxxxxxx>
---
kernel/seccomp.c | 79 ++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7d499e422ba1..84812ce9bdb3 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -94,6 +94,13 @@ struct seccomp_knotif {
long val;
u32 flags;

+ /*
+ * Set by SEND_REDIRECT: the reply rewrote the syscall's registers,
+ * so on resume the syscall must be re-evaluated against the filters
+ * outer to the one that notified (see __seccomp_filter()).
+ */
+ bool redirect;
+
/*
* Signals when this has changed states, such as the listener
* dying, a new seccomp addfd message, or changing to REPLIED
@@ -1181,10 +1188,12 @@ static bool should_sleep_killable(struct seccomp_filter *match,

static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
- const struct seccomp_data *sd)
+ const struct seccomp_data *sd,
+ bool *redirected)
{
int err;
u32 flags = 0;
+ bool redirect = false;
long ret = 0;
struct seccomp_knotif n = {};
struct seccomp_kaddfd *addfd, *tmp;
@@ -1241,6 +1250,7 @@ static int seccomp_do_user_notification(int this_syscall,
ret = n.val;
err = n.error;
flags = n.flags;
+ redirect = n.redirect;

interrupted:
/* If there were any pending addfd calls, clear them out */
@@ -1267,14 +1277,38 @@ static int seccomp_do_user_notification(int this_syscall,
mutex_unlock(&match->notify_lock);

/* Userspace requests to continue the syscall. */
- if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) {
+ *redirected = redirect;
return 0;
+ }

syscall_set_return_value(current, current_pt_regs(),
err, ret);
return -1;
}

+static u32 seccomp_run_filters_seq(const struct seccomp_data *sd,
+ struct seccomp_filter **match,
+ struct seccomp_filter *f,
+ int this_syscall)
+{
+ for (; f; f = f->prev) { /* one filter at a time, following ->prev from the start filter */
+ u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
+ u32 action = cur_ret & SECCOMP_RET_ACTION_FULL;
+
+ if (action == SECCOMP_RET_ALLOW)
+ continue; /* this filter permits the call; ask the next one */
+ /* LOG does not block the syscall; record it and keep walking the chain. */
+ if (action == SECCOMP_RET_LOG) {
+ seccomp_log(this_syscall, 0, action, true);
+ continue;
+ }
+ *match = f; /* first verdict that is neither ALLOW nor LOG ends the walk */
+ return cur_ret; /* full return value, action and data bits included */
+ }
+ return SECCOMP_RET_ALLOW; /* chain exhausted; *match is left untouched */
+}
+
static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
{
u32 filter_ret, action;
@@ -1291,6 +1325,8 @@ static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
populate_seccomp_data(&sd);

filter_ret = seccomp_run_filters(&sd, &match);
+
+eval:
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;

@@ -1353,11 +1389,40 @@ static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)

return 0;

- case SECCOMP_RET_USER_NOTIF:
- if (seccomp_do_user_notification(this_syscall, match, &sd))
+ case SECCOMP_RET_USER_NOTIF: {
+ struct seccomp_filter *outer;
+ bool redirected = false;
+
+ if (seccomp_do_user_notification(this_syscall, match, &sd,
+ &redirected))
goto skip;

+ if (redirected && match->prev) {
+ /*
+ * The notifier rewrote the registers. Resume
+ * evaluation at the next outer filter on the
+ * substituted syscall, sequentially toward the root:
+ * each outer filter judges the new syscall exactly as
+ * if the target had issued it. Walking outward is
+ * monotonic, so a notifier cannot re-notify on its own
+ * redirect.
+ */
+ this_syscall = syscall_get_nr(current,
+ current_pt_regs());
+ if (this_syscall < 0)
+ return 0;
+ outer = match->prev;
+ match = NULL;
+ populate_seccomp_data(&sd);
+ filter_ret = seccomp_run_filters_seq(&sd, &match, outer,
+ this_syscall);
+ if (!match)
+ return 0;
+ goto eval;
+ }
+
return 0;
+ }

case SECCOMP_RET_LOG:
seccomp_log(this_syscall, 0, action, true);
@@ -2154,13 +2219,15 @@ static long seccomp_notify_send_redirect(struct seccomp_filter *filter,
}

/*
- * Mark REPLIED with FLAG_CONTINUE so the wait-loop exit path
- * runs the syscall normally.
+ * Mark REPLIED with FLAG_CONTINUE so the wait-loop exit path runs the
+ * syscall normally. Flag the redirect so the resume path re-validates
+ * the rewritten syscall against the filters outer to this one.
*/
knotif->state = SECCOMP_NOTIFY_REPLIED;
knotif->error = 0;
knotif->val = 0;
knotif->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+ knotif->redirect = true;
if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
complete_on_current_cpu(&knotif->ready);
else
--
2.43.0