Re: [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone

From: Sargun Dhillon
Date: Wed Nov 04 2020 - 06:21:40 EST


On Mon, Nov 02, 2020 at 09:37:04PM +0100, Jann Horn wrote:
> At the moment, the seccomp notifier API is hard to use without combining
> it with APIs like poll() or epoll(); if all target processes have gone
> away, the polling APIs will raise an error indication on the file
> descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.
>
> This usability problem was discovered when Michael Kerrisk wrote a
> manpage for this API.
>
> To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
> behave as follows:
>
> If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
> immediately, no matter whether a notification is available or not.
>
> If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
> notification is delivered to userspace or all users of the filter have
> gone away.
>
> To avoid subtle breakage from eventloop-style code that doesn't set
> O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
> wants blocking behavior, and if blocking-style code forgets to do so,
> that will be much more obvious than the breakage we'd get the other way
> around.
> This also means that UAPI breakage from this change should be limited to
> blocking users of the API, of which, to my knowledge, there are none so far
> (outside of in-tree sample and selftest code, which this patch adjusts - in
> particular the code in samples/ has to change a bunch).
>
> This should be backported because future userspace code might otherwise not
> work properly on old kernels.
>
> Cc: stable@xxxxxxxxxxxxxxx
> Reported-by: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
> Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
> ---
> kernel/seccomp.c | 62 +++++++++++++------
> samples/seccomp/user-trap.c | 16 +++++
> tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
> 3 files changed, 79 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8ad7a293255a..b3730740515f 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -43,6 +43,7 @@
> #include <linux/uaccess.h>
> #include <linux/anon_inodes.h>
> #include <linux/lockdep.h>
> +#include <linux/freezer.h>
>
> /*
> * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
> @@ -138,7 +139,6 @@ struct seccomp_kaddfd {
> * @notifications: A list of struct seccomp_knotif elements.
> */
> struct notification {
> - struct semaphore request;
> u64 next_id;
> struct list_head notifications;
> };
> @@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
> list_add(&n.list, &match->notif->notifications);
> INIT_LIST_HEAD(&n.addfd);
>
> - up(&match->notif->request);
> wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> mutex_unlock(&match->notify_lock);
>
> @@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)
>
>
> static long seccomp_notify_recv(struct seccomp_filter *filter,
> - void __user *buf)
> + void __user *buf, bool blocking)
> {
> struct seccomp_knotif *knotif = NULL, *cur;
> + DEFINE_WAIT(wait);
> struct seccomp_notif unotif;
> ssize_t ret;
>
> @@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>
> memset(&unotif, 0, sizeof(unotif));
>
> - ret = down_interruptible(&filter->notif->request);
> - if (ret < 0)
> - return ret;
> -
> mutex_lock(&filter->notify_lock);
> +
> +retry:
> list_for_each_entry(cur, &filter->notif->notifications, list) {
> if (cur->state == SECCOMP_NOTIFY_INIT) {
> knotif = cur;
> @@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
> }
> }
>
> - /*
> - * If we didn't find a notification, it could be that the task was
> - * interrupted by a fatal signal between the time we were woken and
> - * when we were able to acquire the rw lock.
> - */
> if (!knotif) {
> - ret = -ENOENT;
> - goto out;
> + if (!blocking) {
> + if (refcount_read(&filter->users) == 0)
> + ret = -ENOTCONN;
> + else
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + /* This has to happen before checking &filter->users. */
> + prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);

Isn't there a subtle race condition here, that if we are in blocking mode, and
we get a notification here, we wont be woken up? since we enter the wait queue
after checking to see if there are pending notifications, and in that period
the wake_up_poll could have been called?

I very well might be missing something interesting about the semantics of
freezable_schedule.

shouldn't it read something like:

retry:
mutex_lock(...);
ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
if (ret)
goto out;
list_for_each_entry(cur, &filter->notif->notifications, list) {
if (cur->state == SECCOMP_NOTIFY_INIT) {
knotif = cur;
break;
}
}

/* Maybe this should be before we iterate over the list of outstanding notifcations? */
if (refcount_read(&filter->users) == 0) {
ret = -ENOTCONN;
goto out;
}

if (!knotif) {
if (!blocking) {
ret = -ENOENT;
goto out;
}
mutex_unlock(...);
freezable_schedule();
goto retry;
}


out:
finish_wait(&filter->wqh, &wait);
mutex_unlock(...);
> +
> + /*
> + * If all users of the filter are gone, throw an error
> + * instead of pointlessly continuing to block.
> + */
> + if (refcount_read(&filter->users) == 0) {
> + finish_wait(&filter->wqh, &wait);
> + ret = -ENOTCONN;
> + goto out;
> + }
> +
> + /* No notifications - wait for one... */
> + mutex_unlock(&filter->notify_lock);
> + freezable_schedule();
> + finish_wait(&filter->wqh, &wait);
> +
> + /* ... and retry */
> + mutex_lock(&filter->notify_lock);
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + goto out;
> + }
> + goto retry;
> }
>
> unotif.id = knotif->id;
> @@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
> */
> mutex_lock(&filter->notify_lock);
> knotif = find_notification(filter, unotif.id);
> - if (knotif) {
> + if (knotif)
> knotif->state = SECCOMP_NOTIFY_INIT;
> - up(&filter->notif->request);
> - }
> mutex_unlock(&filter->notify_lock);
> }
>
> @@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> {
> struct seccomp_filter *filter = file->private_data;
> void __user *buf = (void __user *)arg;
> + bool blocking = !(file->f_flags & O_NONBLOCK);
>
> /* Fixed-size ioctls */
> switch (cmd) {
> case SECCOMP_IOCTL_NOTIF_RECV:
> - return seccomp_notify_recv(filter, buf);
> + return seccomp_notify_recv(filter, buf, blocking);
> case SECCOMP_IOCTL_NOTIF_SEND:
> return seccomp_notify_send(filter, buf);
> case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> @@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
> if (!filter->notif)
> goto out;
>
> - sema_init(&filter->notif->request, 0);
> filter->notif->next_id = get_random_u64();
> INIT_LIST_HEAD(&filter->notif->notifications);
>
> ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
> - filter, O_RDWR);
> + filter, O_RDWR|O_NONBLOCK);
> if (IS_ERR(ret))
> goto out_notif;
>
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> index 20291ec6489f..b9e666f15998 100644
> --- a/samples/seccomp/user-trap.c
> +++ b/samples/seccomp/user-trap.c
> @@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
> return ret;
> }
>
> +static void set_blocking(int fd)
> +{
> + int file_status_flags = fcntl(fd, F_GETFL);
> +
> + file_status_flags &= ~O_NONBLOCK;
> + if (fcntl(fd, F_SETFL, file_status_flags)) {
> + perror("F_SETFL");
> + exit(1);
> + }
> +}
> +
> int main(void)
> {
> int sk_pair[2], ret = 1, status, listener;
> @@ -274,6 +285,8 @@ int main(void)
> if (listener < 0)
> goto out_kill;
>
> + set_blocking(listener);
> +
> /*
> * Fork a task to handle the requests. This isn't strictly necessary,
> * but it makes the particular writing of this sample easier, since we
> @@ -307,6 +320,9 @@ int main(void)
> while (1) {
> memset(req, 0, sizes.seccomp_notif);
> if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
> + if (errno == ENOTCONN)
> + exit(0);
> +
> perror("ioctl recv");
> goto out_resp;
> }
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 4a180439ee9e..5318f9cb1aec 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
> #endif
> }
>
> +#define set_blocking(fd) ({ \
> + int file_status_flags; \
> + file_status_flags = fcntl(fd, F_GETFL); \
> + file_status_flags &= ~O_NONBLOCK; \
> + ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0); \
> +})
> +
> /* Have TH_LOG report actual location filecmp() is used. */
> #define filecmp(pid1, pid2, fd1, fd2) ({ \
> int _ret; \
> @@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> /*
> * Check that nothing bad happens when we kill the task in the middle
> * of a syscall.
> @@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
> @@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
> @@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
> @@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
> @@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
> @@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
> SECCOMP_FILTER_FLAG_NEW_LISTENER);
> ASSERT_GE(listener, 0);
>
> + set_blocking(listener);
> +
> pid = fork();
> ASSERT_GE(pid, 0);
>
>
> base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
> --
> 2.29.1.341.ge80a0c044ae-goog
>