[PATCH] time: Prevent union confusion from unexpected restart_syscall()
From: Jann Horn
Date: Thu Jan 05 2023 - 08:44:44 EST
The nanosleep syscalls use the restart_block mechanism, with a quirk:
The `type` and `rmtp`/`compat_rmtp` fields are set up unconditionally on
syscall entry, while the rest of the restart_block is only set up in the
unlikely case that the syscall is actually interrupted by a signal (or
pseudo-signal) that doesn't have a signal handler.
If the restart_block was set up by a previous syscall
(futex(..., FUTEX_WAIT, ...) or poll()) and hasn't been invalidated
somehow since then, this will clobber some of the union fields used by
futex_wait_restart()/do_restart_poll().
If userspace afterwards wrongly calls the restart_syscall syscall,
futex_wait_restart()/do_restart_poll() will read struct fields that have
been clobbered.
This doesn't actually lead to anything particularly interesting because
none of the union fields contain trusted kernel data, and
futex(..., FUTEX_WAIT, ...) and poll() aren't syscalls where it makes much
sense to apply seccomp filters to their arguments.
So the current consequences are just of the "if userspace does bad stuff,
it can damage itself, and that's not a problem" flavor.
But still, it seems like a hazard for future developers, so invalidate the
restart_block when partly setting it up in the nanosleep syscalls.
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
Reproducer: demonstrates nanosleep() clobbering the upper half of
current->restart_block.poll.ufds (with TT_NATIVE==1) and
current->restart_block.poll.nfds (with 42):
user@vm:~/restart_syscall$ cat restart_syscall_union_confusion.c
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <stdio.h>
#include <poll.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/select.h>
#include <sys/wait.h>
#include <sys/syscall.h>
/*
 * SYSCHK(expr): evaluate expr exactly once and yield its value. If the value
 * compares equal to -1 cast to expr's type, call err(1, ...) with the
 * stringified invocation as the message.
 *
 * __typeof__ is used instead of typeof so the macro also builds in strict ISO
 * modes (-std=c99 / -std=c11), where plain typeof is not a keyword. The
 * temporary avoids a leading double underscore, which is reserved for the
 * implementation.
 */
#define SYSCHK(x) ({ \
	__typeof__(x) syschk_res_ = (x); \
	if (syschk_res_ == (__typeof__(x))-1) \
		err(1, "SYSCHK(" #x ")"); \
	syschk_res_; \
})
/*
 * Reproducer driver.
 *
 * Parent: sleeps, sends SIGSTOP and then SIGCONT to the child (intended to
 * land while the child is still inside its 2000 ms poll()), then reaps it.
 *
 * Child: maps a pollfd array at the fixed address 0x100000000, fills 100
 * entries that watch /dev/null for POLLOUT, runs poll(NULL, 0, 2000), runs a
 * nanosleep() with rmtp == 42, then invokes restart_syscall() directly and
 * prints the revents field of the first 50 mapped entries.
 */
int main(void) {
	int child = SYSCHK(fork());

	if (child != 0) {
		/* parent: stop and resume the child, then wait for it to exit */
		int status;

		sleep(1);
		printf("sending SIGSTOP\n");
		kill(child, SIGSTOP);
		sleep(1);
		printf("sending SIGCONT\n");
		kill(child, SIGCONT);
		printf("waiting for child...\n");
		SYSCHK(waitpid(child, &status, 0));
		return 0;
	}

	/* child from here on */
	struct pollfd *fds = SYSCHK(mmap((void*)0x100000000, 0x1000,
		PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE,
		-1, 0));
	int null_fd = SYSCHK(open("/dev/null", O_WRONLY));
	const struct pollfd writable = { .fd = null_fd, .events = POLLOUT };

	for (int idx = 0; idx < 100; idx++)
		fds[idx] = writable;

	errno = 0;
	int poll_rc = poll(NULL, 0, 2000);
	printf("poll = %d (%m)\n", poll_rc);

	/*
	 * Per the patch description, nanosleep sets the restart_block
	 * nanosleep.{type,rmtp} fields unconditionally on syscall entry.
	 */
	struct timespec ts_sleep = { .tv_nsec = 1000 };
	SYSCHK(nanosleep(&ts_sleep, (void*)42UL));

	errno = 0;
	int restart_rc = syscall(__NR_restart_syscall);
	if (restart_rc != -1 || errno != EINTR)
		printf("restart_syscall() = %d (%m)\n", restart_rc);
	else
		printf("restart_syscall() returned EINTR, probably do_no_restart_syscall\n");

	for (int idx = 0; idx < 50; idx++)
		printf("pollfds[%d].revents = 0x%x\n", idx, fds[idx].revents);
	exit(0);
}
user@vm:~/restart_syscall$ gcc -o restart_syscall_union_confusion restart_syscall_union_confusion.c
user@vm:~/restart_syscall$ ./restart_syscall_union_confusion
sending SIGSTOP
sending SIGCONT
waiting for child...
poll = 0 (Success)
restart_syscall() = 42 (Success)
pollfds[0].revents = 0x4
pollfds[1].revents = 0x4
pollfds[2].revents = 0x4
pollfds[3].revents = 0x4
pollfds[4].revents = 0x4
pollfds[5].revents = 0x4
pollfds[6].revents = 0x4
pollfds[7].revents = 0x4
pollfds[8].revents = 0x4
pollfds[9].revents = 0x4
pollfds[10].revents = 0x4
pollfds[11].revents = 0x4
pollfds[12].revents = 0x4
pollfds[13].revents = 0x4
pollfds[14].revents = 0x4
pollfds[15].revents = 0x4
pollfds[16].revents = 0x4
pollfds[17].revents = 0x4
pollfds[18].revents = 0x4
pollfds[19].revents = 0x4
pollfds[20].revents = 0x4
pollfds[21].revents = 0x4
pollfds[22].revents = 0x4
pollfds[23].revents = 0x4
pollfds[24].revents = 0x4
pollfds[25].revents = 0x4
pollfds[26].revents = 0x4
pollfds[27].revents = 0x4
pollfds[28].revents = 0x4
pollfds[29].revents = 0x4
pollfds[30].revents = 0x4
pollfds[31].revents = 0x4
pollfds[32].revents = 0x4
pollfds[33].revents = 0x4
pollfds[34].revents = 0x4
pollfds[35].revents = 0x4
pollfds[36].revents = 0x4
pollfds[37].revents = 0x4
pollfds[38].revents = 0x4
pollfds[39].revents = 0x4
pollfds[40].revents = 0x4
pollfds[41].revents = 0x4
pollfds[42].revents = 0x0
pollfds[43].revents = 0x0
pollfds[44].revents = 0x0
pollfds[45].revents = 0x0
pollfds[46].revents = 0x0
pollfds[47].revents = 0x0
pollfds[48].revents = 0x0
pollfds[49].revents = 0x0
kernel/time/hrtimer.c | 2 ++
kernel/time/posix-stubs.c | 2 ++
kernel/time/posix-timers.c | 2 ++
3 files changed, 6 insertions(+)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e4f0e3b0c4f4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 90ea5f373e50..828aeecbd1e8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5dead89308b7..0c8a87a11b39 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
base-commit: 41c03ba9beea760bd2d2ac9250b09a2e192da2dc
--
2.39.0.314.g84b9a713c41-goog