io_uring: should IORING_TIMEOUT_ABS honour the submitter's time namespace?

From: Xie Maoyi

Date: Sat May 02 2026 - 05:21:23 EST

Hi all,

I think I have found what might be a bug in io_uring's absolute-deadline path on v7.0 mainline, and I would appreciate your confirmation on whether it is actually a bug and whether it is worth fixing.

When a process inside a CLONE_NEWTIME time namespace submits IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS, the deadline is interpreted in host CLOCK_MONOTONIC instead of the caller's namespace view, so the timer can fire at the wrong moment.

A small reproducer (poc_iou_timens.c, attached) sets a -10 second CLOCK_MONOTONIC offset in a fresh time namespace and submits a "now + 1 second" absolute deadline. On vanilla v7.0 the CQE comes back in well under a millisecond instead of the expected ~1 second.

=== baseline (host time_ns) ===
[parent] elapsed=1000.478 ms, cqe res=-62
=== child (NEWTIME, monotonic offset -10s) ===
[child] elapsed=0.797 ms, cqe res=-62

The other absolute-deadline interfaces (timer_settime, clock_nanosleep with TIMER_ABSTIME, alarm_timer_nsleep with TIMER_ABSTIME, timerfd_settime with TFD_TIMER_ABSTIME) all run a user-supplied absolute timestamp through timens_ktime_to_host() before arming the hrtimer. io_uring/timeout.c does not, which is why I am bringing it up. CONFIG_TIME_NS landed in 5.6 and IORING_TIMEOUT_ABS predates it. I do not know whether this was a deliberate choice when CONFIG_TIME_NS landed or simply not considered at the time, so I would appreciate your view.

Could you let me know whether you consider this a bug worth fixing? If yes, I would be happy to send a patch and a SQPOLL follow-up in a separate thread.

I have only tested the non-SQPOLL synchronous io_uring_enter path on x86_64 with KASAN and lockdep enabled. I have a small patch that fixes the synchronous path and have re-run the same reproducer against it, where the child now sees ~1000 ms as expected.

Attachments:
poc_iou_timens.c -- C reproducer, raw io_uring syscalls
poc_post_patch.log -- reproducer output on the patched v7.0

Thanks for taking a look, and apologies in advance if this is already known or out of scope.

Best regards,
Maoyi
Nanyang Technological University
https://maoyixie.com/
________________________________

CONFIDENTIALITY: This email is intended solely for the person(s) named and may be confidential and/or privileged. If you are not the intended recipient, please delete it, notify us and do not copy, use, or disclose its contents.
Towards a sustainable earth: Print only when necessary. Thank you.

Attachment: poc_iou_timens.c
Description: poc_iou_timens.c

/* PoC for io_uring IORING_OP_TIMEOUT (IORING_TIMEOUT_ABS) ignoring time
* namespace offsets.
*
* Idea:
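* 1. unshare(CLONE_NEWUSER | CLONE_NEWTIME) and write a negative monotonic
* offset (-10s) to /proc/self/timens_offsets, so that a process inside
* the new time namespace sees CLOCK_MONOTONIC = host + offset, i.e. 10s
* behind the host. Then fork(): the child is created inside the new
* time namespace (the one shown as /proc/self/ns/time_for_children).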
* 2. In the child (in time_ns), read CLOCK_MONOTONIC -> t_ns; submit
* IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS, deadline = t_ns + 1s.
* Measure how long io_uring_enter blocks waiting for the CQE.
*
* Vanilla bug behaviour: io_uring computes hrtimer in host CLOCK_MONOTONIC
* view; child's t_ns + 1s is way in the past from host's POV (because
* the offset shifts time_ns CLOCK_MONOTONIC into the past). hrtimer
* fires immediately. Observed wait << 1s (often microseconds).
*
* Fixed kernel behaviour: io_uring converts t_ns + 1s through
* timens_ktime_to_host(); hrtimer fires ~1s after submit. Observed wait
* ~1s.
*
* Build: gcc poc_iou_timens.c -o poc_iou_timens
* Run: ./poc_iou_timens (must be unprivileged-userns capable kernel)
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <time.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* Fallback for toolchains whose <sched.h> does not define CLONE_NEWTIME. */
#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080
#endif

/*
 * From <linux/io_uring.h>: enum codes (verify they match the running kernel UAPI).
 *
 * NOTE(review): #ifndef only detects preprocessor macros. If the included
 * header declares IORING_OP_TIMEOUT as an enum constant rather than a macro,
 * the fallback below is always applied on top of it -- harmless as long as
 * the numeric value agrees with the header; confirm against the UAPI in use.
 */
#ifndef IORING_OP_TIMEOUT
#define IORING_OP_TIMEOUT 11
#endif
#ifndef IORING_TIMEOUT_ABS
#define IORING_TIMEOUT_ABS (1U << 0)
#endif

/*
 * Raw-syscall wrapper for io_uring_setup.
 *
 * entries: requested number of submission queue entries.
 * p:       parameter block passed through to the kernel unchanged.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	long ret = syscall(__NR_io_uring_setup, entries, p);

	return (int)ret;
}
/*
 * Raw-syscall wrapper for io_uring_enter.
 *
 * fd:           io_uring file descriptor.
 * to_submit:    number of SQEs to submit.
 * min_complete: number of completions to wait for.
 * flags:        IORING_ENTER_* flags.
 * sig:          optional signal mask to apply while waiting, or NULL.
 *
 * The sixth syscall argument is the size in bytes of the signal set that
 * `sig` points to. It is passed as _NSIG / 8 (the kernel's sigset size)
 * instead of 0 so that a non-NULL `sig` is accepted by the kernel; with a
 * NULL `sig` the size is not used.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
static int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
			  unsigned flags, sigset_t *sig)
{
	return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
			    flags, sig, _NSIG / 8);
}

/*
 * Userspace view of one io_uring instance as set up by ring_setup():
 * the ring file descriptor, the three mmap()ed regions with their sizes,
 * and pointers into those regions.
 */
struct ring {
	/* Descriptor returned by io_uring_setup(). */
	int fd;

	/* Submission ring mapping (IORING_OFF_SQ_RING) and its length. */
	void *sq_ptr;
	size_t sq_size;

	/* Completion ring mapping (IORING_OFF_CQ_RING) and its length. */
	void *cq_ptr;
	size_t cq_size;

	/* SQE array mapping (IORING_OFF_SQES) and its length. */
	void *sqe_ptr;
	size_t sqe_size;

	/* Pointers into the submission ring mapping. */
	unsigned *sq_head;
	unsigned *sq_tail;
	unsigned *sq_mask;
	unsigned *sq_array;

	/* Pointers into the completion ring mapping. */
	unsigned *cq_head;
	unsigned *cq_tail;
	unsigned *cq_mask;

	/* Typed views of the SQE array and of the CQE array. */
	struct io_uring_sqe *sqes;
	struct io_uring_cqe *cqes;
};

/*
 * Create an io_uring instance and map its shared regions into this process.
 *
 * r:       ring descriptor to fill in; every field is written on success.
 * entries: number of submission entries requested from io_uring_setup().
 *
 * Returns 0 on success. On failure prints a diagnostic with perror(),
 * releases whatever was already acquired (mappings and the ring fd, which
 * is then set to -1) and returns -1.
 */
static int ring_setup(struct ring *r, unsigned entries)
{
	struct io_uring_params p = {0};
	char *sq_base;
	char *cq_base;

	/* Mark all mappings as absent so the error path knows what to undo. */
	r->sq_ptr = MAP_FAILED;
	r->cq_ptr = MAP_FAILED;
	r->sqe_ptr = MAP_FAILED;

	r->fd = io_uring_setup(entries, &p);
	if (r->fd < 0) {
		perror("io_uring_setup");
		return -1;
	}

	/* Region sizes are derived from the offsets the kernel reported in p. */
	r->sq_size = p.sq_off.array + p.sq_entries * sizeof(unsigned);
	r->cq_size = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	r->sqe_size = p.sq_entries * sizeof(struct io_uring_sqe);

	r->sq_ptr = mmap(0, r->sq_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_POPULATE, r->fd, IORING_OFF_SQ_RING);
	if (r->sq_ptr == MAP_FAILED)
		goto err_mmap;

	r->cq_ptr = mmap(0, r->cq_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_POPULATE, r->fd, IORING_OFF_CQ_RING);
	if (r->cq_ptr == MAP_FAILED)
		goto err_mmap;

	r->sqe_ptr = mmap(0, r->sqe_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_POPULATE, r->fd, IORING_OFF_SQES);
	if (r->sqe_ptr == MAP_FAILED)
		goto err_mmap;

	/* Byte-granular arithmetic via char * (void * arithmetic is a GNU extension). */
	sq_base = r->sq_ptr;
	cq_base = r->cq_ptr;

	r->sq_head = (unsigned *)(sq_base + p.sq_off.head);
	r->sq_tail = (unsigned *)(sq_base + p.sq_off.tail);
	r->sq_mask = (unsigned *)(sq_base + p.sq_off.ring_mask);
	r->sq_array = (unsigned *)(sq_base + p.sq_off.array);
	r->cq_head = (unsigned *)(cq_base + p.cq_off.head);
	r->cq_tail = (unsigned *)(cq_base + p.cq_off.tail);
	r->cq_mask = (unsigned *)(cq_base + p.cq_off.ring_mask);
	r->sqes = r->sqe_ptr;
	r->cqes = (struct io_uring_cqe *)(cq_base + p.cq_off.cqes);
	return 0;

err_mmap:
	/* Report first: the cleanup calls below may overwrite errno. */
	perror("mmap");
	if (r->sqe_ptr != MAP_FAILED)
		munmap(r->sqe_ptr, r->sqe_size);
	if (r->cq_ptr != MAP_FAILED)
		munmap(r->cq_ptr, r->cq_size);
	if (r->sq_ptr != MAP_FAILED)
		munmap(r->sq_ptr, r->sq_size);
	close(r->fd);
	r->fd = -1;
	return -1;
}

/*
 * Queue a single IORING_OP_TIMEOUT request carrying IORING_TIMEOUT_ABS and
 * wait for one completion.
 *
 * r:        ring prepared by ring_setup().
 * deadline: timespec handed to the kernel through sqe->addr.
 *
 * Returns the result of io_uring_enter() (submit 1, wait for 1,
 * IORING_ENTER_GETEVENTS).
 */
static int submit_timeout_abs(struct ring *r, struct __kernel_timespec *deadline)
{
	const unsigned cur_tail = *r->sq_tail;
	const unsigned slot = cur_tail & *r->sq_mask;
	struct io_uring_sqe *entry = r->sqes + slot;

	/* Start from an all-zero SQE, then fill in the timeout request. */
	memset(entry, 0, sizeof(*entry));
	entry->user_data = 0xCAFEBABE;
	entry->timeout_flags = IORING_TIMEOUT_ABS;
	entry->off = 0;
	entry->len = 1;
	entry->addr = (uintptr_t)deadline;
	entry->fd = -1;
	entry->opcode = IORING_OP_TIMEOUT;

	/* Point the SQ index array at the SQE, then publish the new tail. */
	r->sq_array[slot] = slot;
	__atomic_store_n(r->sq_tail, cur_tail + 1, __ATOMIC_RELEASE);

	return io_uring_enter(r->fd, 1, 1, IORING_ENTER_GETEVENTS, NULL);
}

/*
 * Difference b - a in nanoseconds.
 *
 * a: earlier timestamp.
 * b: later timestamp.
 *
 * The result is negative when b precedes a.
 */
static long elapsed_ns(struct timespec *a, struct timespec *b)
{
	time_t sec_delta = b->tv_sec - a->tv_sec;
	long nsec_delta = b->tv_nsec - a->tv_nsec;

	return sec_delta * 1000000000L + nsec_delta;
}

/*
 * Submit an absolute "now + 1s" CLOCK_MONOTONIC timeout through io_uring
 * and report how long io_uring_enter() blocked.
 *
 * do_unshare_time: non-zero when called from the process that lives in the
 *                  new time namespace; selects the "child" log label and
 *                  enables the timing verdict.
 *
 * Returns 1 if the ring could not be set up, 0 otherwise (the verdict is
 * only printed, not reflected in the return value).
 */
static int run_in_timens(int do_unshare_time)
{
	const char *who = do_unshare_time ? "child" : "parent";

	/* Read CLOCK_MONOTONIC in current namespace */
	struct timespec t0;
	clock_gettime(CLOCK_MONOTONIC, &t0);

	/* readlink() does not NUL-terminate; terminate at the returned length. */
	char nsa[64];
	int rl = readlink("/proc/self/ns/time", nsa, sizeof(nsa) - 1);
	nsa[rl > 0 ? rl : 0] = 0;
	fprintf(stderr, "[%s] netns-time=%s, CLOCK_MONOTONIC=%ld.%09ld\n",
		who, nsa, (long)t0.tv_sec, (long)t0.tv_nsec);

	struct ring r = {0};
	if (ring_setup(&r, 8) < 0)
		return 1;

	/* Deadline = now + 1s, ABS in caller's time view */
	struct timespec now;
	clock_gettime(CLOCK_MONOTONIC, &now);
	struct __kernel_timespec d;
	d.tv_sec = now.tv_sec + 1;
	d.tv_nsec = now.tv_nsec;

	struct timespec t_pre, t_post;
	clock_gettime(CLOCK_MONOTONIC, &t_pre);
	int rc = submit_timeout_abs(&r, &d);
	/*
	 * Capture errno right away and only when the call failed: a later
	 * library call may overwrite it, and after a successful call it
	 * would just be a stale value from something earlier.
	 */
	int saved_errno = (rc < 0) ? errno : 0;
	clock_gettime(CLOCK_MONOTONIC, &t_post);
	long ns = elapsed_ns(&t_pre, &t_post);

	/* Check CQE */
	unsigned head = *r.cq_head;
	unsigned tail = __atomic_load_n(r.cq_tail, __ATOMIC_ACQUIRE);
	int got_cqe = 0;
	int cqe_res = 0;
	if (tail != head) {
		struct io_uring_cqe *cqe = &r.cqes[head & *r.cq_mask];
		cqe_res = cqe->res;
		got_cqe = 1;
		__atomic_store_n(r.cq_head, head + 1, __ATOMIC_RELEASE);
	}
	fprintf(stderr,
		"[%s] io_uring_enter rc=%d errno=%d, elapsed=%ld.%03ld ms, "
		"cqe={present=%d,res=%d}\n",
		who, rc, saved_errno, ns / 1000000, (ns / 1000) % 1000,
		got_cqe, cqe_res);

	if (do_unshare_time) {
		if (rc < 0 || !got_cqe || cqe_res != -ETIME) {
			/*
			 * A failed submit or a CQE that is not a timer expiry
			 * also returns almost instantly; do not mistake that
			 * for the time-namespace bug.
			 */
			fprintf(stderr,
				"[child] inconclusive: timeout did not complete with "
				"-ETIME (rc=%d, cqe present=%d, res=%d)\n",
				rc, got_cqe, cqe_res);
		} else if (ns < 100 * 1000 * 1000L) {
			fprintf(stderr,
				"[child] *** BUG: ABS deadline 1s in future fired in %ld ms -- "
				"io_uring is using HOST CLOCK_MONOTONIC, not the time_ns view ***\n",
				ns / 1000000);
		} else if (ns >= 800 * 1000 * 1000L && ns <= 1500 * 1000 * 1000L) {
			fprintf(stderr,
				"[child] OK: deadline fired ~1s as expected (time_ns offset honoured)\n");
		} else {
			fprintf(stderr,
				"[child] ??? unexpected timing %ld ms\n", ns / 1000000);
		}
	}

	munmap(r.sq_ptr, r.sq_size);
	munmap(r.cq_ptr, r.cq_size);
	munmap(r.sqe_ptr, r.sqe_size);
	close(r.fd);
	return 0;
}

/*
 * Write the NUL-terminated string `data` to the file at `path`.
 *
 * Returns 0 when the whole string was written, -1 otherwise with errno
 * describing the failure (EIO is used for a short write).
 */
static int write_proc_file(const char *path, const char *data)
{
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	size_t len = strlen(data);
	ssize_t written = write(fd, data, len);
	int saved_errno = errno;

	close(fd);
	if (written < 0) {
		errno = saved_errno;
		return -1;
	}
	if ((size_t)written != len) {
		errno = EIO;
		return -1;
	}
	return 0;
}

/*
 * Run the timeout measurement once in the original time namespace as a
 * baseline, then create a user + time namespace with a -10s
 * CLOCK_MONOTONIC / CLOCK_BOOTTIME offset and repeat the measurement in a
 * forked child.
 *
 * Returns 0 when the child ran and exited with status 0, 1 on any setup
 * failure or when the child did not exit cleanly.
 */
int main(void)
{
	/* First in current ns: sanity baseline (must take ~1s). */
	fprintf(stderr, "=== baseline (host time_ns) ===\n");
	run_in_timens(0);

	/*
	 * Remember our ids as seen from the current user namespace: the
	 * uid_map/gid_map lines written below must name them as the outer id.
	 * When run as root both are 0, giving the classic "0 0 1" mapping.
	 */
	const unsigned long outer_uid = (unsigned long)geteuid();
	const unsigned long outer_gid = (unsigned long)getegid();

	/* Now create a time_ns with a non-zero monotonic offset and re-run. */
	/* unshare CLONE_NEWUSER|CLONE_NEWTIME, write the offsets via
	 * /proc/self/timens_offsets, then fork a child to run the test again. */
	if (unshare(CLONE_NEWUSER | CLONE_NEWTIME) < 0) {
		perror("unshare(NEWUSER|NEWTIME)");
		return 1;
	}

	/* The id mappings are best effort: warn on failure but keep going. */
	char map[64];
	if (write_proc_file("/proc/self/setgroups", "deny") < 0)
		perror("warning: /proc/self/setgroups");
	snprintf(map, sizeof(map), "0 %lu 1\n", outer_uid);
	if (write_proc_file("/proc/self/uid_map", map) < 0)
		perror("warning: /proc/self/uid_map");
	snprintf(map, sizeof(map), "0 %lu 1\n", outer_gid);
	if (write_proc_file("/proc/self/gid_map", map) < 0)
		perror("warning: /proc/self/gid_map");

	/* Set monotonic offset = -10 sec (shift child's MONOTONIC 10s into the
	 * past relative to host). Format: "<clkid> <secs> <nanos>\n".
	 * CLOCK_MONOTONIC=1, CLOCK_BOOTTIME=7. The kernel rejects offsets
	 * larger than current uptime; -10s is fine after a few seconds of boot. */
	if (write_proc_file("/proc/self/timens_offsets", "1 -10 0\n7 -10 0\n") < 0) {
		perror("open/write /proc/self/timens_offsets");
		return 1;
	}

	/* fork; the child is created in the new time namespace. */
	pid_t pid = fork();
	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		fprintf(stderr, "\n=== child (NEWTIME, monotonic offset -10s) ===\n");
		_exit(run_in_timens(1));
	}

	int st;
	if (waitpid(pid, &st, 0) < 0) {
		perror("waitpid");
		return 1;
	}
	return (WIFEXITED(st) && WEXITSTATUS(st) == 0) ? 0 : 1;
}