Re: [RFC] in-kernel rseq

From: Peter Zijlstra

Date: Tue Feb 24 2026 - 10:19:03 EST


On Tue, Feb 24, 2026 at 12:16:46PM +0100, Heiko Carstens wrote:
> On Mon, Feb 23, 2026 at 05:38:43PM +0100, Peter Zijlstra wrote:
> > This means, it needs to be woven into the asm... and I'm not that handy
> > with arm64 asm.
> >
> > The pseudo code would be something like:
> >
> > current->sched_seq = &_R;
> > ...
> >
> > _start: compute per cpu-addr
> > load addr
> > $OP
> > _commit: store addr
> >
> > ...
> > current->sched_rseq = NULL;
> >
> >
> > Then when preemption happens (from interrupt), the instruction pointer
> > is 'simply' reset to _start and it tries again.
>
> I guess also on every interrupt, exception, and nmi current->sched_rseq needs
> to be saved on entry, and restored on exit, since other contexts can make use
> of this_cpu ops as well.

Right -- so I can't seem to make my mind up on this. I *think* I like
the save/restore version of the sched version better.

Having it restart for every interrupt, even though it's guaranteed to not
change the process, seems unfortunate. Interrupts can be fairly high
rate without the task changing.

Anyway, I've cobbled together something a little more elaborate, but
equally untested.

I've renamed it kseq, to be distinct from the existing rseq, and there
are two versions, one sched and one irq based. The sched one is
saved/restored, while the irq one is not.

For both, the architecture is 'required' to provide a function/macro
that gives the address of the pointer, the sched one takes a task as
argument, but that could be completely ignored.

This allows you to use whatever storage you think best, lowcore on s390,
paca on Power, whatever.

Anyway, tglx will probably hate on all this for adding more crap :-)

---
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d26d1b1bcbfb..3f6d4ceaf3a1 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -362,9 +362,26 @@ typedef struct irqentry_state {
bool exit_rcu;
bool lockdep;
};
+#ifdef CONFIG_KSEQ_SCHED
+ void *kseq_sched;
+#endif
} irqentry_state_t;
#endif

+static __always_inline void irqentry_kseq_push(struct irqentry_state *state)
+{
+#ifdef CONFIG_KSEQ_SCHED
+ state->kseq_sched = *kseq_sched_ptr(current);
+#endif
+}
+
+static __always_inline void irqentry_kseq_pop(struct irqentry_state *state)
+{
+#ifdef CONFIG_KSEQ_SCHED
+ *kseq_sched_ptr(current) = state->kseq_sched;
+#endif
+}
+
/**
* irqentry_enter - Handle state tracking on ordinary interrupt entries
* @regs: Pointer to pt_regs of interrupted context
diff --git a/include/linux/kseq.h b/include/linux/kseq.h
new file mode 100644
index 000000000000..a8bfdbdedb6f
--- /dev/null
+++ b/include/linux/kseq.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KSEQ_H
+#define _LINUX_KSEQ_H
+
+#include <linux/ptrace.h>
+
+/*
+ * Kernel restartable SEQuence.
+ *
+ * Pseudo code; this is expected to be used in assembler:
+ *
+ * static const struct kseq _R = {
+ * .begin = &&__kseq_begin,
+ * .commit = &&__kseq_commit,
+ * .restart = &&__kseq_begin, // simply retry
+ * };
+ *
+ * __kseq_begin:
+ * WRITE_ONCE(->kseq, &_R); // section active
+ * addr = raw_cpu_ptr(pcp);
+ * v = READ_ONCE(*addr);
+ * v $OP i;
+ * __kseq_commit:
+ * WRITE_ONCE(*addr, v);
+ * WRITE_ONCE(->kseq, NULL); // section inactive
+ *
+ * NOTE: when .restart == .begin, it must be before writing the relevant kseq
+ * pointer, since hitting the restart will clear the pointer.
+ *
+ * NOTE: commit must be the STORE that closes the sequence; being restarted
+ * after this could result in the operation being performed twice, which
+ * is of course totally BAD(tm).
+ */
+struct kseq {
+ unsigned long begin;
+ unsigned long commit;
+ unsigned long restart;
+};
+
+static __always_inline void __restart_kernel_seq(struct kseq **kseq_ptr, struct pt_regs *regs)
+{
+ struct kseq *kseq = *kseq_ptr;
+ unsigned long ip;
+
+ if (!kseq)
+ return;
+
+ *kseq_ptr = NULL;
+
+ ip = instruction_pointer(regs);
+ if ((ip - kseq->begin) > (kseq->commit - kseq->begin))
+ return;
+
+ /*
+ * begin <= ip <= commit
+ */
+ instruction_pointer_set(regs, kseq->restart);
+}
+
+/*
+ * CONFIG_KSEQ_SCHED when set, shall provide:
+ * struct kseq **kseq_sched_ptr(struct task_struct *);
+ *
+ * CONFIG_KSEQ_IRQ when set, shall provide:
+ * struct kseq **kseq_irq_ptr(void);
+ *
+ * Both these functions shall provide an arch specific address for the
+ * respective kseq pointer.
+ */
+#if defined(CONFIG_KSEQ_SCHED) || defined(CONFIG_KSEQ_IRQ)
+#include <asm/kseq.h>
+#endif
+
+#endif /* _LINUX_KSEQ_H */
+
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9ef63e414791..376a7039152e 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -7,6 +7,7 @@
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/tick.h>
+#include <linux/kseq.h>

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -103,6 +104,13 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
}
}

+static __always_inline void irqentry_kseq(struct pt_regs *regs)
+{
+#ifdef CONFIG_KSEQ_IRQ
+ __restart_kernel_seq(kseq_irq_ptr(), regs);
+#endif
+}
+
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
@@ -149,6 +157,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
+ irqentry_kseq_push(&ret);
+ irqentry_kseq(regs);
instrumentation_end();

ret.exit_rcu = true;
@@ -166,6 +176,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
+ irqentry_kseq_push(&ret);
+ irqentry_kseq(regs);
instrumentation_end();

return ret;
@@ -218,6 +230,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
if (user_mode(regs)) {
irqentry_exit_to_user_mode(regs);
} else if (!regs_irqs_disabled(regs)) {
+ irqentry_kseq_pop(&state);
/*
* If RCU was not watching on entry this needs to be done
* carefully and needs the same ordering of lockdep/tracing
@@ -242,6 +255,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
trace_hardirqs_on();
instrumentation_end();
} else {
+ irqentry_kseq_pop(&state);
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
@@ -266,6 +280,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
+ if (!user_mode(regs)) {
+ irqentry_kseq_push(&irq_state);
+ irqentry_kseq(regs);
+ }
instrumentation_end();

return irq_state;
@@ -274,6 +292,8 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
instrumentation_begin();
+ if (!user_mode(regs))
+ irqentry_kseq_pop(&irq_state);
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..b51f41797fe0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,6 +69,7 @@
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#include <linux/livepatch_sched.h>
+#include <linux/kseq.h>

#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_IRQ_ENTRY
@@ -5087,6 +5088,19 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
prepare_arch_switch(next);
}

+/*
+ * Must be called after context switch, but before finish_task(), which will
+ * allow wakeup and scheduling on another CPU.
+ *
+ * This ensures task_pt_regs() is filled out and stable.
+ */
+static inline void kseq_sched(struct task_struct *prev)
+{
+#ifdef CONFIG_KSEQ_SCHED
+ __restart_kernel_seq(kseq_sched_ptr(prev), task_pt_regs(prev));
+#endif
+}
+
/**
* finish_task_switch - clean up after a task-switch
* @prev: the thread we just switched away from.
@@ -5145,6 +5159,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ kseq_sched(prev);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq);