[patch 35/48] entry: Prepare for deferred hrtimer rearming

From: Thomas Gleixner

Date: Tue Feb 24 2026 - 11:51:16 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

The hrtimer interrupt expires timers and at the end of the interrupt it
rearms the clockevent device for the next expiring timer.

That's obviously correct, but in the case that a expired timer sets
NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is
enabled then schedule() will modify the hrtick timer, which causes another
reprogramming of the hardware.

That can be avoided by deferring the rearming to the return from interrupt
path and if the return results in a immediate schedule() invocation then it
can be deferred until the end of schedule(), which avoids multiple rearms
and re-evaluation of the timer wheel.

As this is only relevant for interrupt to user return split the work masks
up and hand them in as arguments from the relevant exit to user functions,
which allows the compiler to optimize the deferred handling out for the
syscall exit to user case.

Add the rearm checks to the approritate places in the exit to user loop and
the interrupt return to kernel path, so that the rearming is always
guaranteed.

In the return to user space path this is handled in the same way as
TIF_RSEQ to avoid extra instructions in the fast path, which are truly
hurtful for device interrupt heavy work loads as the extra instructions and
conditionals while benign at first sight accumulate quickly into measurable
regressions. The return from syscall path is completely unaffected due to
the above mentioned split so syscall heavy workloads wont have any extra
burden.

For now this is just placing empty stubs at the right places which are all
optimized out by the compiler until the actual functionality is in place.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
---
tglx: Split out to make it simpler to review and to make cross subsystem
merge logistics trivial.
---
include/linux/irq-entry-common.h | 25 +++++++++++++++++++------
include/linux/rseq_entry.h | 16 +++++++++++++---
kernel/entry/common.c | 4 +++-
3 files changed, 35 insertions(+), 10 deletions(-)

--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -3,6 +3,7 @@
#define __LINUX_IRQENTRYCOMMON_H

#include <linux/context_tracking.h>
+#include <linux/hrtimer_rearm.h>
#include <linux/kmsan.h>
#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
@@ -33,6 +34,14 @@
_TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
ARCH_EXIT_TO_USER_MODE_WORK)

+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
+# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM)
+#else
+# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
+# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK)
+#endif
+
/**
* arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
* @regs: Pointer to currents pt_regs
@@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(str
/**
* __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
* @regs: Pointer to pt_regs on entry stack
+ * @work_mask: Which TIF bits need to be evaluated
*
* 1) check that interrupts are disabled
* 2) call tick_nohz_user_enter_prepare()
@@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(str
*
* Don't invoke directly, use the syscall/irqentry_ prefixed variants below
*/
-static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs,
+ const unsigned long work_mask)
{
unsigned long ti_work;

@@ -222,8 +233,10 @@ static __always_inline void __exit_to_us
tick_nohz_user_enter_prepare();

ti_work = read_thread_flags();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ if (unlikely(ti_work & work_mask)) {
+ if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask))
+ ti_work = exit_to_user_mode_loop(regs, ti_work);
+ }

arch_exit_to_user_mode_prepare(regs, ti_work);
}
@@ -239,7 +252,7 @@ static __always_inline void __exit_to_us
/* Temporary workaround to keep ARM64 alive */
static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK);
rseq_exit_to_user_mode_legacy();
__exit_to_user_mode_validate();
}
@@ -253,7 +266,7 @@ static __always_inline void exit_to_user
*/
static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL);
rseq_syscall_exit_to_user_mode();
__exit_to_user_mode_validate();
}
@@ -267,7 +280,7 @@ static __always_inline void syscall_exit
*/
static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ);
rseq_irqentry_exit_to_user_mode();
__exit_to_user_mode_validate();
}
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
+#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
@@ -110,7 +111,7 @@ static __always_inline void rseq_slice_c
t->rseq.slice.state.granted = false;
}

-static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
struct task_struct *curr = current;
struct rseq_slice_ctrl usr_ctrl;
@@ -215,11 +216,20 @@ static __always_inline bool rseq_grant_s
return false;
}

+static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
+{
+ if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
+ hrtimer_rearm_deferred_tif(ti_work);
+ return true;
+ }
+ return false;
+}
+
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline bool rseq_slice_extension_enabled(void) { return false; }
static inline bool rseq_arm_slice_extension_timer(void) { return false; }
static inline void rseq_slice_clear_grant(struct task_struct *t) { }
-static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
-static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -50,7 +50,7 @@ static __always_inline unsigned long __e
local_irq_enable_exit_to_user(ti_work);

if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
- if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY))
schedule();
}

@@ -225,6 +225,7 @@ noinstr void irqentry_exit(struct pt_reg
*/
if (state.exit_rcu) {
instrumentation_begin();
+ hrtimer_rearm_deferred();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
@@ -238,6 +239,7 @@ noinstr void irqentry_exit(struct pt_reg
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();

+ hrtimer_rearm_deferred();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();