[tip: sched/hrtick] entry: Prepare for deferred hrtimer rearming
From: tip-bot2 for Peter Zijlstra
Date: Sat Feb 28 2026 - 10:40:11 EST
The following commit has been merged into the sched/hrtick branch of tip:
Commit-ID: 0e98eb14814ef669e07ca6effaa03df2e57ef956
Gitweb: https://git.kernel.org/tip/0e98eb14814ef669e07ca6effaa03df2e57ef956
Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate: Tue, 24 Feb 2026 17:38:03 +01:00
Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Fri, 27 Feb 2026 16:40:13 +01:00
entry: Prepare for deferred hrtimer rearming
The hrtimer interrupt expires timers and at the end of the interrupt it
rearms the clockevent device for the next expiring timer.
That's obviously correct, but in the case that an expired timer sets
NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is
enabled then schedule() will modify the hrtick timer, which causes another
reprogramming of the hardware.
That can be avoided by deferring the rearming to the return from interrupt
path and if the return results in an immediate schedule() invocation then it
can be deferred until the end of schedule(), which avoids multiple rearms
and re-evaluation of the timer wheel.
As this is only relevant for interrupt to user return, split the work masks
up and hand them in as arguments from the relevant exit to user functions,
which allows the compiler to optimize the deferred handling out for the
syscall exit to user case.
Add the rearm checks to the appropriate places in the exit to user loop and
the interrupt return to kernel path, so that the rearming is always
guaranteed.
In the return to user space path this is handled in the same way as
TIF_RSEQ to avoid extra instructions in the fast path, which are truly
hurtful for device interrupt heavy work loads as the extra instructions and
conditionals while benign at first sight accumulate quickly into measurable
regressions. The return from syscall path is completely unaffected due to
the above mentioned split so syscall heavy workloads won't have any extra
burden.
For now this is just placing empty stubs at the right places which are all
optimized out by the compiler until the actual functionality is in place.
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Link: https://patch.msgid.link/20260224163431.066469985@xxxxxxxxxx
---
include/linux/irq-entry-common.h | 25 +++++++++++++++++++------
include/linux/rseq_entry.h | 16 +++++++++++++---
kernel/entry/common.c | 4 +++-
3 files changed, 35 insertions(+), 10 deletions(-)
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d26d1b1..b976946 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -3,6 +3,7 @@
#define __LINUX_IRQENTRYCOMMON_H
#include <linux/context_tracking.h>
+#include <linux/hrtimer_rearm.h>
#include <linux/kmsan.h>
#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
@@ -33,6 +34,14 @@
_TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
ARCH_EXIT_TO_USER_MODE_WORK)
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
+# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM)
+#else
+# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
+# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK)
+#endif
+
/**
* arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
* @regs: Pointer to currents pt_regs
@@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work
/**
* __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
* @regs: Pointer to pt_regs on entry stack
+ * @work_mask: Which TIF bits need to be evaluated
*
* 1) check that interrupts are disabled
* 2) call tick_nohz_user_enter_prepare()
@@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work
*
* Don't invoke directly, use the syscall/irqentry_ prefixed variants below
*/
-static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs,
+ const unsigned long work_mask)
{
unsigned long ti_work;
@@ -222,8 +233,10 @@ static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
tick_nohz_user_enter_prepare();
ti_work = read_thread_flags();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ if (unlikely(ti_work & work_mask)) {
+ if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask))
+ ti_work = exit_to_user_mode_loop(regs, ti_work);
+ }
arch_exit_to_user_mode_prepare(regs, ti_work);
}
@@ -239,7 +252,7 @@ static __always_inline void __exit_to_user_mode_validate(void)
/* Temporary workaround to keep ARM64 alive */
static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK);
rseq_exit_to_user_mode_legacy();
__exit_to_user_mode_validate();
}
@@ -253,7 +266,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg
*/
static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL);
rseq_syscall_exit_to_user_mode();
__exit_to_user_mode_validate();
}
@@ -267,7 +280,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re
*/
static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- __exit_to_user_mode_prepare(regs);
+ __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ);
rseq_irqentry_exit_to_user_mode();
__exit_to_user_mode_validate();
}
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index cbc4a79..17956e1 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
#endif /* !CONFIG_RSEQ_STATS */
#ifdef CONFIG_RSEQ
+#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
@@ -110,7 +111,7 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
t->rseq.slice.state.granted = false;
}
-static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
struct task_struct *curr = current;
struct rseq_slice_ctrl usr_ctrl;
@@ -215,11 +216,20 @@ efault:
return false;
}
+static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
+{
+ if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
+ hrtimer_rearm_deferred_tif(ti_work);
+ return true;
+ }
+ return false;
+}
+
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline bool rseq_slice_extension_enabled(void) { return false; }
static inline bool rseq_arm_slice_extension_timer(void) { return false; }
static inline void rseq_slice_clear_grant(struct task_struct *t) { }
-static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
-static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9ef63e4..9e1a6af 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -50,7 +50,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
local_irq_enable_exit_to_user(ti_work);
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
- if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY))
schedule();
}
@@ -225,6 +225,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
*/
if (state.exit_rcu) {
instrumentation_begin();
+ hrtimer_rearm_deferred();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
@@ -238,6 +239,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
+ hrtimer_rearm_deferred();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();