Re: [patch 35/48] entry: Prepare for deferred hrtimer rearming

From: Christian Loehle

Date: Fri Feb 27 2026 - 11:03:26 EST


On 2/24/26 16:38, Thomas Gleixner wrote:
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>
> The hrtimer interrupt expires timers and at the end of the interrupt it
> rearms the clockevent device for the next expiring timer.
>
> That's obviously correct, but in the case that an expired timer sets
> NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is
> enabled then schedule() will modify the hrtick timer, which causes another
> reprogramming of the hardware.
>
> That can be avoided by deferring the rearming to the return from interrupt
> path and if the return results in an immediate schedule() invocation then it
> can be deferred until the end of schedule(), which avoids multiple rearms
> and re-evaluation of the timer wheel.
>
> As this is only relevant for interrupt to user return split the work masks
> up and hand them in as arguments from the relevant exit to user functions,
> which allows the compiler to optimize the deferred handling out for the
> syscall exit to user case.
>
> Add the rearm checks to the appropriate places in the exit to user loop and
> the interrupt return to kernel path, so that the rearming is always
> guaranteed.
>
> In the return to user space path this is handled in the same way as
> TIF_RSEQ to avoid extra instructions in the fast path, which are truly
> hurtful for device interrupt heavy workloads as the extra instructions and
> conditionals while benign at first sight accumulate quickly into measurable
> regressions. The return from syscall path is completely unaffected due to
> the above mentioned split so syscall heavy workloads won't have any extra
> burden.
>
> For now this is just placing empty stubs at the right places which are all
> optimized out by the compiler until the actual functionality is in place.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
> ---
> tglx: Split out to make it simpler to review and to make cross subsystem
> merge logistics trivial.
> ---
> include/linux/irq-entry-common.h | 25 +++++++++++++++++++------
> include/linux/rseq_entry.h | 16 +++++++++++++---
> kernel/entry/common.c | 4 +++-
> 3 files changed, 35 insertions(+), 10 deletions(-)
>
> --- a/include/linux/irq-entry-common.h
> +++ b/include/linux/irq-entry-common.h
> @@ -3,6 +3,7 @@
> #define __LINUX_IRQENTRYCOMMON_H
>
> #include <linux/context_tracking.h>
> +#include <linux/hrtimer_rearm.h>
> #include <linux/kmsan.h>
> #include <linux/rseq_entry.h>
> #include <linux/static_call_types.h>
> @@ -33,6 +34,14 @@
> _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> +#ifdef CONFIG_HRTIMER_REARM_DEFERRED
> +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
> +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM)
> +#else
> +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK)
> +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK)
> +#endif
> +
> /**
> * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
> * @regs: Pointer to currents pt_regs
> @@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(str
> /**
> * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
> * @regs: Pointer to pt_regs on entry stack
> + * @work_mask: Which TIF bits need to be evaluated
> *
> * 1) check that interrupts are disabled
> * 2) call tick_nohz_user_enter_prepare()
> @@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(str
> *
> * Don't invoke directly, use the syscall/irqentry_ prefixed variants below
> */
> -static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
> +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs,
> + const unsigned long work_mask)
> {
> unsigned long ti_work;
>
> @@ -222,8 +233,10 @@ static __always_inline void __exit_to_us
> tick_nohz_user_enter_prepare();
>
> ti_work = read_thread_flags();
> - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> - ti_work = exit_to_user_mode_loop(regs, ti_work);
> + if (unlikely(ti_work & work_mask)) {
> + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask))
> + ti_work = exit_to_user_mode_loop(regs, ti_work);
> + }
>
> arch_exit_to_user_mode_prepare(regs, ti_work);
> }
> @@ -239,7 +252,7 @@ static __always_inline void __exit_to_us
> /* Temporary workaround to keep ARM64 alive */
> static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
> {
> - __exit_to_user_mode_prepare(regs);
> + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK);

Should this also be EXIT_TO_USER_MODE_WORK_IRQ?
I guess it doesn't really matter for now (since arm64 doesn't have the generic entry
path and generic TIF bits yet and therefore HRTIMER_REARM_DEFERRED=n), but I've been
playing around with this series, the generic entry series
https://lore.kernel.org/lkml/20260203133728.848283-1-ruanjinjie@xxxxxxxxxx
(and using generic TIF bits) and noticed this.


> rseq_exit_to_user_mode_legacy();
> __exit_to_user_mode_validate();
> }
> @@ -253,7 +266,7 @@ static __always_inline void exit_to_user
> */
> static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - __exit_to_user_mode_prepare(regs);
> + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL);
> rseq_syscall_exit_to_user_mode();
> __exit_to_user_mode_validate();
> }
> @@ -267,7 +280,7 @@ static __always_inline void syscall_exit
> */
> static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - __exit_to_user_mode_prepare(regs);
> + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ);
> rseq_irqentry_exit_to_user_mode();
> __exit_to_user_mode_validate();
> [snip]