[PATCH v6] hrtimer: avoid retrigger_next_event IPI

From: Marcelo Tosatti
Date: Mon Apr 19 2021 - 15:40:16 EST



Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only the realtime, TAI and boottime clocks have their offsets
updated (and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME, CLOCK_TAI and
CLOCK_BOOTTIME bases. If that's not the case, update that CPU's base
offsets remotely and skip the IPI. This ensures that any timers
subsequently armed on those clocks are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

---

v6:
- Do not take softirq_raised into account (Peter Xu).
- Include BOOTTIME as base that requires IPI (Thomas).
- Unconditionally reprogram on the resume path, since there is
nothing to gain from skipping the IPI on that path anyway.

v5:
- Add missing hrtimer_update_base (Peter Xu).

v4:
- Drop unused code (Thomas).

v3:
- Nicer changelog (Thomas).
- Code style fixes (Thomas).
- Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
- Shrink preemption disabled section (Thomas).

v2:
- Only REALTIME and TAI bases are affected by offset-to-monotonic changes (Thomas).
- Don't special case nohz_full CPUs (Thomas).


diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a4274..14a6e449b221 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -318,7 +318,7 @@ struct clock_event_device;

extern void hrtimer_interrupt(struct clock_event_device *dev);

-extern void clock_was_set_delayed(void);
+extern void clock_was_set_delayed(bool force_reprogram);

extern unsigned int hrtimer_resolution;

@@ -326,7 +326,7 @@ extern unsigned int hrtimer_resolution;

#define hrtimer_resolution (unsigned int)LOW_RES_NSEC

-static inline void clock_was_set_delayed(void) { }
+static inline void clock_was_set_delayed(bool force_reprogram) { }

#endif

@@ -351,7 +351,7 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
timer->base->get_time());
}

-extern void clock_was_set(void);
+extern void clock_was_set(bool);
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
#else
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..2258782fd714 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,17 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
}

+static void clock_was_set_force_reprogram_work(struct work_struct *work)
+{
+ clock_was_set(true);
+}
+
+static DECLARE_WORK(hrtimer_force_reprogram_work, clock_was_set_force_reprogram_work);
+
+
static void clock_was_set_work(struct work_struct *work)
{
- clock_was_set();
+ clock_was_set(false);
}

static DECLARE_WORK(hrtimer_work, clock_was_set_work);
@@ -769,9 +777,12 @@ static DECLARE_WORK(hrtimer_work, clock_was_set_work);
* Called from timekeeping and resume code to reprogram the hrtimer
* interrupt device on all cpus.
*/
-void clock_was_set_delayed(void)
+void clock_was_set_delayed(bool force_reprogram)
{
- schedule_work(&hrtimer_work);
+ if (force_reprogram)
+ schedule_work(&hrtimer_force_reprogram_work);
+ else
+ schedule_work(&hrtimer_work);
}

#else
@@ -871,6 +882,18 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
tick_program_event(expires, 1);
}

+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) | \
+ (1U << HRTIMER_BASE_REALTIME_SOFT) | \
+ (1U << HRTIMER_BASE_TAI) | \
+ (1U << HRTIMER_BASE_TAI_SOFT) | \
+ (1U << HRTIMER_BASE_BOOTTIME) | \
+ (1U << HRTIMER_BASE_BOOTTIME_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+ return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
/*
* Clock realtime was set
*
@@ -882,11 +905,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
* resolution timer interrupts. On UP we just disable interrupts and
* call the high resolution interrupt code.
*/
-void clock_was_set(void)
+void clock_was_set(bool force_reprogram)
{
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
+ cpumask_var_t mask;
+ int cpu;
+
+ if (force_reprogram == true) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto set_timerfd;
+ }
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto set_timerfd;
+ }
+
+ /* Avoid interrupting CPUs if possible */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ unsigned long flags;
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ if (need_reprogram_timer(cpu_base))
+ cpumask_set_cpu(cpu, mask);
+ else
+ hrtimer_update_base(cpu_base);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ }
+
+ preempt_disable();
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+ preempt_enable();
+ cpus_read_unlock();
+ free_cpumask_var(mask);
+set_timerfd:
#endif
timerfd_clock_was_set();
}
@@ -903,7 +957,7 @@ void hrtimers_resume(void)
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
/* And schedule a retrigger for all others */
- clock_was_set_delayed();
+ clock_was_set_delayed(true);
}

/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..3fef237267bd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1323,7 +1323,7 @@ int do_settimeofday64(const struct timespec64 *ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(false);

if (!ret)
audit_tk_injoffset(ts_delta);
@@ -1371,7 +1371,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(false);

return ret;
}
@@ -1736,7 +1736,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(true);
}
#endif

@@ -2187,7 +2187,7 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (clock_set)
/* Have to call _delayed version, since in irq context*/
- clock_was_set_delayed();
+ clock_was_set_delayed(false);
}

/**
@@ -2425,7 +2425,7 @@ int do_adjtimex(struct __kernel_timex *txc)
timekeeping_advance(TK_ADV_FREQ);

if (tai != orig_tai)
- clock_was_set();
+ clock_was_set(false);

ntp_notify_cmos_timer();