[PATCH v1 3/7] sched/isolation: Adjust affinity of hrtimers according to change of housekeeping cpumask

From: Costa Shulyupin
Date: Thu May 16 2024 - 15:13:49 EST


Adjust the affinity of hrtimers according to changes of
housekeeping.cpumasks[HK_TYPE_TIMER].

The new function migrate_hrtimer_list_except() is derived from
migrate_hrtimer_list() and is more generic: it migrates all hrtimers
of a clock base except one designated timer.

Potentially it can replace migrate_hrtimer_list() outright.
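
For example, migrate_hrtimer_list() could then become a thin wrapper
(a sketch only, not part of this patch; a NULL 'except' argument
excludes nothing):

static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				 struct hrtimer_clock_base *new_base)
{
	/* NULL matches no timer, so every timer is migrated. */
	migrate_hrtimer_list_except(old_base, new_base, NULL);
}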

Function hrtimers_resettle_from_cpu() is closely modeled on
hrtimers_cpu_dying(). It calls local_irq_disable() itself because
hrtimers_cpu_dying() normally runs with interrupts already disabled:
cpuhp_thread_fun() disables them before calling
cpuhp_invoke_callback().

Core test snippets, without the surrounding test infrastructure:

1. Create an hrtimer on a specific CPU:

set_cpus_allowed_ptr(current, cpumask_of(test_cpu));
hrtimer_init(&test_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
test_hrtimer.function = test_hrtimer_cb;
/* Far-future expiry so the timer is still queued when it is migrated. */
hrtimer_start(&test_hrtimer, ktime_set(3600, 0), HRTIMER_MODE_REL);
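
The snippet assumes a static timer and a trivial callback along these
lines (test_hrtimer and test_hrtimer_cb are test-only names, not part
of this patch):

static struct hrtimer test_hrtimer;

static enum hrtimer_restart test_hrtimer_cb(struct hrtimer *timer)
{
	return HRTIMER_NORESTART;
}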

2. Call housekeeping_update() with a HK_TYPE_TIMER cpumask that
excludes test_cpu.
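
A possible mask construction for this step. housekeeping_update() is
assumed here to take the housekeeping type and the new mask; the
actual prototype introduced earlier in this series may differ:

cpumask_var_t mask;

if (!alloc_cpumask_var(&mask, GFP_KERNEL))
	return;
/* Drop test_cpu from the timer housekeeping mask. */
cpumask_andnot(mask, housekeeping_cpumask(HK_TYPE_TIMER),
	       cpumask_of(test_cpu));
housekeeping_update(HK_TYPE_TIMER, mask);	/* assumed prototype */
free_cpumask_var(mask);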

3. Verify that only tick_nohz_handler is left on the specified CPU,
either by inspecting /proc/timer_list manually or with a script (the
awk filter prints only the block that belongs to "cpu: $test_cpu"):

grep -E 'cpu| #[0-9]' /proc/timer_list | \
awk "/cpu:/{y=0};/cpu: $test_cpu\$/{y=1};y"
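
With the test timer migrated away, the filtered output for test_cpu
(here 3) should show only the tick timer, roughly as follows (pointer
and state values are illustrative):

cpu: 3
 #0: <00000000af8d2b14>, tick_nohz_handler, S:01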

An alternative approach to migrating the hrtimers:
1. Use cpuhp to take the CPU's tick sched_timer offline.
2. Resettle all hrtimers, as migrate_hrtimer_list() does.
3. Use cpuhp to bring the tick sched_timer back online.

Signed-off-by: Costa Shulyupin <costa.shul@xxxxxxxxxx>
---
 include/linux/hrtimer.h  |  2 +
 kernel/sched/isolation.c |  2 +
 kernel/time/hrtimer.c    | 81 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index aa1e65ccb6158..004632fc7d643 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -381,8 +381,10 @@ extern void sysrq_timer_list_show(void);
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
+void hrtimers_resettle_from_cpu(unsigned int cpu);
 #else
 #define hrtimers_cpu_dying NULL
+static inline void hrtimers_resettle_from_cpu(unsigned int cpu) { }
 #endif
 
 #endif
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3b63f0212887e..85a17d39d8bb0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -126,10 +126,12 @@ static void resettle_all_timers(cpumask_var_t enable_mask, cpumask_var_t disable

 	for_each_cpu(cpu, enable_mask) {
 		timers_prepare_cpu(cpu);
+		hrtimers_prepare_cpu(cpu);
 	}
 
 	for_each_cpu(cpu, disable_mask) {
 		timers_resettle_from_cpu(cpu);
+		hrtimers_resettle_from_cpu(cpu);
 	}
 }

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 492c14aac642b..7e71ebbb72348 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2201,6 +2201,87 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }

+/*
+ * migrate_hrtimer_list_except - migrates hrtimers from one base to another,
+ * except the specified one.
+ */
+static void migrate_hrtimer_list_except(struct hrtimer_clock_base *old_base,
+		struct hrtimer_clock_base *new_base, struct hrtimer *except)
+{
+	struct hrtimer *timer;
+	struct timerqueue_node *node;
+
+	node = timerqueue_getnext(&old_base->active);
+	while (node) {
+		timer = container_of(node, struct hrtimer, node);
+		node = timerqueue_iterate_next(node);
+		if (timer == except)
+			continue;
+
+		BUG_ON(hrtimer_callback_running(timer));
+		debug_deactivate(timer);
+
+		/*
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+		timer->base = new_base;
+		/*
+		 * Enqueue the timers on the new cpu. This does not
+		 * reprogram the event device in case the timer
+		 * expires before the earliest on this CPU, but we run
+		 * hrtimer_interrupt after we migrated everything to
+		 * sort out already expired timers and reprogram the
+		 * event device.
+		 */
+		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+	}
+}
+
+/**
+ * hrtimers_resettle_from_cpu - resettle hrtimers from a given CPU
+ * @isol_cpu: CPU whose hrtimers are moved to a housekeeping CPU
+ */
+void hrtimers_resettle_from_cpu(unsigned int isol_cpu)
+{
+	int ncpu, i;
+	struct tick_sched *ts = tick_get_tick_sched(isol_cpu);
+	struct hrtimer_cpu_base *old_base, *new_base;
+
+	local_irq_disable();
+	ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+	old_base = &per_cpu(hrtimer_bases, isol_cpu);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
+
+	/*
+	 * The caller is globally serialized and nobody else
+	 * takes two locks at once, deadlock is not possible.
+	 */
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
+	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+		migrate_hrtimer_list_except(&old_base->clock_base[i],
+					    &new_base->clock_base[i],
+					    &ts->sched_timer);
+	}
+
+	/*
+	 * The migration might have changed the first expiring softirq
+	 * timer on this CPU. Update it.
+	 */
+	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+
+	raw_spin_unlock(&new_base->lock);
+	raw_spin_unlock(&old_base->lock);
+	local_irq_enable();
+
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
+}
+
 int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
 	int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
--
2.45.0