[tip: timers/core] timers/migration: Split per-capacity hierarchies
From: tip-bot2 for Frederic Weisbecker
Date: Wed May 06 2026 - 03:17:13 EST
The following commit has been merged into the timers/core branch of tip:
Commit-ID: 098cbaad8e573cf6cac9e68e7ca2e7b7363d2434
Gitweb: https://git.kernel.org/tip/098cbaad8e573cf6cac9e68e7ca2e7b7363d2434
Author: Frederic Weisbecker <frederic@xxxxxxxxxx>
AuthorDate: Thu, 23 Apr 2026 18:53:52 +02:00
Committer: Thomas Gleixner <tglx@xxxxxxxxxx>
CommitterDate: Wed, 06 May 2026 08:33:07 +02:00
timers/migration: Split per-capacity hierarchies
Systems with heterogeneous CPU capacities, such as big.LITTLE, have
reported power issues since the introduction of the new timer migration
code.
Timers migrate from small capacity CPUs to big ones, preventing the big
CPUs from reaching their idle states' target residency and thus
degrading overall power consumption.
Solve this by splitting the hierarchy per CPU capacity. For example, on
a big.LITTLE machine, split the single hierarchy in two: one for the big
capacity CPUs and another for the small capacity CPUs. This way, global
timers only migrate across CPUs of the same capacity.
For simplicity, split hierarchies keep the same number of possible
levels as if there were a single hierarchy, even though the CPUs are
distributed among multiple hierarchies. This could be a problem on NUMA
systems with heterogeneous CPU capacities (should such a system ever
exist), where useless intermediate nodes may be created. Solving this
properly would require knowing in advance, at boot, how many capacities
are available and the number of CPUs for each of them.
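For illustration only (not part of the patch): a minimal, self-contained
userspace sketch of the bucketing idea. Every CPU is mapped to the
hierarchy whose key equals its capacity, so a big.LITTLE machine
naturally ends up with two hierarchies. The capacity values and the
toy_* names are made up for the example.

#include <stdio.h>
#include <stdlib.h>

struct toy_hierarchy {
	unsigned long capacity;		/* key, like arch_scale_cpu_capacity() */
	unsigned int nr_cpus;		/* CPUs bucketed so far */
	struct toy_hierarchy *next;	/* simple singly linked list */
};

static struct toy_hierarchy *toy_list;

/* Look up the hierarchy for @capacity, creating it on first use */
static struct toy_hierarchy *toy_get_hierarchy(unsigned long capacity)
{
	struct toy_hierarchy *h;

	for (h = toy_list; h; h = h->next)
		if (h->capacity == capacity)
			return h;

	h = calloc(1, sizeof(*h));
	if (!h)
		return NULL;
	h->capacity = capacity;
	h->next = toy_list;
	toy_list = h;
	return h;
}

int main(void)
{
	/* Hypothetical big.LITTLE: 4 little CPUs (cap 381), 4 big (cap 1024) */
	unsigned long cap[8] = { 381, 381, 381, 381, 1024, 1024, 1024, 1024 };

	for (int cpu = 0; cpu < 8; cpu++)
		toy_get_hierarchy(cap[cpu])->nr_cpus++;

	for (struct toy_hierarchy *h = toy_list; h; h = h->next)
		printf("capacity %lu: %u CPUs\n", h->capacity, h->nr_cpus);
	return 0;
}

With two distinct capacity values, the list ends up holding exactly two
entries of 4 CPUs each; global timers would then only ever roam within
one of those buckets.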
Reported-by: Sehee Jeong <sehee1.jeong@xxxxxxxxxxx>
Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxx>
Link: https://patch.msgid.link/20260423165354.95152-5-frederic@xxxxxxxxxx
---
kernel/time/timer_migration.c | 123 ++++++++++++++++++++++++---------
kernel/time/timer_migration.h | 7 ++-
2 files changed, 100 insertions(+), 30 deletions(-)
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index a68b9c7..03ae8c7 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -417,7 +417,7 @@
static DEFINE_MUTEX(tmigr_mutex);
-static struct tmigr_hierarchy *hierarchy;
+static LIST_HEAD(tmigr_hierarchy_list);
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
@@ -1889,6 +1889,12 @@ static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
+ } else if (start) {
+ union tmigr_state state;
+
+ /* Remote activation assumes the whole target's hierarchy is inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
/* Root update */
@@ -1907,34 +1913,78 @@ out:
return err;
}
-static struct tmigr_hierarchy *tmigr_get_hierarchy(void)
+static struct tmigr_hierarchy *tmigr_get_hierarchy(unsigned int capacity)
{
- if (hierarchy)
- return hierarchy;
+ struct tmigr_hierarchy *hier = NULL, *iter;
+
+ list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+ if (iter->capacity == capacity)
+ hier = iter;
+ }
+
+ if (hier)
+ return hier;
- hierarchy = kzalloc(sizeof(*hierarchy), GFP_KERNEL);
- if (!hierarchy)
+ hier = kzalloc(sizeof(*hier), GFP_KERNEL);
+ if (!hier)
return ERR_PTR(-ENOMEM);
- hierarchy->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!hierarchy->cpumask)
+ hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!hier->cpumask)
goto err;
- hierarchy->level_list = kzalloc_objs(struct list_head, tmigr_hierarchy_levels);
- if (!hierarchy->level_list)
+ hier->level_list = kzalloc_objs(struct list_head, tmigr_hierarchy_levels);
+ if (!hier->level_list)
goto err;
for (int i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&hierarchy->level_list[i]);
+ INIT_LIST_HEAD(&hier->level_list[i]);
- return hierarchy;
+ hier->capacity = capacity;
+ list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+ return hier;
err:
- kfree(hierarchy->cpumask);
- kfree(hierarchy);
- hierarchy = NULL;
+ kfree(hier->cpumask);
+ kfree(hier);
return ERR_PTR(-ENOMEM);
}
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+ struct tmigr_group *old_root, bool activate)
+{
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (regardless of whether the old top level group
+ * is active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == smp_processor_id());
+ if (activate) {
+ /*
+ * The current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+ struct tmigr_hierarchy *hier;
+ int cpu = smp_processor_id();
+
+ hier = tmigr_get_hierarchy(arch_scale_cpu_capacity(cpu));
+ if (IS_ERR(hier))
+ return PTR_ERR(hier);
+
+ return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
struct tmigr_hierarchy *hier;
@@ -1944,7 +1994,7 @@ static int tmigr_add_cpu(unsigned int cpu)
guard(mutex)(&tmigr_mutex);
- hier = tmigr_get_hierarchy();
+ hier = tmigr_get_hierarchy(arch_scale_cpu_capacity(cpu));
if (IS_ERR(hier))
return PTR_ERR(hier);
@@ -1957,20 +2007,33 @@ static int tmigr_add_cpu(unsigned int cpu)
/* Root has changed? Connect the old one to the new */
if (old_root && old_root != hier->root) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
- /*
- * The (likely) current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, true);
+ guard(migrate)();
+
+ if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+ /*
+ * If the target belongs to the same hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+ } else {
+ int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /*
+ * If the target doesn't belong to the same hierarchy as the current
+ * CPU, activate from a relevant one to make sure the old root is
+ * active.
+ */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /*
+ * No other available CPUs in the remote hierarchy. Link the
+ * old root remotely but don't propagate activation since the
+ * old root is not expected to be active.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+ }
+ }
}
if (ret >= 0)
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 0cfbb8d..291bfb6 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -7,14 +7,21 @@
/**
* struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ * Homogeneous systems have only one hierarchy.
+ * Heterogeneous systems have one hierarchy per CPU capacity.
* @level_list: Per level lists of tmigr groups
* @cpumask: CPUs belonging to this hierarchy
* @root: The current root of the hierarchy
+ * @capacity: CPU capacity associated with this hierarchy
+ * @node: Node in the global hierarchy list
*/
struct tmigr_hierarchy {
struct list_head *level_list;
struct cpumask *cpumask;
struct tmigr_group *root;
+ unsigned long capacity;
+ struct list_head node;
+
};
/**
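For reference (a condensation of the patch above, not a separate
implementation): the old-root reconnection logic added to
tmigr_add_cpu() boils down to three cases, shown here with the same
identifiers and with error handling elided.

/*
 * Condensed view of the three reconnection cases in the new
 * tmigr_add_cpu() hunk, run under guard(migrate)():
 */
if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
	/*
	 * 1) The current CPU is in the new hierarchy: the old root is
	 *    active here, so link it and propagate the activation.
	 */
	ret = tmigr_connect_old_root(hier, cpu, old_root, true);
} else {
	int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);

	if (target < nr_cpu_ids) {
		/*
		 * 2) Another CPU of that hierarchy is available: do the
		 *    connection there via work_on_cpu(), so activation
		 *    runs on a CPU that actually sits under the old root.
		 */
		ret = work_on_cpu(target, connect_old_root_work, old_root);
	} else {
		/*
		 * 3) No available CPU in that hierarchy: link the old
		 *    root without propagating activation, since it
		 *    cannot be active.
		 */
		ret = tmigr_connect_old_root(hier, cpu, old_root, false);
	}
}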