Re: [PATCH v4 1/2] sched: Create architecture specific sched domain distances
From: Chen, Yu C
Date: Mon Sep 29 2025 - 22:29:07 EST
On 9/30/2025 6:18 AM, Tim Chen wrote:
On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
[snip]
@@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
enum numa_topology_type sched_numa_topology_type;
static int sched_domains_numa_levels;
+static int sched_numa_node_levels;
I agree that the benefit of maintaining two NUMA distances - one for the
sched_domain and another for the NUMA balancing/page allocation policy - is
to avoid complicating the sched_domain hierarchy while preserving the
advantages of NUMA locality.
Meanwhile, I wonder if we could also add a "orig" prefix to the original
NUMA distance. This way, we can quickly understand its meaning later.
For example,
sched_orig_node_levels
sched_orig_node_distance
I am not sure adding orig will make the meaning any clearer.
I can add comments to note that
sched_numa_node_distance mean the node distance between numa nodes
sched_numa_nodel_levels mean the number of unique distances between numa nodes
OK, looks good to me.
static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif /* CONFIG_NUMA */
@@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
+ distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(int offline_node)
+/*
+ * An architecture could modify its NUMA distance, to change
+ * grouping of NUMA nodes and number of NUMA levels when creating
+ * NUMA level sched domains.
+ *
+ * A NUMA level is created for each unique
+ * arch_sched_node_distance.
+ */
+static bool __modified_sched_node_dist = true;
+
+int __weak arch_sched_node_distance(int from, int to)
{
- struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ if (__modified_sched_node_dist)
+ __modified_sched_node_dist = false;
+
+ return node_distance(from, to);
+}
+
+static bool modified_sched_node_distance(void)
+{
+ /*
+ * Call arch_sched_node_distance()
+ * to determine if arch_sched_node_distance
+ * has been modified from node_distance()
+ * to arch specific distance.
+ */
+ arch_sched_node_distance(0, 0);
+ return __modified_sched_node_dist;
+}
+
If our goal is to figure out whether the arch_sched_node_distance()
has been overridden, how about the following alias?
int __weak arch_sched_node_distance(int from, int to)
{
return __node_distance(from, to);
}
int arch_sched_node_distance_original(int from, int to) __weak
__alias(arch_sched_node_distance);
static bool arch_sched_node_distance_is_overridden(void)
{
return arch_sched_node_distance != arch_sched_node_distance_original;
}
so arch_sched_node_distance_is_overridden() can replace
modified_sched_node_distance()
I think that the alias version will still point to the replaced function and not
the originally defined one.
How about not using __weak and just explicitly define arch_sched_node_distance
as a function pointer. Change the code like below.
The arch_sched_node_distance_original is defined as __weak, so it
should point to the old function even if the function has been
overridden. I did a test on a X86 VM and it seems to be so.
But using the arch_sched_node_distance as a function point
should also be OK.
Tim
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d6b772990ec2..12db78af09d5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
return sched_avg_remote_distance;
}
-int arch_sched_node_distance(int from, int to)
+static int x86_arch_sched_node_distance(int from, int to)
{
int d = node_distance(from, to);
@@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
if (apic->wakeup_secondary_cpu_64)
start_ip = real_mode_header->trampoline_start64;
+#endif
+#ifdef CONFIG_NUMA
+ arch_sched_node_distance = x86_arch_sched_node_distance;
#endif
Above might be called for several APs, maybe we can just call it
once in smp_prepare_cpus_common().
thanks,
Chenyu
idle->thread.sp = (unsigned long)task_pt_regs(idle);
initial_code = (unsigned long)start_secondary;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2d2d29553df8..3549c4a19816 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
{
return SD_NUMA;
}
-extern int arch_sched_node_distance(int from, int to);
+extern int (*arch_sched_node_distance)(int, int);
#endif
extern int arch_asym_cpu_priority(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25e4402c63e..7cfb7422e9d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
* A NUMA level is created for each unique
* arch_sched_node_distance.
*/
-static bool __modified_sched_node_dist = true;
-int __weak arch_sched_node_distance(int from, int to)
+static int default_sched_node_distance(int from, int to)
{
- if (__modified_sched_node_dist)
- __modified_sched_node_dist = false;
-
return node_distance(from, to);
}
+int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
+
static bool modified_sched_node_distance(void)
{
- /*
- * Call arch_sched_node_distance()
- * to determine if arch_sched_node_distance
- * has been modified from node_distance()
- * to arch specific distance.
- */
- arch_sched_node_distance(0, 0);
- return __modified_sched_node_dist;
+ return arch_sched_node_distance != default_sched_node_distance;
}
static int numa_node_dist(int i, int j)