RE: [PATCH 1/1] sched/topology: Make sched_init_numa() use a set for the deduplicating sort

From: Valentin Schneider
Date: Mon Jan 25 2021 - 22:15:27 EST


On 25/01/21 02:23, Song Bao Hua (Barry Song) wrote:

> with the below topology:
> qemu-system-aarch64 -M virt -nographic \
> -smp cpus=8 \
> -numa node,cpus=0-1,nodeid=0 \
> -numa node,cpus=2-3,nodeid=1 \
> -numa node,cpus=4-5,nodeid=2 \
> -numa node,cpus=6-7,nodeid=3 \
> -numa dist,src=0,dst=1,val=12 \
> -numa dist,src=0,dst=2,val=20 \
> -numa dist,src=0,dst=3,val=22 \
> -numa dist,src=1,dst=2,val=22 \
> -numa dist,src=2,dst=3,val=12 \
> -numa dist,src=1,dst=3,val=24 \
>
>
> The panic address is *1294:
>
> if (sdd->sd) {
> 1280: f9400e61 ldr x1, [x19, #24]
> 1284: b4000201 cbz x1, 12c4 <build_sched_domains+0x414>
> sd = *per_cpu_ptr(sdd->sd, j);
> 1288: 93407eb7 sxtw x23, w21
> 128c: aa0103e0 mov x0, x1
> 1290: f8777ac2 ldr x2, [x22, x23, lsl #3]
> *1294: f8626800 ldr x0, [x0, x2]
> if (sd && (sd->flags & SD_OVERLAP))
> 1298: b4000120 cbz x0, 12bc <build_sched_domains+0x40c>
> 129c: b9403803 ldr w3, [x0, #56]
> 12a0: 365800e3 tbz w3, #11, 12bc
> <build_sched_domains+0x40c>
> free_sched_groups(sd->groups, 0);
> 12a4: f9400800 ldr x0, [x0, #16]
> if (!sg)
>

Thanks for giving it a shot, let me run that with your topology and see
where I end up.

> Thanks
> Barry
>
>> ---
>> include/linux/topology.h | 1 +
>> kernel/sched/topology.c | 99 +++++++++++++++++++---------------------
>> 2 files changed, 49 insertions(+), 51 deletions(-)
>>
>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>> index ad03df1cc266..7634cd737061 100644
>> --- a/include/linux/topology.h
>> +++ b/include/linux/topology.h
>> @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void);
>> /* Conform to ACPI 2.0 SLIT distance definitions */
>> #define LOCAL_DISTANCE 10
>> #define REMOTE_DISTANCE 20
>> +#define DISTANCE_BITS 8
>> #ifndef node_distance
>> #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE :
>> REMOTE_DISTANCE)
>> #endif
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index 5d3675c7a76b..bf5c9bd10bfe 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
>> }
>> }
>>
>> +
>> +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>> +
>> void sched_init_numa(void)
>> {
>> - int next_distance, curr_distance = node_distance(0, 0);
>> struct sched_domain_topology_level *tl;
>> - int level = 0;
>> - int i, j, k;
>> -
>> - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1),
>> GFP_KERNEL);
>> - if (!sched_domains_numa_distance)
>> - return;
>> -
>> - /* Includes NUMA identity node at level 0. */
>> - sched_domains_numa_distance[level++] = curr_distance;
>> - sched_domains_numa_levels = level;
>> + unsigned long *distance_map;
>> + int nr_levels = 0;
>> + int i, j;
>>
>> /*
>> * O(nr_nodes^2) deduplicating selection sort -- in order to find the
>> * unique distances in the node_distance() table.
>> - *
>> - * Assumes node_distance(0,j) includes all distances in
>> - * node_distance(i,j) in order to avoid cubic time.
>> */
>> - next_distance = curr_distance;
>> + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
>> + if (!distance_map)
>> + return;
>> +
>> + bitmap_zero(distance_map, NR_DISTANCE_VALUES);
>> for (i = 0; i < nr_node_ids; i++) {
>> for (j = 0; j < nr_node_ids; j++) {
>> - for (k = 0; k < nr_node_ids; k++) {
>> - int distance = node_distance(i, k);
>> -
>> - if (distance > curr_distance &&
>> - (distance < next_distance ||
>> - next_distance == curr_distance))
>> - next_distance = distance;
>> -
>> - /*
>> - * While not a strong assumption it would be nice to know
>> - * about cases where if node A is connected to B, B is not
>> - * equally connected to A.
>> - */
>> - if (sched_debug() && node_distance(k, i) != distance)
>> - sched_numa_warn("Node-distance not symmetric");
>> + int distance = node_distance(i, j);
>>
>> - if (sched_debug() && i && !find_numa_distance(distance))
>> - sched_numa_warn("Node-0 not representative");
>> + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
>> + sched_numa_warn("Invalid distance value range");
>> + return;
>> }
>> - if (next_distance != curr_distance) {
>> - sched_domains_numa_distance[level++] = next_distance;
>> - sched_domains_numa_levels = level;
>> - curr_distance = next_distance;
>> - } else break;
>> +
>> + bitmap_set(distance_map, distance, 1);
>> }
>> + }
>> + /*
>> + * We can now figure out how many unique distance values there are and
>> + * allocate memory accordingly.
>> + */
>> + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
>>
>> - /*
>> - * In case of sched_debug() we verify the above assumption.
>> - */
>> - if (!sched_debug())
>> - break;
>> + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int),
>> GFP_KERNEL);
>> + if (!sched_domains_numa_distance) {
>> + bitmap_free(distance_map);
>> + return;
>> + }
>> +
>> + for (i = 0, j = 0; i < nr_levels; i++, j++) {
>> + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
>> + sched_domains_numa_distance[i] = j;
>> }
>>
>> + bitmap_free(distance_map);
>> +
>> /*
>> - * 'level' contains the number of unique distances
>> + * 'nr_levels' contains the number of unique distances
>> *
>> * The sched_domains_numa_distance[] array includes the actual distance
>> * numbers.
>> @@ -1664,15 +1656,15 @@ void sched_init_numa(void)
>> /*
>> * Here, we should temporarily reset sched_domains_numa_levels to 0.
>> * If it fails to allocate memory for array sched_domains_numa_masks[][],
>> - * the array will contain less then 'level' members. This could be
>> + * the array will contain less then 'nr_levels' members. This could be
>> * dangerous when we use it to iterate array sched_domains_numa_masks[][]
>> * in other functions.
>> *
>> - * We reset it to 'level' at the end of this function.
>> + * We reset it to 'nr_levels' at the end of this function.
>> */
>> sched_domains_numa_levels = 0;
>>
>> - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
>> + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels,
>> GFP_KERNEL);
>> if (!sched_domains_numa_masks)
>> return;
>>
>> @@ -1680,7 +1672,7 @@ void sched_init_numa(void)
>> * Now for each level, construct a mask per node which contains all
>> * CPUs of nodes that are that many hops away from us.
>> */
>> - for (i = 0; i < level; i++) {
>> + for (i = 0; i < nr_levels; i++) {
>> sched_domains_numa_masks[i] =
>> kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
>> if (!sched_domains_numa_masks[i])
>> @@ -1688,12 +1680,17 @@ void sched_init_numa(void)
>>
>> for (j = 0; j < nr_node_ids; j++) {
>> struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
>> + int k;
>> +
>> if (!mask)
>> return;
>>
>> sched_domains_numa_masks[i][j] = mask;
>>
>> for_each_node(k) {
>> + if (sched_debug() && (node_distance(j, k) != node_distance(k,
>> j)))
>> + sched_numa_warn("Node-distance not symmetric");
>> +
>> if (node_distance(j, k) > sched_domains_numa_distance[i])
>> continue;
>>
>> @@ -1705,7 +1702,7 @@ void sched_init_numa(void)
>> /* Compute default topology size */
>> for (i = 0; sched_domain_topology[i].mask; i++);
>>
>> - tl = kzalloc((i + level + 1) *
>> + tl = kzalloc((i + nr_levels) *
>> sizeof(struct sched_domain_topology_level), GFP_KERNEL);
>> if (!tl)
>> return;
>> @@ -1728,7 +1725,7 @@ void sched_init_numa(void)
>> /*
>> * .. and append 'j' levels of NUMA goodness.
>> */
>> - for (j = 1; j < level; i++, j++) {
>> + for (j = 1; j < nr_levels; i++, j++) {
>> tl[i] = (struct sched_domain_topology_level){
>> .mask = sd_numa_mask,
>> .sd_flags = cpu_numa_flags,
>> @@ -1740,8 +1737,8 @@ void sched_init_numa(void)
>>
>> sched_domain_topology = tl;
>>
>> - sched_domains_numa_levels = level;
>> - sched_max_numa_distance = sched_domains_numa_distance[level - 1];
>> + sched_domains_numa_levels = nr_levels;
>> + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
>>
>> init_numa_topology_type();
>> }
>> --
>> 2.27.0