Re: [tip: sched/core] sched/topology: Compute sd_weight considering cpuset partitions
From: K Prateek Nayak
Date: Sat Mar 21 2026 - 04:59:28 EST
Hello Chenyu,
On 3/21/2026 1:17 PM, Chen, Yu C wrote:
> On 3/21/2026 3:33 PM, Chen, Yu C wrote:
>> On 3/21/2026 11:36 AM, K Prateek Nayak wrote:
>>> sd->span_weight = cpumask_weight(sched_domain_span(sd));
>>>
>>> which should have crashed too if we had a NULL pointer in the
>>> cpumask range. So I'm at a loss. Maybe the pc points to a
>>> different location in your build?
>>>
>>
>> A wild guess, the major change is that we access sd->span, before
>> initializing the sd structure with *sd = { ... }. The sd is allocated
>> via alloc_percpu() uninitialized, the span at the end of the sd structure
>> remain uninitialized. It is unclear how cpumask_weight(sd->span) might be
>> affected by this uninitialized state. Before this patch, after *sd = { ... }
>> is executed, the contents of sd->span are explicitly set to 0, which might
>> be safer?
>>
>
> I replied too fast, please ignore above comments, the sd->span should have been
> set via cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu))
So I managed to reproduce the crash and it is actually crashing at:
last->next = first;
in build_sched_groups(). If I print the span before and after we do
the *sd = { ... }, I see:
[ 0.056301] span before: 0
[ 0.056559] span after:
[ 0.056686] span double check:
The "double check" line prints cpumask_pr_args(sched_domain_span(sd)).
This solves the crash on top of this patch:
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79bab80af8f2..b347ae5d2786 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1693,6 +1693,8 @@ sd_init(struct sched_domain_topology_level *tl,
.name = tl->name,
};
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
+
WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
(SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
"CPU capacity asymmetry not supported on SMT\n");
---
And I see:
[ 0.056479] span before: 0
[ 0.056749] span after: 0
[ 0.056881] span double check: 0
But since span[] is a flexible array member at the end of struct sched_domain,
doing a *sd = { ... } shouldn't modify it since the size isn't known at
compile time and the compiler will only overwrite the fixed fields.
Is there a compiler angle I'm missing here?
The cpumask_and() that comes first looks like:
@ kernel/sched/topology.c:1649: cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
ldr r3, [r9] @ MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317], MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317]
@ kernel/sched/topology.c:1646: u64 now = sched_clock();
strd r0, [sp, #28] @,,
@ kernel/sched/topology.c:1649: cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
mov r1, r6 @, i
mov r0, r9 @, ivtmp.1798
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
mov r4, fp @ tmp740, sd
@ kernel/sched/topology.c:1649: cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
blx r3 @ MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317]
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
ldr r3, [r0] @ MEM[(const long unsigned int *)_356], MEM[(const long unsigned int *)_356]
ldr r0, [r7] @ MEM[(const long unsigned int *)cpu_map_104(D)], MEM[(const long unsigned int *)cpu_map_104(D)]
and r0, r0, r3 @ tmp736, MEM[(const long unsigned int *)cpu_map_104(D)], MEM[(const long unsigned int *)_356]
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
uxth r0, r0 @ _360, tmp736
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
str r0, [r4, #292]! @ _360, MEM[(long unsigned int *)sd_352 + 292B]
---
*sd assignment looks as follows in my disassembly:
.L1867:
@ kernel/sched/topology.c:1660: *sd = (struct sched_domain){
ldr ip, [sp, #48] @ tmp1203, %sfp
mov r2, #296 @,
mov r0, fp @, sd
mov r1, #0 @,
ldr r3, [ip] @ jiffies.324_453, jiffies
str r3, [sp, #36] @ jiffies.324_453, %sfp
ldr ip, [ip] @ jiffies.326_454, jiffies
@ kernel/sched/topology.c:1693: .name = tl->name,
ldr r3, [r9, #28] @ _455, MEM[(char * *)tl_317 + 28B]
str r3, [sp, #16] @ _455, %sfp
@ kernel/sched/topology.c:1660: *sd = (struct sched_domain){
str ip, [sp, #8] @ jiffies.326_454, %sfp
bl memset @
ldr r3, [sp, #36] @ jiffies.324_453, %sfp
ldr r2, [sp, #28] @ now, %sfp
str r3, [fp, #48] @ jiffies.324_453, sd_352->last_balance
ldr r3, [sp, #16] @ _455, %sfp
ldr ip, [sp, #8] @ jiffies.326_454, %sfp
str r2, [fp, #72] @ now, sd_352->newidle_stamp
str r3, [fp, #272] @ _455, sd_352->name
mov r3, #16 @ tmp1502,
ldr r2, [sp, #32] @ now, %sfp
str r3, [fp, #20] @ tmp1502, sd_352->busy_factor
@ kernel/sched/topology.c:1678: | sd_flags
orr r3, r4, #4096 @ _452, sd_flags,
@ kernel/sched/topology.c:1696: WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
and r4, r4, #160 @ tmp779, sd_flags,
@ kernel/sched/topology.c:1678: | sd_flags
orr r3, r3, #23 @ _452, _452,
@ kernel/sched/topology.c:1660: *sd = (struct sched_domain){
str r2, [fp, #76] @ now, sd_352->newidle_stamp
@ kernel/sched/topology.c:1696: WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
cmp r4, #160 @ tmp779,
@ kernel/sched/topology.c:1660: *sd = (struct sched_domain){
mov r2, #512 @ tmp776,
str ip, [fp, #88] @ jiffies.326_454, sd_352->last_decay_max_lb_cost
str r2, [fp, #60] @ tmp776, sd_352->newidle_call
str r2, [fp, #68] @ tmp776, sd_352->newidle_ratio
@ kernel/sched/topology.c:1662: .max_interval = 2*sd_weight,
lsl r2, r10, #1 @ tmp773, _484,
@ kernel/sched/topology.c:1660: *sd = (struct sched_domain){
str r5, [fp, #4] @ sd, sd_352->child
str r2, [fp, #16] @ tmp773, sd_352->max_interval
mov r2, #117 @ tmp775,
str r10, [fp, #12] @ _484, sd_352->min_interval
str r2, [fp, #24] @ tmp775, sd_352->imbalance_pct
mov r2, #256 @ tmp777,
str r10, [fp, #52] @ _484, sd_352->balance_interval
str r3, [fp, #40] @ _452, sd_352->flags
str r2, [fp, #64] @ tmp777, sd_352->newidle_success
---
If I add the new cpumask_and() I get the following after *sd assignment:
@ kernel/sched/topology.c:1696: cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
ldr r3, [r9] @ MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317], MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317]
blx r3 @ MEM[(const struct cpumask * (*<T2127>) (struct sched_domain_topology_level *, int) *)tl_317]
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
ldr r3, [r7] @ MEM[(const long unsigned int *)cpu_map_104(D)], MEM[(const long unsigned int *)cpu_map_104(D)]
ldr r2, [r0] @ MEM[(const long unsigned int *)_457], MEM[(const long unsigned int *)_457]
and r3, r3, r2 @ tmp788, MEM[(const long unsigned int *)cpu_map_104(D)], MEM[(const long unsigned int *)_457]
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
uxth r3, r3 @ tmp791, tmp788
@ ./include/linux/bitmap.h:329: return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
str r3, [fp, #292] @ tmp791, MEM[(long unsigned int *)sd_352 + 292B]
---
Both cpumask_and() calls seem to store to:
MEM[(long unsigned int *)sd_352 + 292B]
So I'm at a loss as to why this happens. Let me dig a little more.
--
Thanks and Regards,
Prateek