Re: [PATCH][RFC] workqueue: Fix kernel panic on CPU hot-unplug

From: Helge Deller
Date: Thu Feb 01 2024 - 12:56:35 EST


Hi Tejun,

On 2/1/24 17:54, Tejun Heo wrote:
> On Thu, Feb 01, 2024 at 05:41:10PM +0100, Helge Deller wrote:
> Hmm... I have a hard time imagining a scenario where some CPUs don't have
> pwq installed on wq->cpu_pwq. Can you please run `drgn
> tools/workqueue/wq_dump.py` before triggering the hotplug event and paste
> the output along with full dmesg?

Enabling CONFIG_DEBUG_INFO=y did the trick :-)


root@debian:~# drgn --main-symbols -s ./vmlinux ./wq_dump.py 2>&1 | tee L
Affinity Scopes
===============
wq_unbound_cpumask=0000ffff

CPU
nr_pods 16
pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 [4]=00000010 [5]=00000020 [6]=00000040 [7]=00000080 [8]=00000100 [9]=00000200 [10]=00000400 [11]=00000800 [12]=00001000 [13]=00002000 [14]=00004000 [15]=00008000
pod_node [0]=0 [1]=0 [2]=0 [3]=0 [4]=0 [5]=0 [6]=0 [7]=0 [8]=0 [9]=0 [10]=0 [11]=0 [12]=0 [13]=0 [14]=0 [15]=0
cpu_pod [0]=0 [1]=1

SMT
nr_pods 16
pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 [4]=00000010 [5]=00000020 [6]=00000040 [7]=00000080 [8]=00000100 [9]=00000200 [10]=00000400 [11]=00000800 [12]=00001000 [13]=00002000 [14]=00004000 [15]=00008000
pod_node [0]=0 [1]=0 [2]=0 [3]=0 [4]=0 [5]=0 [6]=0 [7]=0 [8]=0 [9]=0 [10]=0 [11]=0 [12]=0 [13]=0 [14]=0 [15]=0
cpu_pod [0]=0 [1]=1

CACHE (default)
nr_pods 1
pod_cpus [0]=0000ffff
pod_node [0]=0
cpu_pod [0]=0 [1]=0

NUMA
nr_pods 1
pod_cpus [0]=0000ffff
pod_node [0]=0
cpu_pod [0]=0 [1]=0

SYSTEM
nr_pods 1
pod_cpus [0]=0000ffff
pod_node [0]=-1
cpu_pod [0]=0 [1]=0

Worker Pools
============
pool[00] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 0
pool[01] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 0
pool[02] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 1
pool[03] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 1
pool[04] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 2
pool[05] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 2
pool[06] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 3
pool[07] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 3
pool[08] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 4
pool[09] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 4
pool[10] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 5
pool[11] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 5
pool[12] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 6
pool[13] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 6
pool[14] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 7
pool[15] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 7
pool[16] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 8
pool[17] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 8
pool[18] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 9
pool[19] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 9
pool[20] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 10
pool[21] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 10
pool[22] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 11
pool[23] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 11
pool[24] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 12
pool[25] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 12
pool[26] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 13
pool[27] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 13
pool[28] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 14
pool[29] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 14
pool[30] ref= 1 nice= 0 idle/workers= 0/ 0 cpu= 15
pool[31] ref= 1 nice=-20 idle/workers= 0/ 0 cpu= 15
pool[32] ref=28 nice= 0 idle/workers= 8/ 8 cpus=0000ffff pod_cpus=0000ffff

Workqueue CPU -> pool
=====================
[ workqueue \ type CPU 0 1 dfl]
events percpu 0 2
events_highpri percpu 1 3
events_long percpu 0 2
events_unbound unbound 32 32 32
events_freezable percpu 0 2
events_power_efficient percpu 0 2
events_freezable_power_ percpu 0 2
rcu_gp percpu 0 2
rcu_par_gp percpu 0 2
slub_flushwq percpu 0 2
netns ordered 32 32 32
mm_percpu_wq percpu 0 2
inet_frag_wq percpu 0 2
cgroup_destroy percpu 0 2
cgroup_pidlist_destroy percpu 0 2
cgwb_release percpu 0 2
writeback unbound 32 32 32
kintegrityd percpu 1 3
kblockd percpu 1 3
blkcg_punt_bio unbound 32 32 32
ata_sff percpu 0 2
usb_hub_wq percpu 0 2
inode_switch_wbs percpu 0 2
virtio-blk percpu 0 2
scsi_tmf_0 ordered 32 32 32
psmouse-smbus percpu 0 2
kpsmoused ordered 32 32 32
sock_diag_events percpu 0 2
kstrp ordered 32 32 32
ext4-rsv-conversion ordered 32 32 32
root@debian:~#
root@debian:~# lscpu
Architecture: parisc
Byte Order: Big Endian
CPU(s): 2
On-line CPU(s) list: 0,1
Model name: PA7300LC (PCX-L2)
CPU family: PA-RISC 1.1e
Model: 9000/778/B160L - Merlin L2 160 (9000/778/B160L)
Thread(s) per core: 1
Core(s) per socket: 1
Socket(s): 2
BogoMIPS: 2446.13
root@debian:~#
root@debian:~# chcpu -d 1
[ 261.926353] Backtrace:
[ 261.928292] [<10448744>] workqueue_offline_cpu+0x1d4/0x1dc
[ 261.928292] [<10429db4>] cpuhp_invoke_callback+0xf8/0x200
[ 261.928292] [<1042a1d0>] cpuhp_thread_fun+0xb8/0x164
[ 261.928292] [<10452970>] smpboot_thread_fn+0x284/0x288
[ 261.928292] [<1044d8f4>] kthread+0x12c/0x13c
[ 261.928292] [<1040201c>] ret_from_kernel_thread+0x1c/0x24
[ 261.928292]
[ 261.928292]
[ 261.928292] Kernel Fault: Code=26 (Data memory access rights trap) at addr 00000000
[ 261.928292] CPU: 1 PID: 21 Comm: cpuhp/1 Not tainted 6.8.0-rc1-32bit+ #1293
[ 261.928292] Hardware name: 9000/778/B160L
[ 261.928292]
[ 261.928292] YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
[ 261.928292] PSW: 00000000000001101111111100001111 Not tainted
[ 261.928292] r00-03 0006ff0f 11011540 10446d9c 11e00500
[ 261.928292] r04-07 11c0b800 00000002 11c0d000 00000001
[ 261.928292] r08-11 110194e4 11018f08 00000000 00000004
[ 261.928292] r12-15 10c78800 00000612 f0028050 f0027fd8
[ 261.928292] r16-19 fffffffc fee01180 f0027ed8 01735000
[ 261.928292] r20-23 0000ffff 1249cc00 1249cc00 00000000
[ 261.928292] r24-27 11c0c580 11c0d004 11c0d000 10ceb708
[ 261.928292] r28-31 00000000 0000000e 11e00580 00000018
[ 261.928292] sr00-03 00000000 00000000 00000000 000004be
[ 261.928292] sr04-07 00000000 00000000 00000000 00000000
[ 261.928292]
[ 261.928292] IASQ: 00000000 00000000 IAOQ: 10446db4 10446db8
[ 261.928292] IIR: 0f80109c ISR: 00000000 IOR: 00000000
[ 261.928292] CPU: 1 CR30: 11dd1710 CR31: 00000000
[ 261.928292] ORIG_R28: 00000612
[ 261.928292] IAOQ[0]: wq_update_pod+0x98/0x14c
[ 261.928292] IAOQ[1]: wq_update_pod+0x9c/0x14c
[ 261.928292] RP(r2): wq_update_pod+0x80/0x14c
[ 261.928292] Backtrace:
[ 261.928292] [<10448744>] workqueue_offline_cpu+0x1d4/0x1dc
[ 261.928292] [<10429db4>] cpuhp_invoke_callback+0xf8/0x200
[ 261.928292] [<1042a1d0>] cpuhp_thread_fun+0xb8/0x164
[ 261.928292] [<10452970>] smpboot_thread_fn+0x284/0x288
[ 261.928292] [<1044d8f4>] kthread+0x12c/0x13c
[ 261.928292] [<1040201c>] ret_from_kernel_thread+0x1c/0x24
[ 261.928292]
[ 261.928292] Kernel panic - not syncing: Kernel Fault