Re: [6.5.0-rc1] unchecked MSR access error: RDMSR from 0xe2 at rIP: 0xffffffff87090227 (native_read_msr+0x7/0x40) (intel_idle_init_cstates_icpu)

From: Bruno Goncalves
Date: Tue Jul 11 2023 - 08:29:26 EST


On Tue, 11 Jul 2023 at 13:27, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Tue, Jul 11, 2023 at 11:11:44AM +0200, Bruno Goncalves wrote:
> > Hello Arjan and Rafael,
> >
> > I'm not sure if you are the right person to contact about this, but
> > we've noticed the following call trace when booting on KVM machines
> > using kernel 6.5.0-rc1:
> >
> > [ 1.089740] io scheduler kyber registered
> > [ 1.091080] io scheduler bfq registered
> > [ 1.094696] atomic64_test: passed for x86-64 platform with CX8 and with SSE
> > [ 1.097371] shpchp: Standard Hot Plug PCI Controller Driver version: 0.4
> > [ 1.099559] unchecked MSR access error: RDMSR from 0xe2 at rIP:
> > 0xffffffffa60912d7 (native_read_msr+0x7/0x40)
> > [ 1.102725] Call Trace:
> > [ 1.103547] <TASK>
> > [ 1.104236] ? ex_handler_msr+0x121/0x130
> > [ 1.105545] ? fixup_exception+0x234/0x310
> > [ 1.106858] ? gp_try_fixup_and_notify+0x1d/0xb0
> > [ 1.108342] ? exc_general_protection+0x147/0x420
> > [ 1.109838] ? pcpu_block_refresh_hint+0x87/0xb0
> > [ 1.111309] ? asm_exc_general_protection+0x26/0x30
> > [ 1.112866] ? native_read_msr+0x7/0x40
> > [ 1.114099] intel_idle_init_cstates_icpu.constprop.0+0x762/0x7f0
> > [ 1.116063] intel_idle_init+0x31b/0x4c0
> > [ 1.117315] ? __pfx_intel_idle_init+0x10/0x10
> > [ 1.118734] do_one_initcall+0x5a/0x320
> > [ 1.119981] kernel_init_freeable+0x31b/0x470
> > [ 1.121373] ? __pfx_kernel_init+0x10/0x10
> > [ 1.122681] kernel_init+0x1a/0x1c0
> > [ 1.123796] ret_from_fork+0x29/0x50
> > [ 1.124963] </TASK>
> > [ 1.126012] input: Power Button as
> > /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
> > [ 1.128477] ACPI: button: Power Button [PWRF]
> > [ 1.163767] ACPI: \_SB_.LNKB: Enabled at IRQ 10
> > [ 1.193831] ACPI: \_SB_.LNKC: Enabled at IRQ 11
>
> You forgot to say on what host hardware, what qemu options and you
> forgot to run the above through scripts/decode_stacktrace.sh.

I don't have access to the host machines, but the information I could
get so far is:

R640 2x Intel(R) Xeon(R) Silver 4116 CPU @ 2.10GHz

and start the VM with:

/usr/libexec/qemu-kvm -name guest=kvm-06-guest10,debug-threads=on -S
-object {"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-3-kvm-06-guest10/master-key.aes"}
-machine pc-i440fx-rhel7.6.0,usb=off,dump-guest-core=off,memory-backend=pc.ram
-accel kvm -cpu
Skylake-Server-IBRS,ss=on,vmx=on,pdcm=on,hypervisor=on,tsc-adjust=on,clflushopt=on,umip=on,pku=on,md-clear=on,stibp=on,arch-capabilities=on,ssbd=on,xsaves=on,ibpb=on,ibrs=on,amd-stibp=on,amd-ssbd=on,rsba=on,skip-l1dfl-vmentry=on,pschange-mc-no=on
-m 4096 -object
{"qom-type":"memory-backend-file","id":"pc.ram","mem-path":"/dev/hugepages/libvirt/qemu/3-kvm-06-guest10","x-use-canonical-path-for-ramblock-id":false,"prealloc":true,"size":4294967296,"host-nodes":[0],"policy":"bind"}
-overcommit mem-lock=off -smp 2,sockets=2,cores=1,threads=1 -uuid
cb26926d-fdcd-414e-a259-4fe99235d5bf -display none -no-user-config
-nodefaults -chardev socket,id=charmonitor,fd=45,server=on,wait=off
-mon chardev=charmonitor,id=monitor,mode=control -rtc
base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet
-no-shutdown -boot strict=on -device
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -blockdev
{"driver":"file","filename":"/var/lib/libvirt/images/kvm-06-guest10.qcow2","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"}
-blockdev {"node-name":"libvirt-1-format","read-only":false,"driver":"qcow2","file":"libvirt-1-storage","backing":null}
-device virtio-blk-pci,bus=pci.0,addr=0x3,drive=libvirt-1-format,id=virtio-disk0,bootindex=2
-netdev tap,fd=46,id=hostnet0,vhost=on,vhostfd=48 -device
virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:9d:24:62,bootindex=1,bus=pci.0,addr=0x2
-chardev socket,id=charserial0,fd=44,server=on,wait=off -device
isa-serial,chardev=charserial0,id=serial0 -audiodev
{"id":"audio1","driver":"none"} -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -sandbox
on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
-msg timestamp=on

The decoded call trace:

[ 14.476177] shpchp: Standard Hot Plug PCI Controller Driver version: 0.4
[ 14.478145] unchecked MSR access error: RDMSR from 0xe2 at rIP:
0xffffffffb421dfd7 native_read_msr
(/builds/4620032464/workdir/./arch/x86/include/asm/msr.h:84
/builds/4620032464/workdir/./arch/x86/include/asm/msr.h:118)
[ 14.480268] Call Trace:
[ 14.480809] <TASK>
[ 14.481283] ? ex_handler_msr
(/builds/4618511676/workdir/arch/x86/mm/extable.c:191)
[ 14.482153] ? fixup_exception
(/builds/4618511676/workdir/arch/x86/mm/extable.c:305)
[ 14.483042] ? gp_try_fixup_and_notify (arch/x86/kernel/traps.c:702
(discriminator 1))
[ 14.484045] ? exc_general_protection (arch/x86/kernel/traps.c:762
(discriminator 1) arch/x86/kernel/traps.c:728 (discriminator 1))
[ 14.485085] ? asm_exc_general_protection
(/builds/4618511676/workdir/./arch/x86/include/asm/idtentry.h:564)
[ 14.486152] ? cpuidle_poll_state_init
(/builds/4618511676/workdir/drivers/cpuidle/poll_state.c:60)
[ 14.487195] ? native_read_msr
(/builds/4620032464/workdir/./arch/x86/include/asm/msr.h:84
/builds/4620032464/workdir/./arch/x86/include/asm/msr.h:118)
[ 14.488049] intel_idle_init_cstates_icpu.constprop.0
(/builds/4626306068/workdir/./arch/x86/include/asm/paravirt.h:190
/builds/4626306068/workdir/drivers/idle/intel_idle.c:1820
/builds/4626306068/workdir/drivers/idle/intel_idle.c:1974)
[ 14.489360] ? __pfx_intel_idle_init_cstates_icpu.constprop.0
(/builds/4626306068/workdir/drivers/idle/intel_idle.c:1958)
[ 14.490773] ? pcpu_alloc (/builds/4626306068/workdir/mm/percpu.c:1886)
[ 14.491577] intel_idle_init
(/builds/4626306068/workdir/drivers/idle/intel_idle.c:2205
/builds/4626306068/workdir/drivers/idle/intel_idle.c:2246)
[ 14.492430] ? __pfx_intel_idle_init
(/builds/4626306068/workdir/drivers/idle/intel_idle.c:2228)
[ 14.493387] ? add_device_randomness
(/builds/4620032464/workdir/drivers/char/random.c:918)
[ 14.494382] ? __pfx_intel_idle_init
(/builds/4626306068/workdir/drivers/idle/intel_idle.c:2228)
[ 14.495372] do_one_initcall (init/main.c:1232)
[ 14.496205] ? __pfx_do_one_initcall (init/main.c:1223)
[ 14.497169] ? kasan_set_track
(/builds/4620032464/workdir/mm/kasan/common.c:52 (discriminator 1))
[ 14.498033] ? __kasan_kmalloc
(/builds/4620032464/workdir/mm/kasan/common.c:374
/builds/4620032464/workdir/mm/kasan/common.c:383)
[ 14.498920] kernel_init_freeable (init/main.c:1293 (discriminator
1) init/main.c:1310 (discriminator 1) init/main.c:1329 (discriminator
1) init/main.c:1546 (discriminator 1))
[ 14.499897] ? __pfx_kernel_init (init/main.c:1429)
[ 14.500843] kernel_init (init/main.c:1439)
[ 14.501602] ? __pfx_kernel_init (init/main.c:1429)
[ 14.502490] ret_from_fork
(/builds/4618511676/workdir/arch/x86/entry/entry_64.S:308)
[ 14.503270] </TASK>
[ 14.507587] input: Power Button as
/devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
[ 14.510321] ACPI: button: Power Button [PWRF]
[-- MARK -- Mon Jul 10 18:40:00 2023]

Bruno