Re: Kernel 6.9 regression: X86: Bogus messages from topology detection

From: Peter Schneider
Date: Fri May 31 2024 - 02:53:05 EST


Hi Thomas,


Am 30.05.2024 um 18:24 schrieb Thomas Gleixner:

>
> The proper fix is obviously to unlock CPUID on Intel _before_ anything
> which depends on cpuid_level is evaluated.
>
> Thanks,
>
> tglx
> ---
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -969,7 +969,7 @@ static void init_speculation_control(str
> }
> }
>
> -void get_cpu_cap(struct cpuinfo_x86 *c)
> +static void get_cpu_cap(struct cpuinfo_x86 *c)
> {
> u32 eax, ebx, ecx, edx;
>
> @@ -1585,6 +1585,7 @@ static void __init early_identify_cpu(st
> if (have_cpuid_p()) {
> cpu_detect(c);
> get_cpu_vendor(c);
> + intel_unlock_cpuid_leafs(c);
> get_cpu_cap(c);
> setup_force_cpu_cap(X86_FEATURE_CPUID);
> get_cpu_address_sizes(c);
> @@ -1744,7 +1745,7 @@ static void generic_identify(struct cpui
> cpu_detect(c);
>
> get_cpu_vendor(c);
> -
> + intel_unlock_cpuid_leafs(c);
> get_cpu_cap(c);
>
> get_cpu_address_sizes(c);
> --- a/arch/x86/kernel/cpu/cpu.h
> +++ b/arch/x86/kernel/cpu/cpu.h
> @@ -61,14 +61,15 @@ extern __ro_after_init enum tsx_ctrl_sta
>
> extern void __init tsx_init(void);
> void tsx_ap_init(void);
> +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
> #else
> static inline void tsx_init(void) { }
> static inline void tsx_ap_init(void) { }
> +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
> #endif /* CONFIG_CPU_SUP_INTEL */
>
> extern void init_spectral_chicken(struct cpuinfo_x86 *c);
>
> -extern void get_cpu_cap(struct cpuinfo_x86 *c);
> extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
> extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
> extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -269,19 +269,26 @@ static void detect_tme_early(struct cpui
> c->x86_phys_bits -= keyid_bits;
> }
>
> +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
> +{
> + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
> + return;
> +
> + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd))
> + return;
> +
> + /*
> + * The BIOS can have limited CPUID to leaf 2, which breaks feature
> + * enumeration. Unlock it and update the maximum leaf info.
> + */
> + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
> + c->cpuid_level = cpuid_eax(0);
> +}
> +
> static void early_init_intel(struct cpuinfo_x86 *c)
> {
> u64 misc_enable;
>
> - /* Unmask CPUID levels if masked: */
> - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
> - if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
> - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
> - c->cpuid_level = cpuid_eax(0);
> - get_cpu_cap(c);
> - }
> - }
> -
> if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
> (c->x86 == 0x6 && c->x86_model >= 0x0e))
> set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
>


With that patch applied, I now get a build error:

CC [M] drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp.o
CC [M] drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp1_execution.o
CC [M] drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp1_transition.o
CC [M] drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.o
CC [M] drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_transition.o
LD [M] drivers/gpu/drm/amd/amdgpu/amdgpu.o
AR drivers/gpu/built-in.a
AR drivers/built-in.a
make[1]: *** [/usr/src/linux/Makefile:1919: .] Fehler 2
make: *** [Makefile:240: __sub-make] Fehler 2
root@linus:/usr/src/linux# make
CALL scripts/checksyscalls.sh
DESCEND objtool
INSTALL libsubcmd_headers
DESCEND bpf/resolve_btfids
INSTALL libsubcmd_headers
CC arch/x86/xen/enlighten_pv.o
arch/x86/xen/enlighten_pv.c: In Funktion »xen_start_kernel«:
arch/x86/xen/enlighten_pv.c:1388:9: Fehler: Implizite Deklaration der Funktion »get_cpu_cap«; meinten Sie »set_cpu_cap«? [-Werror=implicit-function-declaration]
1388 | get_cpu_cap(&boot_cpu_data);
| ^~~~~~~~~~~
| set_cpu_cap
cc1: Einige Warnungen werden als Fehler behandelt
make[4]: *** [scripts/Makefile.build:244: arch/x86/xen/enlighten_pv.o] Fehler 1
make[3]: *** [scripts/Makefile.build:485: arch/x86/xen] Fehler 2
make[2]: *** [scripts/Makefile.build:485: arch/x86] Fehler 2
make[1]: *** [/usr/src/linux/Makefile:1919: .] Fehler 2
make: *** [Makefile:240: __sub-make] Fehler 2
root@linus:/usr/src/linux#


I used the kernel config of my Proxmox VE kernel, like so:

root@linus:/usr/src/linux# cp /boot/config-6.5.13-5-pve .config

and then ran "make olddefconfig", and then "make -j 48". That's how I tested all these patches, including Martin's previously mentioned SCSI patch, and this used to work. I have attached the .config file.

I am not a C programmer, let alone a kernel dev, so please bear with me if this is nonsense, but: could the reason be that your change removes the declaration of get_cpu_cap from the cpu.h header file, while arch/x86/xen/enlighten_pv.c still calls get_cpu_cap() (at line 1388, see the error above) and relies on that header for the declaration? It includes the header like so:

#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */

Should I try to just add it back in, and see if that works? Or would you prefer to look more deeply at this first, and then send me a reworked patch?

Beste Grüße,
Peter Schneider

--
Climb the mountain not to plant your flag, but to embrace the challenge,
enjoy the air and behold the view. Climb it so you can see the world,
not so the world can see you. -- David McCullough Jr.

OpenPGP: 0xA3828BD796CCE11A8CADE8866E3A92C92C3FF244
Download: https://www.peters-netzplatz.de/download/pschneider1968_pub.asc
https://keys.mailvelope.com/pks/lookup?op=get&search=pschneider1968@xxxxxxxxxxxxxx
https://keys.mailvelope.com/pks/lookup?op=get&search=pschneider1968@xxxxxxxxx

Attachment: .config
Description: XML document

Attachment: OpenPGP_signature.asc
Description: OpenPGP digital signature