Re: [PATCH v3 0/9] Parallel CPU bringup for x86_64

From: Sean Christopherson
Date: Fri Jan 28 2022 - 16:40:51 EST


On Fri, Jan 28, 2022, David Woodhouse wrote:
> On Fri, 2021-12-17 at 14:55 -0600, Tom Lendacky wrote:
> > On 12/17/21 2:13 PM, David Woodhouse wrote:
> > > On Fri, 2021-12-17 at 13:46 -0600, Tom Lendacky wrote:
> > > > There's no WARN or PANIC, just a reset. I can look to try and capture some
> > > > KVM trace data if that would help. If so, let me know what events you'd
> > > > like captured.
> > >
> > > Could start with just kvm_exit?
> > >
> > > Reason 8 would be KVM_EXIT_SHUTDOWN and would potentially indicate a
> > > triple fault.
> >
> > qemu-system-x86-24093 [005] ..... 1601.759486: kvm_exit: vcpu 112 reason shutdown rip 0xffffffff81070574 info1 0x0000000000000000 info2 0x0000000000000000 intr_info 0x80000b08 error_code 0x00000000
> >
> > # addr2line -e woodhouse-build-x86_64/vmlinux 0xffffffff81070574
> > /root/kernels/woodhouse-build-x86_64/./arch/x86/include/asm/desc.h:272
> >
> > Which is: asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
>
> So, I remain utterly bemused by this, and the Milan *guests* I have
> access to can't even kexec with a stock kernel; that is also "too fast"
> and they take a triple fault during the bringup in much the same way —
> even without my parallel patches, and even going back to fairly old
> kernels.
>
> I wasn't able to follow up with raw serial output during the bringup to
> pinpoint precisely where it happens, because the VM would tear itself
> down in response to the triple fault without actually flushing the last
> virtual serial output :)
>
> It would be really useful to get access to a suitable host where I can
> spawn this in qemu and watch it fail. I am suspecting a chip-specific
> quirk or bug at this point.

Nope, it's not chip-specific; you missed a spot. This also reproduces on a
sufficiently large Intel system (and on Milan). initial_gs gets overwritten by
common_cpu_up(), which leads to a CPU getting the wrong MSR_GS_BASE and then
the wrong raw_smp_processor_id(), so cpu_init_exception_handling() stuffs the
TSS descriptor into the wrong CPU's GDT and leaves its own TR descriptor NULL,
hence the triple fault on the "ltr" above.
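
If it helps to picture the failure, here's a rough userspace sketch, purely
illustrative and not taken from the kernel: the threads stand in for APs and
the shared variable for initial_gs. With parallel bringup there is no
handshake between common_cpu_up() writing initial_gs and the AP consuming it,
so a slow AP can read the offset intended for a later CPU (build with
"gcc -pthread"):

/*
 * Illustration only: a shared "initial_gs"-style slot gets reused while an
 * earlier consumer may still be waiting to read it.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static volatile unsigned long initial_gs;	/* stand-in for the real initial_gs */

static void *ap_thread(void *arg)
{
	unsigned long expected = (unsigned long)arg;

	usleep(1000);		/* this "AP" is slow to reach its startup code */
	if (initial_gs != expected)
		printf("AP expected offset %#lx, read %#lx -> wrong GS base\n",
		       expected, initial_gs);
	return NULL;
}

int main(void)
{
	pthread_t threads[4];

	for (unsigned long cpu = 1; cpu <= 4; cpu++) {
		/* models common_cpu_up() writing per_cpu_offset(cpu) */
		initial_gs = cpu * 0x1000;
		pthread_create(&threads[cpu - 1], NULL, ap_thread,
			       (void *)(cpu * 0x1000));
		/* parallel bringup: no wait for this AP before the next write */
	}

	for (int i = 0; i < 4; i++)
		pthread_join(threads[i], NULL);

	return 0;
}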

You also have a lurking bug in the x2APIC ID handling. The mask that strips
the boot flags from the APICID encoded in smpboot_control must only be applied
on the non-CPUID path, i.e. before the branch that reads CPUID leaf 0xB;
masking after the fact also throws away bits 31:16 of the x2APIC ID.
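
To make the truncation concrete, here's a trivial standalone snippet; the ID
value is made up for illustration, not taken from any real topology:

#include <stdio.h>

int main(void)
{
	unsigned int x2apic_id = 0x00010020;	/* hypothetical AP with bits 31:16 set */
	unsigned int masked = x2apic_id & 0xFFFF; /* what the misplaced "andl $0xFFFF, %eax" did */

	/* 0x10020 becomes 0x20, so the cpuid_to_apicid search matches the wrong (or no) entry */
	printf("x2APIC ID %#x masked down to %#x\n", x2apic_id, masked);
	return 0;
}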

You owe me two beers ;-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index dcdf49a137d6..23df88c86a0e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -208,11 +208,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	 * in smpboot_control:
 	 * Bit 0-15	APICID if STARTUP_USE_CPUID_0B is not set
 	 * Bit 16	Secondary boot flag
-	 * Bit 17	Parallel boot flag
+	 * Bit 17	Parallel boot flag (STARTUP_USE_CPUID_0B)
 	 */
 	testl	$STARTUP_USE_CPUID_0B, %eax
-	jz	.Lsetup_AP
+	jnz	.Luse_cpuid_0b
+	andl	$0xFFFF, %eax
+	jmp	.Lsetup_AP
 
+.Luse_cpuid_0b:
 	mov	$0x0B, %eax
 	xorl	%ecx, %ecx
 	cpuid
@@ -220,7 +223,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 
 .Lsetup_AP:
 	/* EAX contains the APICID of the current CPU */
-	andl	$0xFFFF, %eax
 	xorl	%ecx, %ecx
 	leaq	cpuid_to_apicid(%rip), %rbx
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 04f5c8de5606..e7fda406f39a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1093,6 +1093,17 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
 	return boot_error;
 }
 
+static bool do_parallel_bringup = true;
+
+static int __init no_parallel_bringup(char *str)
+{
+	do_parallel_bringup = false;
+
+	return 0;
+}
+early_param("no_parallel_bringup", no_parallel_bringup);
+
+
 int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
 	int ret;
@@ -1112,7 +1123,8 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
-	initial_gs = per_cpu_offset(cpu);
+	if (!do_parallel_bringup)
+		initial_gs = per_cpu_offset(cpu);
 #endif
 	return 0;
 }
@@ -1336,16 +1348,6 @@ int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return ret;
 }
 
-static bool do_parallel_bringup = true;
-
-static int __init no_parallel_bringup(char *str)
-{
-	do_parallel_bringup = false;
-
-	return 0;
-}
-early_param("no_parallel_bringup", no_parallel_bringup);
-
 int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret;