Re: [PATCH v12 06/11] x86/smpboot: Remove initial_stack on 64-bit

From: David Woodhouse
Date: Wed Mar 01 2023 - 05:43:17 EST


On Wed, 2023-03-01 at 11:18 +0100, Thomas Gleixner wrote:
>
> > By the time it's taken off, won't ->thread.sp have been written out
> > again? I figured it was just a dead variable while the actual %rsp was
> > in use?
>
> Yes. It's not used when the thread is on the CPU. And you are right,
> it's saved and restored in switch_to(). Can you please add a comment to
> that effect?

Pushed to my v12bis branch:

From ac86ad32434d4661db0bef6791b7953d369585e2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@xxxxxxxxx>
Date: Fri, 24 Feb 2023 10:42:31 -0500
Subject: [PATCH 06/11] x86/smpboot: Remove initial_stack on 64-bit

In order to facilitate parallel startup, start to eliminate some of the
global variables passing information to CPUs in the startup path.

However, start by introducing one more: smpboot_control. For now this
merely holds the CPU# of the CPU which is coming up. Each CPU can then
find its own per-cpu data, and everything else it needs can be found
from there, allowing the other global variables to be removed.

First to be removed is initial_stack. Each CPU can load %rsp from its
current_task->thread.sp instead. That is already set up with the correct
idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
the BSP also finds a suitable stack pointer in the static per-cpu data
when coming up on first boot.

On resume from S3, the CPU needs a temporary stack because its idle task
is already active. Instead of setting initial_stack, the sleep code can
simply set its own current->thread.sp to point to the temporary stack.
Nobody else cares about ->thread.sp for a thread which is currently on
a CPU, because the true value is actually in the %rsp register. Which
is restored with the rest of the CPU context in do_suspend_lowlevel().

Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
Reviewed-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
Tested-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
Tested-by: Guilherme G. Piccoli <gpiccoli@xxxxxxxxxx>
---
arch/x86/include/asm/processor.h | 6 ++++-
arch/x86/include/asm/smp.h | 5 +++-
arch/x86/kernel/acpi/sleep.c | 20 +++++++++++++--
arch/x86/kernel/asm-offsets.c | 1 +
arch/x86/kernel/head_64.S | 43 +++++++++++++++++++++-----------
arch/x86/kernel/smpboot.c | 7 +++++-
arch/x86/xen/xen-head.S | 2 +-
7 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4e35c66edeb7..bdde7316e75b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -648,7 +648,11 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)

#else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD { \
+ .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}

extern unsigned long KSTK_ESP(struct task_struct *task);

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..bf2c51df9e0b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -199,5 +199,8 @@ extern void nmi_selftest(void);
#define nmi_selftest() do { } while (0)
#endif

-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..1b4c43d0819a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,29 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+ /*
+ * As each CPU starts up, it will find its own stack pointer
+ * from its current_task->thread.sp. Typically that will be
+ * the idle thread for a newly-started AP, or even the boot
+ * CPU which will find it set to &init_task in the static
+ * per-cpu data.
+ *
+ * Make the resuming CPU use the temporary stack at startup
+ * by setting current->thread.sp to point to that. The true
+ * %rsp will be restored with the rest of the CPU context,
+ * by do_suspend_lowlevel(). And unwinders don't care about
+ * the abuse of ->thread.sp because it's a dead variable
+ * while the thread is running on the CPU anyway; the true
+ * value is in the actual %rsp register.
+ */
+ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0L;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */

/*
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 82c783da16a8..797ae1a15c91 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -108,6 +108,7 @@ static void __used common(void)
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+ OFFSET(X86_current_task, pcpu_hot, current_task);
#ifdef CONFIG_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..0b34f6a9221b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,8 +61,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* tables and then reload them.
*/

- /* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
+ /* Set up the stack for verify_cpu() */
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp

leaq _text(%rip), %rdi

@@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
ANNOTATE_NOENDBR // above

+#ifdef CONFIG_SMP
+ movl smpboot_control(%rip), %ecx
+
+ /* Get the per cpu offset for the given CPU# which is in ECX */
+ movq __per_cpu_offset(,%rcx,8), %rdx
+#else
+ xorq %edx, %edx /* zero-extended to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack.
+ *
+ * RDX contains the per-cpu offset
+ */
+ movq pcpu_hot + X86_current_task(%rdx), %rax
+ movq TASK_threadsp(%rax), %rsp
+
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
@@ -275,12 +293,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl initial_gs+4(%rip),%edx
wrmsr

- /*
- * Setup a boot time stack - Any secondary CPU will have lost its stack
- * by now because the cr3-switch above unmaps the real-mode stack
- */
- movq initial_stack(%rip), %rsp
-
/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
@@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64)
SYM_CODE_START(start_cpu0)
ANNOTATE_NOENDBR
UNWIND_HINT_EMPTY
- movq initial_stack(%rip), %rsp
+
+ /* Find the idle task stack */
+ movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq TASK_threadsp(%rcx), %rsp
+
jmp .Ljump_to_C_code
SYM_CODE_END(start_cpu0)
#endif
@@ -420,12 +436,6 @@ SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA

__INIT
@@ -660,6 +670,9 @@ SYM_DATA_END(level1_fixmap_pgt)
SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))

+ .align 16
+SYM_DATA(smpboot_control, .long 0)
+
.align 16
/* This must match the first entry in level2_kernel_pgt */
SYM_DATA(phys_base, .quad 0x0)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b18c1385e181..62e3bf37f0b8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1112,7 +1112,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
- initial_stack = idle->thread.sp;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ initial_stack = idle->thread.sp;
+ } else {
+ smpboot_control = cpu;
+ }

/* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..6bd391476656 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
ANNOTATE_NOENDBR
cld

- mov initial_stack(%rip), %rsp
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp

/* Set up %gs.
*
--
2.39.0


Attachment: smime.p7s
Description: S/MIME cryptographic signature