Re: [PATCHv7 21/30] x86/acpi, x86/boot: Add multiprocessor wake-up support

From: Dave Hansen
Date: Fri Mar 18 2022 - 15:22:21 EST


I'm not sure what I attached to the last one, but it wasn't what I
intended to send. Here it is with the actual code changes.From 09c9c93bce052cc688bdf73f80bf0f805202fc09 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx>
Date: Fri, 18 Mar 2022 18:30:39 +0300
Subject: [PATCH] x86/acpi, x86/boot: Add multiprocessor wake-up support

* Rewrite changelog
* Fix too-wide ->apic_id vertical alignment
* Remove errant "Virtual address of ..." comment
* Rename timeout=>wake_tries_left
* Make wake_tries_left an int, not a u8
* Actually try to explain the timeout in a comment
* I'd still love to know what the experiment actually did and
what the lower bound on the timeout is. Oh well.
* Change smp_wmb()/WRITE_ONCE() back to Peterz's
smp_store_release() suggestion.
* Use smp_store_release() for =0 too
* add a pr_warn() for failures
* Add an endif #ifdef comment, and fix up CONFIG_X86_LOCAL_APIC
oddity.

--

Secondary CPU startup is currently performed with something called
the "INIT/SIPI protocol". This protocol requires assistance from
VMMs to boot guests. As should be a familiar story by now, that
support can not be provded to TDX guests because TDX VMMs are
not trusted by guests.

To remedy this situation a new[1] "Multiprocessor Wakeup Structure"
has been added to to an existing ACPI table (MADT). This structure
provides the physical address of a "mailbox". A write to the mailbox
then steers the secondary CPU to the boot code.

Add ACPI MADT wake structure parsing support and wake support. Use
this support to wake CPUs whenever it is present instead of INIT/SIPI.

While this structure can theoretically be used on 32-bit kernels,
there are no 32-bit TDX guest kernels. It has not been tested and
can not practically *be* tested on 32-bit. Make it 64-bit only.

1. Details about the new structure can be found in ACPI v6.4, in the
"Multiprocessor Wakeup Structure" section.

Co-developed-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Reviewed-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/apic.h | 5 ++
arch/x86/kernel/acpi/boot.c | 115 +++++++++++++++++++++++++++++++++++-
arch/x86/kernel/apic/apic.c | 10 ++++
3 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 48067af94678..3cef7f126876 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -488,6 +488,11 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
}

+#ifdef CONFIG_X86_64
+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+#endif
+
extern int default_apic_id_valid(u32 apicid);
extern int default_acpi_madt_oem_check(char *, char *);
extern void default_setup_apic_routing(void);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 5b6d1a95776f..26dc95f212e9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
static bool acpi_support_online_capable;
#endif

+#ifdef CONFIG_X86_64
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr;
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+#endif
+
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -336,7 +343,82 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}

-#endif /*CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_X86_64
+static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+{
+ static physid_mask_t apic_id_wakemap = PHYSID_MASK_NONE;
+ int wake_tries_left;
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * According to the ACPI specification r6.4, section titled
+ * "Multiprocessor Wakeup Structure" the mailbox-based wakeup
+ * mechanism cannot be used more than once for the same CPU.
+ * Skip wakeups if they are attempted more than once.
+ */
+ if (physid_isset(apicid, apic_id_wakemap)) {
+ pr_err("CPU already awake (APIC ID %x), skipping wakeup\n",
+ apicid);
+ return -EINVAL;
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up. The CPU being woken up is essentially
+ * in a spin loop waiting to be woken up. It should not take long
+ * for it wake up and acknowlege by zeroing out ->command.
+ *
+ * 255 was found to work on well-behaving TDX guest systems. ACPI
+ * does not say how long to wait. Timeouts here are likely due to
+ * host-side problems.
+ */
+ wake_tries_left = 255;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --wake_tries_left)
+ cpu_relax();
+
+ if (!wake_tries_left) {
+ pr_warn("ACPI wakeup of APIC ID: 0x%lx timed out\n", apicid);
+
+ /* Reset ->command so mailbox can try to boot another CPU */
+ smp_store_release(&acpi_mp_wake_mailbox->command, 0);
+ return -EIO;
+ }
+
+ /*
+ * If the CPU wakeup process is successful, store the
+ * status in apic_id_wakemap to prevent re-wakeup
+ * requests.
+ */
+ physid_set(apicid, apic_id_wakemap);
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
@@ -1083,6 +1165,29 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
+
+#ifdef CONFIG_X86_64
+static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ return -ENODEV;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+ if (BAD_MADT_ENTRY(mp_wake, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
+
+ acpi_wake_cpu_handler_update(acpi_wakeup_cpu);
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_IO_APIC
@@ -1278,6 +1383,14 @@ static void __init acpi_process_madt(void)

smp_found_config = 1;
}
+
+#ifdef CONFIG_X86_64
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_parse_mp_wake, 1);
+#endif
}
if (error == -EINVAL) {
/*
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..3c8f2c797a98 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2551,6 +2551,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
}
EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);

+#ifdef CONFIG_X86_64
+void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
+ (*drv)->wakeup_secondary_cpu_64 = handler;
+}
+#endif
+
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
--
2.34.0