Re: [PATCHv3 14/14] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

From: kirill.shutemov@xxxxxxxxxxxxxxx
Date: Fri Dec 01 2023 - 10:57:50 EST


On Thu, Nov 23, 2023 at 09:38:13AM +0000, Huang, Kai wrote:
>
> > diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> > index 171d86fe71ef..602b5d3982ff 100644
> > --- a/arch/x86/kernel/acpi/boot.c
> > +++ b/arch/x86/kernel/acpi/boot.c
> > @@ -22,6 +22,7 @@
> > #include <linux/efi-bgrt.h>
> > #include <linux/serial_core.h>
> > #include <linux/pgtable.h>
> > +#include <linux/sched/hotplug.h>
> >
> > #include <asm/e820/api.h>
> > #include <asm/irqdomain.h>
> > @@ -33,6 +34,7 @@
> > #include <asm/smp.h>
> > #include <asm/i8259.h>
> > #include <asm/setup.h>
> > +#include <asm/init.h>
>
> The above two are leftovers I believe?
>
> [...]
>

Right.


> > +
> > +static atomic_t waiting_for_crash_ipi;
> > +
> > +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
> > +
> > +static void acpi_mp_play_dead(void)
> > +{
> > + play_dead_common();
> > + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> > + acpi_mp_pgd);
> > +}
> > +
> > +static void acpi_mp_cpu_die(unsigned int cpu)
> > +{
> > + u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> > + unsigned long timeout;
> > +
> > + /*
> > + * Use TEST mailbox command to prove that BIOS got control over
> > + * the CPU before declaring it dead.
> > + *
> > + * BIOS has to clear 'command' field of the mailbox.
> > + */
> > + acpi_mp_wake_mailbox->apic_id = apicid;
> > + smp_store_release(&acpi_mp_wake_mailbox->command,
> > + ACPI_MP_WAKE_COMMAND_TEST);
> > +
> > + /* Don't wait longer than a second. */
> > + timeout = USEC_PER_SEC;
> > + while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
> > + udelay(1);
> > +}
> > +
> > +static void acpi_mp_stop_other_cpus(int wait)
> > +{
> > + smp_shutdown_nonboot_cpus(smp_processor_id());
> > +}
> > +
> > +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
> > +{
> > + local_irq_disable();
> > +
> > + crash_save_cpu(regs, raw_smp_processor_id());
> > +
> > + cpu_emergency_stop_pt();
> > +
> > + disable_local_APIC();
> > +
> > + /*
> > + * Prepare the CPU for reboot _after_ invoking the callback so that the
> > + * callback can safely use virtualization instructions, e.g. VMCLEAR.
> > + */
> > + cpu_emergency_disable_virtualization();
> > +
> > + atomic_dec(&waiting_for_crash_ipi);
> > +
> > + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
> > + acpi_mp_pgd);
> > +
> > + return NMI_HANDLED;
> > +}
> > +
> > +static void acpi_mp_crash_stop_other_cpus(void)
> > +{
> > + unsigned long timeout;
> > +
> > + /* The kernel is broken so disable interrupts */
> > + local_irq_disable();
> > +
> > +
> > + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> > +
> > + /* Would it be better to replace the trap vector here? */
> > + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
> > + NMI_FLAG_FIRST, "crash"))
> > + return; /* Return what? */
> > +
> > + apic_send_IPI_allbutself(NMI_VECTOR);
> > +
> > + /* Don't wait longer than a second. */
> > + timeout = USEC_PER_SEC;
> > + while (atomic_read(&waiting_for_crash_ipi) && timeout--)
> > + udelay(1);
> > +}
> > +
> >
>
> [...]
>
> > + smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> > + smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
> > +
> >
>
> The above acpi_mp_crash_stop_other_cpus() and crash_nmi_callback() etc are kinda
> duplicated code with the normal crash kexec() in reboot.c.
>
> I am not expert here but spent some time looking into the code, and it appears
> the main reason preventing us from reusing that code should be TDX guest doesn't
> play nicely with mwait/halt staff when putting the cpu to offline.  
>
> I am thinking if we skip/replace them with the asm_acpi_mp_play_dead(), we
> should be able to just reuse the existing smp_ops.stop_other_cpus() and
> smp_ops.crash_stop_other_cpus()?

Okay, fair enough. See fixup below.

> > +
> > int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
> > const unsigned long end)
> > {
> > struct acpi_madt_multiproc_wakeup *mp_wake;
> >
> > mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> > - if (BAD_MADT_ENTRY(mp_wake, end))
> > + if (!mp_wake)
> > + return -EINVAL;
>
> I think you can keep the BAD_MADT_ENTRY() check as a standard check, and ...

No. BAD_MADT_ENTRY() will fail if the struct version is V0 because the
size will be smaller than sizeof(struct acpi_madt_multiproc_wakeup).


diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4fab2ed454f3..3c8efba86d5c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -38,6 +38,7 @@ struct smp_ops {
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
+ void (*crash_play_dead)(void);

void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 782fe8fd533c..a801f773f9f1 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -25,6 +25,12 @@ static u64 acpi_mp_reset_vector_paddr __ro_after_init;

void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);

+static void crash_acpi_mp_play_dead(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
+ acpi_mp_pgd);
+}
+
static void acpi_mp_play_dead(void)
{
play_dead_common();
@@ -58,57 +64,6 @@ static void acpi_mp_stop_other_cpus(int wait)
smp_shutdown_nonboot_cpus(smp_processor_id());
}

-#ifdef CONFIG_KEXEC_CORE
-static atomic_t waiting_for_crash_ipi;
-
-static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
-{
- local_irq_disable();
-
- crash_save_cpu(regs, raw_smp_processor_id());
-
- cpu_emergency_stop_pt();
-
- disable_local_APIC();
-
- /*
- * Prepare the CPU for reboot _after_ invoking the callback so that the
- * callback can safely use virtualization instructions, e.g. VMCLEAR.
- */
- cpu_emergency_disable_virtualization();
-
- atomic_dec(&waiting_for_crash_ipi);
-
- asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr,
- acpi_mp_pgd);
-
- return NMI_HANDLED;
-}
-
-static void acpi_mp_crash_stop_other_cpus(void)
-{
- unsigned long timeout;
-
- /* The kernel is broken so disable interrupts */
- local_irq_disable();
-
-
- atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-
- /* Would it be better to replace the trap vector here? */
- if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
- NMI_FLAG_FIRST, "crash"))
- return; /* Return what? */
-
- apic_send_IPI_allbutself(NMI_VECTOR);
-
- /* Don't wait longer than a second. */
- timeout = USEC_PER_SEC;
- while (atomic_read(&waiting_for_crash_ipi) && timeout--)
- udelay(1);
-}
-#endif
-
/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
@@ -277,11 +232,9 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
}

smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.crash_play_dead = crash_acpi_mp_play_dead;
smp_ops.cpu_die = acpi_mp_cpu_die;
smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
-#ifdef CONFIG_KEXEC_CORE
- smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
-#endif

acpi_mp_reset_vector_paddr = reset_vector;
acpi_mp_pgd = __pa(pgd);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c81afffaa954..99e6ab552da0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -878,10 +878,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu_emergency_disable_virtualization();

atomic_dec(&waiting_for_crash_ipi);
- /* Assume hlt works */
- halt();
- for (;;)
- cpu_relax();
+
+ if (smp_ops.crash_play_dead) {
+ smp_ops.crash_play_dead();
+ } else {
+ halt();
+ for (;;)
+ cpu_relax();
+ }

return NMI_HANDLED;
}
--
Kiryl Shutsemau / Kirill A. Shutemov