Re: [PATCHv2 13/13] x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method

From: Thomas Gleixner
Date: Sun Oct 29 2023 - 13:31:45 EST


On Fri, Oct 20 2023 at 18:12, Kirill A. Shutemov wrote:

> MADT Multiprocessor Wakeup structure version 1 brings support of CPU
> offlining: BIOS provides a reset vector where the CPU has to jump to
> offline itself. The new TEST mailbox command can be used to test the CPU
> offlined successfully and BIOS has control over it.
>
> Add CPU offling support for ACPI MADT wakeup method by implementing
> custom cpu_die, play_dead and stop_other_cpus SMP operations.
>
> CPU offlining makes possible to hand over secondary CPUs over kexec, not

makes it possible

> limiting the second kernel with single CPU.

s/with/to/

> The change conforms to the approved ACPI spec change proposal. See the
> +SYM_FUNC_START(asm_acpi_mp_play_dead)
> + /* Load address of reset vector into RCX to jump when kernel is ready */
> + movq acpi_mp_reset_vector_paddr(%rip), %rcx
> +
> + /* Turn off global entries. Following CR3 write will flush them. */
> + movq %cr4, %rdx
> + andq $~(X86_CR4_PGE), %rdx
> + movq %rdx, %cr4
> +
> + /* Switch to identity mapping */
> + movq acpi_mp_pgd(%rip), %rax
> + movq %rax, %cr3

You can just make this function:

asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);

then you have the reset vector in RDI and the pgd in RSI and spare the
global variables.

>
> /* Physical address of the Multiprocessor Wakeup Structure mailbox */
> @@ -11,6 +16,150 @@ static u64 acpi_mp_wake_mailbox_paddr;
> /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
> static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
>
> +u64 acpi_mp_pgd;
> +u64 acpi_mp_reset_vector_paddr;

See above (static) and __ro_after_init please

> +
> +void asm_acpi_mp_play_dead(void);
> +
> +static void __init *alloc_pgt_page(void *context)
> +{

What's the purpose of the context argument?

> + return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +/*
> + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
> + * the same place as in the kernel page tables. The function switches to
> + * the identity mapping and has be present at the same spot in before and
> + * after transition.

Why does it need to be there after the CPU jumped to the reset vector?

> + */
> +static int __init init_transition_pgtable(pgd_t *pgd)
> +{
> + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> + unsigned long vaddr, paddr;
> + int result = -ENOMEM;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +
> + vaddr = (unsigned long)asm_acpi_mp_play_dead;
> + pgd += pgd_index(vaddr);
> + if (!pgd_present(*pgd)) {
> + p4d = (p4d_t *)alloc_pgt_page(NULL);
> + if (!p4d)
> + goto err;

return -ENOMEM?

the error labels is pretty silly without an actual cleanup, right?

> + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
> + }
> + p4d = p4d_offset(pgd, vaddr);
> + if (!p4d_present(*p4d)) {
> + pud = (pud_t *)alloc_pgt_page(NULL);
> + if (!pud)
> + goto err;

Ditto. But what mops up the already allocated page above?
> + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
> +
> + return 0;
> +err:
> + return result;
> +}

Can you please move that function to the place where it is used?

> +
> +static void acpi_mp_play_dead(void)
> +{
> + play_dead_common();
> + asm_acpi_mp_play_dead();
> +}
> +
> +static void acpi_mp_cpu_die(unsigned int cpu)
> +{
> + int apicid = per_cpu(x86_cpu_to_apicid, cpu);

u32 apicid

> + unsigned long timeout;
> +
> + /*
> + * Use TEST mailbox command to prove that BIOS got control over
> + * the CPU before declaring it dead.
> + *
> + * BIOS has to clear 'command' field of the mailbox.
> + */
> + acpi_mp_wake_mailbox->apic_id = apicid;
> + smp_store_release(&acpi_mp_wake_mailbox->command,
> + ACPI_MP_WAKE_COMMAND_TEST);
> +
> + /* Don't wait longer than a second. */
> + timeout = USEC_PER_SEC;
> + while (READ_ONCE(acpi_mp_wake_mailbox->command) && timeout--)
> + udelay(1);
> +}
> +
> +static void acpi_mp_stop_other_cpus(int wait)
> +{
> + smp_shutdown_nonboot_cpus(smp_processor_id());

This clearly was never tested with lockdep. At the point where
stop_other_cpus() is invoked the invoking CPU has interrupts disabled...

> +}
> +
> +static void acpi_mp_crash_stop_other_cpus(void)
> +{
> + smp_shutdown_nonboot_cpus(smp_processor_id());

Yuck. Crash can happen at arbitrary places. So you really cannot invoke
the whole CPU hotplug state machine from here.

There is a reason why the other implementation just kick CPUs into some
"safe" state.

> + /* The kernel is broken so disable interrupts */
> + local_irq_disable();
> +}
> +
> +static int __init acpi_mp_setup_reset(u64 reset_vector)
> +{
> + pgd_t *pgd;
> + struct x86_mapping_info info = {
> + .alloc_pgt_page = alloc_pgt_page,
> + .page_flag = __PAGE_KERNEL_LARGE_EXEC,
> + .kernpg_flag = _KERNPG_TABLE_NOENC,
> + };
> +
> + pgd = alloc_pgt_page(NULL);
> +
> + for (int i = 0; i < nr_pfn_mapped; i++) {
> + unsigned long mstart, mend;

Missing newline

> + mstart = pfn_mapped[i].start << PAGE_SHIFT;
> + mend = pfn_mapped[i].end << PAGE_SHIFT;
> + if (kernel_ident_mapping_init(&info, pgd, mstart, mend))
> + return -ENOMEM;
> + }
> +
> + if (kernel_ident_mapping_init(&info, pgd,
> + PAGE_ALIGN_DOWN(reset_vector),
> + PAGE_ALIGN(reset_vector + 1))) {
> + return -ENOMEM;
> + }
> +
> + if (init_transition_pgtable(pgd))
> + return -ENOMEM;
> +
> + smp_ops.play_dead = acpi_mp_play_dead;
> + smp_ops.cpu_die = acpi_mp_cpu_die;
> + smp_ops.stop_other_cpus = acpi_mp_stop_other_cpus;
> + smp_ops.crash_stop_other_cpus = acpi_mp_crash_stop_other_cpus;
> +
> + acpi_mp_reset_vector_paddr = reset_vector;
> + acpi_mp_pgd = __pa(pgd);
> +
> + return 0;
> +}
> +
> static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
> {
> if (!acpi_mp_wake_mailbox_paddr) {
> @@ -74,31 +223,43 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
> struct acpi_madt_multiproc_wakeup *mp_wake;
>
> mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> - if (BAD_MADT_ENTRY(mp_wake, end))
> + if (!mp_wake)
> + return -EINVAL;
> +
> + if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> + return -EINVAL;
> + if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> return -EINVAL;
>
> acpi_table_print_madt_entry(&header->common);
>
> acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
>
> - cpu_hotplug_disable_offlining();
> + if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
> + mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
> + acpi_mp_setup_reset(mp_wake->reset_vector);
> + } else {
> + cpu_hotplug_disable_offlining();
>
> - /*
> - * ACPI MADT doesn't allow to offline CPU after it got woke up.
> - * It limits kexec: the second kernel won't be able to use more than
> - * one CPU.
> - *
> - * Now acpi_mp_wake_mailbox_paddr already has the mailbox address.
> - * The acpi_wakeup_cpu() will use it to bring up secondary cpus.
> - *
> - * Zero out mailbox address in the ACPI MADT wakeup structure to
> - * indicate that the mailbox is not usable. This prevents the
> - * kexec()-ed kernel from reading a vaild mailbox, which in turn
> - * makes the kexec()-ed kernel only be able to use the boot CPU.
> - *
> - * This is Linux-specific protocol and not reflected in ACPI spec.
> - */
> - mp_wake->mailbox_address = 0;
> + /*
> + * ACPI MADT doesn't allow to offline CPU after it got woke up.

This is not longer accurate as V1 allows that ....

> + * It limits kexec: the second kernel won't be able to use more
> + * than one CPU.

Thanks,

tglx