Re: [PATCH 3/6] x86/smpboot.c: Wake up offline CPU via mwait or nmi

From: Suresh Siddha
Date: Mon Jun 04 2012 - 14:59:44 EST


On Mon, 2012-06-04 at 11:17 -0700, Fenghua Yu wrote:
> From: Fenghua Yu <fenghua.yu@xxxxxxxxx>
>
> wakeup_secondary_cpu_via_soft() is defined to wake up offline CPU via mwait if
> the CPU is in mwait or via nmi if the CPU is in hlt.
>
> A CPU boots up by INIT, INIT, STARTUP sequence when it boots up for the first
> time during boot time or hot plug.

I think this breaks suspend/resume as the cpu state gets lost.

Have you tried suspend/resume?

>
> Signed-off-by: Fenghua Yu <fenghua.yu@xxxxxxxxx>
> ---
> arch/x86/include/asm/apic.h | 5 +-
> arch/x86/kernel/smpboot.c | 187 ++++++++++++++++++++++++++++++++++++------
> 2 files changed, 164 insertions(+), 28 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index eaff479..cad00b1 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -425,7 +425,10 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
> #ifdef CONFIG_SMP
> extern atomic_t init_deasserted;
> extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
> -#endif
> +extern int wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip);
> +#else /* CONFIG_SMP */
> +#define wakeup_secondary_cpu_via_soft NULL
> +#endif /* CONFIG_SMP */
>
> #ifdef CONFIG_X86_LOCAL_APIC
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index fd019d7..109df30 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,13 +472,8 @@ void __inquire_remote_apic(int apicid)
> }
> }
>
> -/*
> - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> - * won't ... remember to clear down the APIC, etc later.
> - */
> -int __cpuinit
> -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +static int __cpuinit
> +_wakeup_secondary_cpu_via_nmi(int apicid, int dest_mode)
> {
> unsigned long send_status, accept_status = 0;
> int maxlvt;
> @@ -486,7 +481,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> /* Target chip */
> /* Boot on the stack */
> /* Kick the second */
> - apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
> + apic_icr_write(APIC_DM_NMI | dest_mode, apicid);
>
> pr_debug("Waiting for send to finish...\n");
> send_status = safe_apic_wait_icr_idle();
> @@ -511,6 +506,47 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> return (send_status | accept_status);
> }
>
> +/*
> + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> + * won't ... remember to clear down the APIC, etc later.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi_phys(int phys_apicid, unsigned long start_eip)
> +{
> + return _wakeup_secondary_cpu_via_nmi(phys_apicid, APIC_DEST_PHYSICAL);
> +}
> +
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +{
> + return _wakeup_secondary_cpu_via_nmi(logical_apicid, APIC_DEST_LOGICAL);
> +}
> +
> +DEFINE_PER_CPU(int, cpu_dead) = { 0 };
> +#define CPU_DEAD_TRIGGER 1
> +#define CPU_DEAD_MWAIT 2
> +#define CPU_DEAD_HLT 4
> +
> +static int wakeup_secondary_cpu_via_mwait(int cpu)
> +{
> + per_cpu(cpu_dead, cpu) |= CPU_DEAD_TRIGGER;
> + return 0;
> +}
> +
> +static int wakeup_cpu_nmi(unsigned int cmd, struct pt_regs *regs)
> +{
> + int cpu = smp_processor_id();
> + int *cpu_dead_ptr;
> +
> + cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> + if (!cpu_online(cpu) && (*cpu_dead_ptr & CPU_DEAD_HLT) &&
> + (*cpu_dead_ptr & CPU_DEAD_TRIGGER))
> + return NMI_HANDLED;
> +
> + return NMI_DONE;
> +}
> +
> static int __cpuinit
> wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
> {
> @@ -626,6 +662,52 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
> return (send_status | accept_status);
> }
>
> +/*
> + * Kick a cpu.
> + *
> + * If the CPU is in mwait, wake it up by mwait method. Otherwise, if the CPU is
> + * in halt, wake it up by NMI. If none of above exists, wake it up by INIT boot
> + * APIC message.
> + *
> + * When the CPU first time boots up, i.e. cpu_dead is 0, it's waken up by INIT
> + * boot APIC message.
> + *
> + * At this point, the CPU should be in a fixed dead state. So we don't consider
> + * racy condition here.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip)
> +{
> + int cpu;
> + int boot_error = 0;
> + /* start_ip had better be page-aligned! */
> + unsigned long start_ip = real_mode_header->trampoline_start;
> +
> + for (cpu = 0; cpu < nr_cpu_ids; cpu++)
> + if (apicid == apic->cpu_present_to_apicid(cpu))
> + break;
> +
> + if (cpu >= nr_cpu_ids)
> + return -EINVAL;
> +
> + if (per_cpu(cpu_dead, cpu) & CPU_DEAD_MWAIT) {
> + boot_error = wakeup_secondary_cpu_via_mwait(cpu);
> + } else if (per_cpu(cpu_dead, cpu) & CPU_DEAD_HLT) {
> + int *cpu_dead_ptr;
> +
> + cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> + *cpu_dead_ptr |= CPU_DEAD_TRIGGER;
> +
> + boot_error = wakeup_secondary_cpu_via_nmi_phys(apicid,
> + start_ip);
> + if (boot_error)
> + *cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
> + } else
> + boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
> +
> + return boot_error;
> +}
> +
> /* reduce the number of lines printed when booting a large cpu count system */
> static void __cpuinit announce_cpu(int cpu, int apicid)
> {
> @@ -778,6 +860,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
> */
> smpboot_restore_warm_reset_vector();
> }
> +
> return boot_error;
> }
>
> @@ -977,6 +1060,20 @@ static void __init smp_cpu_index_default(void)
> }
> }
>
> +static bool mwait_supported(void)
> +{
> + struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> +
> + if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> + return false;
> + if (!this_cpu_has(X86_FEATURE_CLFLSH))
> + return false;
> + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> + return false;
> +
> + return true;
> +}
> +
> /*
> * Prepare for SMP bootup. The MP table or ACPI has been read
> * earlier. Just do some sanity checking here and enable APIC mode.
> @@ -1051,6 +1148,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
> uv_system_init();
>
> set_mtrr_aps_delayed_init();
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> + if (!mwait_supported())
> + register_nmi_handler(NMI_LOCAL, wakeup_cpu_nmi, 0, "wake_cpu");
> +#endif
> out:
> preempt_enable();
> }
> @@ -1111,6 +1213,12 @@ static int __init _setup_possible_cpus(char *str)
> }
> early_param("possible_cpus", _setup_possible_cpus);
>
> +static int __init setup_wakeup_cpu_via_init(char *str)
> +{
> + apic->wakeup_secondary_cpu = NULL;
> + return 0;
> +}
> +__setup("wakeup_cpu_via_init", setup_wakeup_cpu_via_init);
>
> /*
> * cpu_possible_mask should be static, it cannot change as cpu's
> @@ -1286,6 +1394,28 @@ void play_dead_common(void)
> local_irq_disable();
> }
>
> +static bool wakeup_cpu(int *trigger)
> +{
> + unsigned int timeout;
> +
> + /*
> + * Wait up to 1 seconds to check if CPU wakeup trigger is set in
> + * cpu_dead by either memory write or NMI.
> + * If there is no CPU wakeup trigger, go back to sleep.
> + */
> + for (timeout = 0; timeout < 1000000; timeout++) {
> + /*
> + * Check if CPU0 wakeup NMI is issued and handled.
> + */
> + if (*trigger & CPU_DEAD_TRIGGER)
> + return true;
> +
> + udelay(1);
> + }
> +
> + return false;
> +}
> +
> /*
> * We need to flush the caches before going to sleep, lest we have
> * dirty data in our caches when we come back up.
> @@ -1296,14 +1426,9 @@ static inline void mwait_play_dead(void)
> unsigned int highest_cstate = 0;
> unsigned int highest_subcstate = 0;
> int i;
> - void *mwait_ptr;
> - struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> + int *cpu_dead_ptr;
>
> - if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> - return;
> - if (!this_cpu_has(X86_FEATURE_CLFLSH))
> - return;
> - if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> + if (!mwait_supported())
> return;
>
> eax = CPUID_MWAIT_LEAF;
> @@ -1328,16 +1453,10 @@ static inline void mwait_play_dead(void)
> (highest_subcstate - 1);
> }
>
> - /*
> - * This should be a memory location in a cache line which is
> - * unlikely to be touched by other processors. The actual
> - * content is immaterial as it is not actually modified in any way.
> - */
> - mwait_ptr = &current_thread_info()->flags;
> -
> - wbinvd();
> -
> + cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> + *cpu_dead_ptr = CPU_DEAD_MWAIT;
> while (1) {
> + *cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
> /*
> * The CLFLUSH is a workaround for erratum AAI65 for
> * the Xeon 7400 series. It's not clear it is actually
> @@ -1345,20 +1464,34 @@ static inline void mwait_play_dead(void)
> * The WBINVD is insufficient due to the spurious-wakeup
> * case where we return around the loop.
> */
> - clflush(mwait_ptr);
> - __monitor(mwait_ptr, 0, 0);
> + wbinvd();
> + clflush(cpu_dead_ptr);
> + __monitor(cpu_dead_ptr, 0, 0);
> mb();
> - __mwait(eax, 0);
> + if ((*cpu_dead_ptr & CPU_DEAD_TRIGGER) == 0)
> + __mwait(eax, 0);
> +
> + /* Waken up by another CPU. */
> + if (wakeup_cpu(cpu_dead_ptr))
> + start_cpu();
> }
> }
>
> static inline void hlt_play_dead(void)
> {
> + int *cpu_dead_ptr;
> +
> if (__this_cpu_read(cpu_info.x86) >= 4)
> wbinvd();
>
> + cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> + *cpu_dead_ptr = CPU_DEAD_HLT;
> while (1) {
> + *cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
> native_halt();
> + /* If NMI wants to wake up me, I'll start. */
> + if (wakeup_cpu(cpu_dead_ptr))
> + start_cpu();
> }
> }
>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/