Re: [RFC PATCH v3 4/8] arm64: Handle early CPU boot failures

From: Will Deacon
Date: Tue Dec 15 2015 - 06:55:21 EST


On Wed, Dec 09, 2015 at 09:57:15AM +0000, Suzuki K. Poulose wrote:
> A secondary CPU could fail to come online due to insufficient
> capabilities and could simply die or loop in the kernel.
> e.g, a CPU with no support for the selected kernel PAGE_SIZE
> loops in kernel with MMU turned off.
> or a hotplugged CPU which doesn't have one of the advertised
> system capability will die during the activation.
>
> There is no way to synchronise the status of the failing CPU
> back to the master. This patch solves the issue by adding a
> field to the secondary_data which can be updated by the failing
> CPU.
>
> Here are the possible states :
>
> -1. CPU_WAIT_STATUS - Initial value set by the master CPU.
>
> 0. CPU_BOOT_SUCCESS - CPU has booted successfully.
>
> 1. CPU_KILL_ME - CPU has invoked cpu_ops->die, indicating the
> master CPU to synchronise by issuing a cpu_ops->cpu_kill.
>
> 2. CPU_STUCK_IN_KERNEL - CPU couldn't invoke die(), instead is
> looping in the kernel. This information could be used by say,
> kexec to check if it is really safe to do a kexec reboot.
>
> 3. CPU_PANIC_KERNEL - CPU detected some serious issues which
> requires kernel to crash immediately. The secondary CPU cannot
> call panic() until it has initialised the GIC. This flag can
> be used to instruct the master to do so.
>
> Cc: Will Deacon <will.deacon@xxxxxxx>
> Cc: Mark Rutland <mark.rutland@xxxxxxx>
> Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
> Signed-off-by: Suzuki K. Poulose <suzuki.poulose@xxxxxxx>
> ---
> arch/arm64/include/asm/smp.h | 24 +++++++++++++++++++-
> arch/arm64/kernel/asm-offsets.c | 3 +++
> arch/arm64/kernel/head.S | 19 ++++++++++++++--
> arch/arm64/kernel/smp.c | 48 ++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 90 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index 13ce01f..240ab3d 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -16,6 +16,19 @@
> #ifndef __ASM_SMP_H
> #define __ASM_SMP_H
>
> +/* Values for secondary_data.status */
> +
> +#define CPU_WAIT_RESULT -1
> +#define CPU_BOOT_SUCCESS 0
> +/* The cpu invoked ops->cpu_die, synchronise it with cpu_kill */
> +#define CPU_KILL_ME 1
> +/* The cpu couldn't die gracefully and is looping in the kernel */
> +#define CPU_STUCK_IN_KERNEL 2
> +/* Fatal system error detected by secondary CPU, crash the system */
> +#define CPU_PANIC_KERNEL 3
> +
> +#ifndef __ASSEMBLY__
> +
> #include <linux/threads.h>
> #include <linux/cpumask.h>
> #include <linux/thread_info.h>
> @@ -54,10 +67,15 @@ asmlinkage void secondary_start_kernel(void);
>
> /*
> * Initial data for bringing up a secondary CPU.
> + * @stack - sp for the secondary CPU
> + * @status - Result passed back from the secondary CPU to
> + * indicate failure.
> */
> struct secondary_data {
> void *stack;
> -};
> + unsigned long status;
> +} ____cacheline_aligned;

Why is this necessary?

> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index b225d34..e0a42dd 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -34,6 +34,7 @@
> #include <asm/pgtable-hwdef.h>
> #include <asm/pgtable.h>
> #include <asm/page.h>
> +#include <asm/smp.h>
> #include <asm/sysreg.h>
> #include <asm/thread_info.h>
> #include <asm/virt.h>
> @@ -605,7 +606,7 @@ ENTRY(secondary_startup)
> ENDPROC(secondary_startup)
>
> ENTRY(__secondary_switched)
> - ldr x0, [x22] // get secondary_data.stack
> + ldr x0, [x22, #CPU_BOOT_STACK] // get secondary_data.stack
> mov sp, x0
> mov x29, #0
> b secondary_start_kernel
> @@ -615,6 +616,7 @@ ENDPROC(__secondary_switched)
> * Enable the MMU.
> *
> * x0 = SCTLR_EL1 value for turning on the MMU.
> + * x22 = __va(secondary_data)
> * x27 = *virtual* address to jump to upon completion
> *
> * Other registers depend on the function called upon completion.
> @@ -647,6 +649,19 @@ __enable_mmu:
> ENDPROC(__enable_mmu)
>
> __no_granule_support:
> + /* Indicate that this CPU can't boot and is stuck in the kernel */
> + cmp x22, 0 // Boot CPU doesn't update status
> + b.eq 1f
> + adrp x1, secondary_data
> + add x1, x1, #:lo12:secondary_data // x1 = __pa(secondary_data)

This feels a bit grotty. You end up using the virtual address of
secondary_data to figure out if you're the boot CPU or not, and then you
end up having to compute the physical address of the same variable anyway.

> + mov x0, #CPU_STUCK_IN_KERNEL
> + str x0, [x1, #CPU_BOOT_STATUS] // update the secondary_data.status
> + /* flush the data to PoC */

Can you call __inval_cache_range instead of open-coding this?

> + isb

Why?

> + dc civac, x1
> + dsb sy
> + isb

Why?

> +1:
> wfe

Curious, but do you know why we don't have a wfi here too (like we do in
the C code)?

> - b __no_granule_support
> + b 1b
> ENDPROC(__no_granule_support)
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 607d876..708f4b1 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -63,6 +63,8 @@
> * where to place its SVC stack
> */
> struct secondary_data secondary_data;
> +/* Number of CPUs which aren't online, but looping in kernel text. */
> +u32 cpus_stuck_in_kernel;
>
> enum ipi_msg_type {
> IPI_RESCHEDULE,
> @@ -72,6 +74,16 @@ enum ipi_msg_type {
> IPI_IRQ_WORK,
> };
>
> +#ifdef CONFIG_HOTPLUG_CPU
> +static int op_cpu_kill(unsigned int cpu);
> +#else
> +static inline int op_cpu_kill(unsigned int cpu)
> +{
> + return -ENOSYS;
> +}
> +#endif
> +
> +
> /*
> * Boot a secondary CPU, and assign it the specified idle task.
> * This also gives us the initial stack to use for this CPU.
> @@ -86,6 +98,12 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
>
> static DECLARE_COMPLETION(cpu_running);
>
> +void update_cpu_boot_status(unsigned long status)
> +{
> + secondary_data.status = status;
> + __flush_dcache_area(&secondary_data, sizeof(secondary_data));
> +}
> +
> int __cpu_up(unsigned int cpu, struct task_struct *idle)
> {
> int ret;
> @@ -95,7 +113,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
> * page tables.
> */
> secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
> - __flush_dcache_area(&secondary_data, sizeof(secondary_data));
> + update_cpu_boot_status(CPU_WAIT_RESULT);
>
> /*
> * Now bring the CPU into our world.
> @@ -117,7 +135,31 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
> pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
> }
>
> + mb();

Why? (and why not smp_mb(), if the barrier is actually needed?).

> +
> secondary_data.stack = NULL;
> + if (ret && secondary_data.status) {
> + switch(secondary_data.status) {
> + default:
> + pr_err("CPU%u: failed in unknown state : 0x%lx\n",
> + cpu, secondary_data.status);
> + break;
> + case CPU_KILL_ME:
> + if (op_cpu_kill(cpu)) {
> + pr_crit("CPU%u: may not have shut down cleanly\n",
> + cpu);
> + cpus_stuck_in_kernel++;

Maybe restructure this so the failed kill case is a fallthrough to the
CPU_STUCK_IN_KERNEL case?

> + } else
> + pr_crit("CPU%u: died during early boot\n", cpu);

Missing braces.

> + break;
> + case CPU_STUCK_IN_KERNEL:
> + pr_crit("CPU%u: is stuck in kernel\n", cpu);
> + cpus_stuck_in_kernel++;
> + break;
> + case CPU_PANIC_KERNEL:
> + panic("CPU%u detected unsupported configuration\n", cpu);

It would nice to show what the configuration difference was, but maybe
that's work for later.

> + }
> + }
>
> return ret;
> }
> @@ -185,6 +227,7 @@ asmlinkage void secondary_start_kernel(void)
> */
> pr_info("CPU%u: Booted secondary processor [%08x]\n",
> cpu, read_cpuid_id());
> + update_cpu_boot_status(CPU_BOOT_SUCCESS);

Why do we need to continue with the cacheflushing at this point?

> set_cpu_online(cpu, true);
> complete(&cpu_running);
>
> @@ -327,10 +370,13 @@ void cpu_die_early(void)
> set_cpu_present(cpu, 0);
>
> #ifdef CONFIG_HOTPLUG_CPU
> + update_cpu_boot_status(CPU_KILL_ME);
> /* Check if we can park ourselves */
> if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_die)
> cpu_ops[cpu]->cpu_die(cpu);
> #endif
> + update_cpu_boot_status(CPU_STUCK_IN_KERNEL);
> + __flush_dcache_area(&secondary_data, sizeof(secondary_data));

Extra flushing, but I don't see why you need it at all when you're
running in C on the CPU coming up.

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/