[PATCH] arm64: smp: Add support for cpu park

From: Sang Yan
Date: Sat Dec 19 2020 - 01:14:21 EST


Introduce a CPU PARK feature to save the time spent taking CPUs down
and bringing them back up during kexec, which can cost about 250ms per
CPU for going down and 30ms per CPU for coming up.

As a result, on a 128-core system it takes more than 30 seconds to take
the CPUs down and bring them up during kexec, and it only gets worse
with 256 cores or more.

CPU PARK is a state in which a CPU stays powered on and spins in a loop,
polling for an exit condition such as an exit address being written.

A block of memory is reserved and filled with the cpu park text section,
the exit address and the park magic flag of each CPU. In this
implementation, one page is reserved per CPU core.

CPUs enter the park state instead of going down in machine_shutdown().
CPUs leave the park state in smp_init() instead of being brought up.

Layout of one cpu park section in the pre-reserved memory block:
+--------------+
+ exit address +
+--------------+
+ park magic +
+--------------+
+ park codes +
+ . +
+ . +
+ . +
+--------------+

Signed-off-by: Sang Yan <sangyan@xxxxxxxxxx>
---
arch/arm64/Kconfig | 12 +++
arch/arm64/include/asm/cpu.h | 9 ++
arch/arm64/include/asm/kexec.h | 7 ++
arch/arm64/include/asm/smp.h | 3 +
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/cpu-park.S | 49 +++++++++
arch/arm64/kernel/machine_kexec.c | 2 +-
arch/arm64/kernel/process.c | 4 +
arch/arm64/kernel/smp.c | 220 ++++++++++++++++++++++++++++++++++++++
arch/arm64/mm/init.c | 56 ++++++++++
10 files changed, 362 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/kernel/cpu-park.S

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f0139b..7a9defd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -347,6 +347,18 @@ config KASAN_SHADOW_OFFSET
default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
default 0xffffffffffffffff

+config ARM64_CPU_PARK
+ bool "Support CPU PARK on kexec"
+ depends on SMP
+ depends on KEXEC_CORE
+ help
+ This enables support for the CPU PARK feature in
+ order to save the time spent taking CPUs down and
+ bringing them up across kexec. Instead of dying
+ before the jump to the new kernel, a parked CPU
+ spins in a loop and leaves it for the new kernel
+ entry during smp_init.
+
source "arch/arm64/Kconfig.platforms"

menu "Kernel Features"
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 7faae6f..e616a50 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -68,4 +68,13 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info);
void update_cpu_features(int cpu, struct cpuinfo_arm64 *info,
struct cpuinfo_arm64 *boot);

+#ifdef CONFIG_ARM64_CPU_PARK
+/* Size of the park section set aside for each CPU: one page. */
+#define PARK_SECTION_SIZE PAGE_SIZE
+/* Physical address of the reserved park memory. */
+extern unsigned long park_start;
+/* Length in bytes of the reserved park memory. */
+extern unsigned long park_len;
+/* Virtual address of the reserved park memory. */
+extern unsigned long park_start_v;
+/*
+ * Disables the MMU, then branches to the park code at @text with @exit
+ * handed over in x0 (implemented in cpu-park.S).
+ */
+extern void __cpu_park(unsigned long text, unsigned long exit);
+/* Park loop whose text is copied into each CPU's park section. */
+extern void __do_cpu_park(unsigned long exit);
+#endif
+
#endif /* __ASM_CPU_H */
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index d24b527..69a66ca 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -25,6 +25,11 @@

#define KEXEC_ARCH KEXEC_ARCH_AARCH64

+#ifdef CONFIG_ARM64_CPU_PARK
+/* CPU park state flag: "park" */
+#define PARK_MAGIC 0x7061726b
+#endif
+
#ifndef __ASSEMBLY__

/**
@@ -90,6 +95,8 @@ static inline void crash_prepare_suspend(void) {}
static inline void crash_post_resume(void) {}
#endif

+void machine_kexec_mask_interrupts(void);
+
#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH

diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529..9141fa8 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -145,6 +145,9 @@ bool cpus_are_stuck_in_kernel(void);

extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+#ifdef CONFIG_ARM64_CPU_PARK
+extern int kexec_smp_send_park(void);
+#endif

#endif /* ifndef __ASSEMBLY__ */

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 86364ab..7ea26ab 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \
cpu-reset.o
+obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o
arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S
new file mode 100644
index 00000000..8c01484
--- /dev/null
+++ b/arch/arm64/kernel/cpu-park.S
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CPU park routines
+ *
+ * Copyright (C) 2020 Huawei Technologies., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+.text
+.pushsection .idmap.text, "awx"
+
+/*
+ * cpu park helper in idmap section
+ *
+ * x0: address of the park code to branch to
+ * x1: value handed to that park code in x0
+ *
+ * Turns the MMU off and ends with a branch; there is no return path here.
+ */
+SYM_CODE_START(__cpu_park)
+ /* Clear sctlr_el1 flags. */
+ mrs x12, sctlr_el1
+ ldr x13, =SCTLR_ELx_FLAGS
+ bic x12, x12, x13
+ pre_disable_mmu_workaround
+ msr sctlr_el1, x12 /* disable mmu */
+ isb
+
+ mov x18, x0
+ mov x0, x1 /* second argument becomes the park code's x0 */
+ br x18 /* enter the park code whose address came in x0 */
+SYM_CODE_END(__cpu_park)
+
+.popsection
+
+/*
+ * Park loop; its text is copied into each CPU's park section.
+ *
+ * x0: address of the park section. The exit address lives at offset 0
+ *     and the park magic at offset 8.
+ *
+ * Publishes PARK_MAGIC, then waits until a non-zero exit address shows up
+ * and branches to it. The exit address is tested before sleeping, so a
+ * value that is already set is acted on without waiting for another event.
+ */
+SYM_CODE_START(__do_cpu_park)
+	ldr	x18, =PARK_MAGIC	/* magic number "park" */
+	str	x18, [x0, #8]		/* set on-park flag */
+
+.Lloop:	ldr	x19, [x0]		/* read the exit address */
+	cbnz	x19, .Lexit		/* leave as soon as it is set */
+	wfe				/* nothing yet: wait for an event */
+	b	.Lloop
+
+.Lexit:	br	x19			/* jump to the exit address */
+SYM_CODE_END(__do_cpu_park)
+
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144c..f47ce96 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}

-static void machine_kexec_mask_interrupts(void)
+void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 6616486..70d8dac 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -147,6 +147,10 @@ void arch_cpu_idle_dead(void)
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_ARM64_CPU_PARK
+ /* Parking succeeded (0): skip the regular shutdown of the other CPUs. */
+ if (kexec_smp_send_park() == 0)
+ return;
+#endif
smp_shutdown_nonboot_cpus(reboot_cpu);
}

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 2499b89..80a7a28 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -93,6 +93,159 @@ static inline int op_cpu_kill(unsigned int cpu)
}
#endif

+#ifdef CONFIG_ARM64_CPU_PARK
+/* Layout of one per-CPU park section inside the reserved park memory. */
+struct cpu_park_section {
+	unsigned long exit;	/* exit address of the park loop */
+	unsigned long magic;	/* magic value marking the parked state */
+	char text[];		/* text section of park */
+};
+
+/*
+ * Map the reserved park memory, once.
+ *
+ * Returns 0 when park_start_v is usable (freshly mapped or mapped by an
+ * earlier call), -ENOMEM when no park memory was set up or mapping failed.
+ */
+static int mmap_cpu_park_mem(void)
+{
+	unsigned long vaddr;
+
+	/* No physical region recorded: nothing to map. */
+	if (park_start == 0)
+		return -ENOMEM;
+
+	/* A previous call already produced the mapping. */
+	if (park_start_v != 0)
+		return 0;
+
+	vaddr = (unsigned long)__ioremap(park_start, park_len,
+					 PAGE_KERNEL_EXEC);
+	if (vaddr == 0) {
+		pr_warn("map park memory failed.");
+		return -ENOMEM;
+	}
+
+	park_start_v = vaddr;
+	return 0;
+}
+
+/*
+ * Virtual address of the park section that belongs to @cpu.
+ *
+ * Sections are indexed by the cpu number itself. The reserved region is
+ * sized for NR_CPUS sections, and indexing by cpu - 1 would wrap around
+ * for cpu 0, which is parked as well whenever kexec runs on another cpu.
+ */
+static inline unsigned long cpu_park_section_v(unsigned int cpu)
+{
+	return PAGE_ALIGN(park_start_v + PARK_SECTION_SIZE * cpu);
+}
+
+/*
+ * Physical address of the park section that belongs to @cpu; uses the
+ * same indexing as cpu_park_section_v() so both views name the same page.
+ */
+static inline unsigned long cpu_park_section_p(unsigned int cpu)
+{
+	return PAGE_ALIGN(park_start + PARK_SECTION_SIZE * cpu);
+}
+
+/*
+ * Release a parked cpu by storing the physical address of
+ * secondary_entry into the exit slot of its park section.
+ *
+ * The store only happens when the first 8 bytes of the section's text
+ * match the first 8 bytes of this kernel's __do_cpu_park.
+ *
+ * Returns 0 when the exit address was written, -EPERM when the park
+ * memory cannot be mapped or the text does not match.
+ *
+ * NOTE(review): nothing visible here wakes a cpu sleeping in WFE or
+ * cleans the store out of the data cache for a reader that runs with the
+ * MMU off - confirm this is covered elsewhere.
+ */
+static int write_park_exit(unsigned int cpu)
+{
+	struct cpu_park_section *section;
+	unsigned long *installed;
+	unsigned long *reference = (unsigned long *)__do_cpu_park;
+
+	if (mmap_cpu_park_mem())
+		return -EPERM;
+
+	section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	installed = (unsigned long *)section->text;
+	pr_info("park_text 0x%lx : 0x%lx, __do_cpu_park text 0x%lx : 0x%lx",
+		(unsigned long)installed, *installed,
+		(unsigned long)__do_cpu_park, *reference);
+
+	isb();
+
+	/* Different leading text: this section is not a park loop we know. */
+	if (*installed != *reference)
+		return -EPERM;
+
+	section->exit = __pa_symbol(secondary_entry);
+	isb();
+	pr_info("Write cpu %u secondary entry 0x%lx to 0x%lx",
+		cpu, section->exit, (unsigned long)&section->exit);
+
+	return 0;
+}
+
+/*
+ * Prepare the park section of @cpu: clear the exit address and the magic,
+ * then fill the rest of the section with bytes copied from __do_cpu_park.
+ *
+ * Always returns 0.
+ */
+static int install_cpu_park(unsigned int cpu)
+{
+	struct cpu_park_section *section =
+		(struct cpu_park_section *)cpu_park_section_v(cpu);
+	/* Everything in the section after the two header words is text. */
+	unsigned long text_len = PARK_SECTION_SIZE - sizeof(*section);
+
+	pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx",
+		 cpu, (unsigned long)section, (unsigned long)(section->text));
+
+	section->exit = 0UL;
+	section->magic = 0UL;
+	memcpy(section->text, __do_cpu_park, text_len);
+
+	isb();
+
+	return 0;
+}
+
+/*
+ * Zero the whole park section of @cpu.
+ *
+ * Returns 0 on success, -EPERM when the park memory cannot be mapped.
+ */
+static int uninstall_cpu_park(unsigned int cpu)
+{
+	void *section;
+
+	if (mmap_cpu_park_mem())
+		return -EPERM;
+
+	section = (void *)cpu_park_section_v(cpu);
+	memset(section, 0, PARK_SECTION_SIZE);
+
+	return 0;
+}
+
+/*
+ * Poll the park section of @cpu until it shows PARK_MAGIC, giving up
+ * after USEC_PER_SEC polls spaced by udelay(1).
+ *
+ * Returns 1 when the cpu reported the parked state, 0 on timeout.
+ */
+static int cpu_wait_park(unsigned int cpu)
+{
+	struct cpu_park_section *park_section;
+	unsigned long *park_magic;
+	unsigned long timeout;
+	int parked;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	park_magic = &park_section->magic;
+
+	/*
+	 * Stop counting at zero: a post-decrement in the loop condition
+	 * wraps the counter on expiry and makes a timeout look like success.
+	 */
+	for (timeout = USEC_PER_SEC; timeout; timeout--) {
+		if (*park_magic == PARK_MAGIC)
+			break;
+		udelay(1);
+	}
+
+	/* Judge by the flag itself, not by how much time was left. */
+	parked = (*park_magic == PARK_MAGIC);
+	if (parked)
+		pr_debug("cpu %u park done.", cpu);
+	else
+		pr_err("cpu %u park failed.", cpu);
+
+	return parked;
+}
+
+/*
+ * Send the calling cpu into the park loop of @cpu's park section.
+ *
+ * Installs the idmap and calls __cpu_park through its physical address,
+ * passing the physical addresses of the section's text and of the section
+ * itself. Control is not expected to come back.
+ */
+static void cpu_park(unsigned int cpu)
+{
+	typeof(__cpu_park) *enter_park;
+	unsigned long section_pa;
+	unsigned long text_pa;
+
+	/* The exit slot is the first word, so it shares the section address. */
+	section_pa = cpu_park_section_p(cpu);
+	pr_debug("Go to park cpu %u exit address 0x%lx", cpu, section_pa);
+
+	/* The park text starts right behind the section header. */
+	text_pa = section_pa + sizeof(struct cpu_park_section);
+	enter_park = (void *)__pa_symbol(__cpu_park);
+
+	cpu_install_idmap();
+	enter_park(text_pa, section_pa);
+	unreachable();
+}
+#endif

/*
* Boot a secondary CPU, and assign it the specified idle task.
@@ -102,6 +255,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
const struct cpu_operations *ops = get_cpu_ops(cpu);

+#ifdef CONFIG_ARM64_CPU_PARK
+ if (write_park_exit(cpu) == 0)
+ return 0;
+#endif
if (ops->cpu_boot)
return ops->cpu_boot(cpu);

@@ -131,6 +288,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
return ret;
}

+#ifdef CONFIG_ARM64_CPU_PARK
+ uninstall_cpu_park(cpu);
+#endif
/*
* CPU was successfully started, wait for it to come online or
* time out.
@@ -843,10 +1003,31 @@ void arch_irq_work_raise(void)

static void local_cpu_stop(void)
{
+ int cpu;
+ const struct cpu_operations *ops = NULL;
+
set_cpu_online(smp_processor_id(), false);

local_daif_mask();
sdei_mask_local_cpu();
+
+#ifdef CONFIG_ARM64_CPU_PARK
+ /*
+ * Go to cpu park state.
+ * Otherwise go to cpu die.
+ */
+ cpu = smp_processor_id();
+ if (kexec_in_progress) {
+ machine_kexec_mask_interrupts();
+ if (park_start_v)
+ cpu_park(cpu);
+
+ ops = get_cpu_ops(cpu);
+ if (ops && ops->cpu_die)
+ ops->cpu_die(cpu);
+ }
+#endif
+
cpu_park_loop();
}

@@ -1052,6 +1233,45 @@ void smp_send_stop(void)
sdei_mask_local_cpu();
}

+#ifdef CONFIG_ARM64_CPU_PARK
+/*
+ * Ask every other online cpu to park ahead of kexec.
+ *
+ * Returns -EPERM, before touching any other cpu, when kexec is not in
+ * progress or the park memory cannot be mapped. Otherwise returns 0 after
+ * the stop IPI has been sent and each cpu has been waited for; the result
+ * of the individual waits does not change the return value.
+ */
+int kexec_smp_send_park(void)
+{
+	cpumask_t others;
+	unsigned long cpu;
+
+	if (WARN_ON(!kexec_in_progress)) {
+		pr_crit("%s called not in kexec progress.", __func__);
+		return -EPERM;
+	}
+
+	if (mmap_cpu_park_mem()) {
+		pr_info("no cpuparkmem, goto normal way.");
+		return -EPERM;
+	}
+
+	local_irq_disable();
+
+	/* Nobody else is online: only the local SDEI masking is left. */
+	if (num_online_cpus() <= 1)
+		goto mask_sdei;
+
+	/* Every online cpu except the one running this code. */
+	cpumask_copy(&others, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &others);
+
+	/* All sections are set up before the stop IPI goes out. */
+	for_each_cpu(cpu, &others)
+		install_cpu_park(cpu);
+	smp_cross_call(&others, IPI_CPU_STOP);
+
+	/* Wait for other CPUs to park */
+	for_each_cpu(cpu, &others)
+		cpu_wait_park(cpu);
+	pr_info("smp park other cpus done\n");
+
+mask_sdei:
+	sdei_mask_local_cpu();
+
+	return 0;
+}
+#endif
+
#ifdef CONFIG_KEXEC_CORE
void crash_smp_send_stop(void)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 69d4251..d25bb61 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -279,6 +279,58 @@ static void __init fdt_enforce_memory_region(void)
memblock_cap_memory_range(reg.base, reg.size);
}

+#ifdef CONFIG_ARM64_CPU_PARK
+/* Physical address of reserved park memory. */
+unsigned long park_start;
+/* Virtual address of reserved park memory. */
+unsigned long park_start_v;
+/* park reserve mem len should be PAGE_SIZE * NR_CPUS */
+unsigned long park_len = PAGE_SIZE * NR_CPUS;
+
+/*
+ * Handler for the "cpuparkmem" early parameter: stores the page-aligned
+ * value parsed from @p in park_start. Always returns 0.
+ */
+static int __init parse_park_mem(char *p)
+{
+	if (p) {
+		park_start = PAGE_ALIGN(memparse(p, NULL));
+		/* The raw parameter is logged when it yields address 0. */
+		if (!park_start)
+			pr_info("cpu park mem params[%s]", p);
+	}
+
+	return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+/*
+ * Set aside the park memory described by park_start and park_len.
+ *
+ * Returns 0 when nothing was requested or the region was set aside. When
+ * the region is not memory or overlaps a reservation, park_start and
+ * park_len are cleared and -EINVAL is returned.
+ */
+static int __init reserve_park_mem(void)
+{
+	if (!park_start || !park_len)
+		return 0;
+
+	park_start = PAGE_ALIGN(park_start);
+	park_len = PAGE_ALIGN(park_len);
+
+	if (!memblock_is_region_memory(park_start, park_len)) {
+		pr_warn("cannot reserve park mem: region is not memory!");
+	} else if (memblock_is_region_reserved(park_start, park_len)) {
+		pr_warn("cannot reserve park mem: region overlaps reserved memory!");
+	} else {
+		memblock_reserve(park_start, park_len);
+		memblock_remove(park_start, park_len);
+		pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)",
+			park_start, park_start + park_len, park_len >> 20);
+		return 0;
+	}
+
+	/* Unusable region: forget it so later code sees no park memory. */
+	park_start = 0;
+	park_len = 0;
+	return -EINVAL;
+}
+#endif
+
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
@@ -396,6 +448,10 @@ void __init arm64_memblock_init(void)
else
arm64_dma32_phys_limit = PHYS_MASK + 1;

+#ifdef CONFIG_ARM64_CPU_PARK
+ reserve_park_mem();
+#endif
+
reserve_elfcorehdr();

high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
--
2.9.5