Introduces memory hotplug functionality (hot-add) for arm64.
Changes v1->v2:
- swapper pgtable updated in place on hot add, avoiding unnecessary copy:
all changes are additive and non destructive.
- stop_machine used to updated swapper on hot add, avoiding races
- checking if pagealloc is under debug to stay coherent with mem_map
Signed-off-by: Maciej Bielski <m.bielski@xxxxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrea Reale <ar@xxxxxxxxxxxxxxxxxx>
---
arch/arm64/Kconfig | 12 ++++++
arch/arm64/configs/defconfig | 1 +
arch/arm64/include/asm/mmu.h | 3 ++
arch/arm64/mm/init.c | 87 ++++++++++++++++++++++++++++++++++++++++++++
arch/arm64/mm/mmu.c | 39 ++++++++++++++++++++
5 files changed, 142 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0df64a6..c736bba 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -641,6 +641,14 @@ config HOTPLUG_CPU
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
+
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ def_bool y
+ depends on !NUMA
+
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
@@ -715,6 +723,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
source "mm/Kconfig"
+config ARCH_MEMORY_PROBE
+ def_bool y
+ depends on MEMORY_HOTPLUG
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
---help---
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 34480e9..5fc5656 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -80,6 +80,7 @@ CONFIG_ARM64_VA_BITS_48=y
CONFIG_SCHED_MC=y
CONFIG_NUMA=y
CONFIG_PREEMPT=y
+CONFIG_MEMORY_HOTPLUG=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_CMA=y
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 0d34bf0..2b3fa4d 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -40,5 +40,8 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
pgprot_t prot, bool page_mappings_only);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
extern void mark_linear_text_alias_ro(void);
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern void hotplug_paging(phys_addr_t start, phys_addr_t size);
+#endif
#endif
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 5960bef..e96e7d3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -722,3 +722,90 @@ static int __init register_mem_limit_dumper(void)
return 0;
}
__initcall(register_mem_limit_dumper);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
+{
+ int ret;
+ u64 start_addr = start_pfn << PAGE_SHIFT;
+ /*
+ * Mark the first page in the range as unusable. This is needed
+ * because __add_section (within __add_pages) wants pfn_valid
+ * of it to be false, and in arm64 pfn falid is implemented by
+ * just checking at the nomap flag for existing blocks.
+ *
+ * A small trick here is that __add_section() requires only
+ * phys_start_pfn (that is the first pfn of a section) to be
+ * invalid. Regardless of whether it was assumed (by the function
+ * author) that all pfns within a section are either all valid
+ * or all invalid, it allows to avoid looping twice (once here,
+ * second when memblock_clear_nomap() is called) through all
+ * pfns of the section and modify only one pfn. Thanks to that,
+ * further, in __add_zone() only this very first pfn is skipped
+ * and corresponding page is not flagged reserved. Therefore it
+ * is enough to correct this setup only for it.
+ *
+ * When arch_add_memory() returns the walk_memory_range() function
+ * is called and passed with online_memory_block() callback,
+ * which execution finally reaches the memory_block_action()
+ * function, where also only the first pfn of a memory block is
+ * checked to be reserved. Above, it was first pfn of a section,
+ * here it is a block but
+ * (drivers/base/memory.c):
+ * sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+ * (include/linux/memory.h):
+ * #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
+ * so we can consider block and section equivalently
+ */
+ memblock_mark_nomap(start_addr, 1<<PAGE_SHIFT);
+ ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
+
+ /*
+ * Make the pages usable after they have been added.
+ * This will make pfn_valid return true
+ */
+ memblock_clear_nomap(start_addr, 1<<PAGE_SHIFT);
+
+ /*
+ * This is a hack to avoid having to mix arch specific code
+ * into arch independent code. SetPageReserved is supposed
+ * to be called by __add_zone (within __add_section, within
+ * __add_pages). However, when it is called there, it assumes that
+ * pfn_valid returns true. For the way pfn_valid is implemented
+ * in arm64 (a check on the nomap flag), the only way to make
+ * this evaluate true inside __add_zone is to clear the nomap
+ * flags of blocks in architecture independent code.
+ *
+ * To avoid this, we set the Reserved flag here after we cleared
+ * the nomap flag in the line above.
+ */
+ SetPageReserved(pfn_to_page(start_pfn));
+
+ return ret;
+}
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+ int ret;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+
+ if (end_pfn > max_sparsemem_pfn) {
+ pr_err("end_pfn too big");
+ return -1;
+ }
+ hotplug_paging(start, size);
+
+ ret = add_pages(nid, start_pfn, nr_pages, want_memblock);
+
+ if (ret)
+ pr_warn("%s: Problem encountered in __add_pages() ret=%d\n",
+ __func__, ret);
+
+ return ret;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index f1eb15e..d93043d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -28,6 +28,7 @@
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/stop_machine.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -615,6 +616,44 @@ void __init paging_init(void)
SWAPPER_DIR_SIZE - PAGE_SIZE);
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+/*
+ * hotplug_paging() is used by memory hotplug to build new page tables
+ * for hot added memory.
+ */
+
+struct mem_range {
+ phys_addr_t base;
+ phys_addr_t size;
+};
+
+static int __hotplug_paging(void *data)
+{
+ int flags = 0;
+ struct mem_range *section = data;
+
+ if (debug_pagealloc_enabled())
+ flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+
+ __create_pgd_mapping(swapper_pg_dir, section->base,
+ __phys_to_virt(section->base), section->size,
+ PAGE_KERNEL, pgd_pgtable_alloc, flags);
+
+ return 0;
+}
+
+inline void hotplug_paging(phys_addr_t start, phys_addr_t size)
+{
+ struct mem_range section = {
+ .base = start,
+ .size = size,
+ };
+
+ stop_machine(__hotplug_paging, §ion, NULL);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
/*
* Check whether a kernel address is valid (derived from arch/x86/).
*/