Re: [PATCH V4 4/4] arm64/mm: Enable memory hot remove
From: David Hildenbrand
Date: Tue May 21 2019 - 06:22:43 EST
On 20.05.19 07:18, Anshuman Khandual wrote:
> The arch code for hot-remove must tear down portions of the linear map and
> vmemmap corresponding to memory being removed. In both cases the page
> tables mapping these regions must be freed, and when sparse vmemmap is in
> use the memory backing the vmemmap must also be freed.
>
> This patch adds a new remove_pagetable() helper which can be used to tear
> down either region, and calls it from vmemmap_free() and
> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
> backing memory will be freed.
>
> While freeing intermediate level page table pages bail out if any of it's
> entries are still valid. This can happen for partially filled kernel page
> table either from a previously attempted failed memory hot add or while
> removing an address range which does not span the entire page table page
> range.
>
> The vmemmap region may share levels of table with the vmalloc region. Take
> the kernel ptl so that we can safely free potentially-shared tables.
>
> While here update arch_add_memory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture.
>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@xxxxxxx>
> ---
> arch/arm64/Kconfig | 3 +
> arch/arm64/mm/mmu.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 213 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 4780eb7..ce24427 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -267,6 +267,9 @@ config HAVE_GENERIC_GUP
> config ARCH_ENABLE_MEMORY_HOTPLUG
> def_bool y
>
> +config ARCH_ENABLE_MEMORY_HOTREMOVE
> + def_bool y
> +
> config SMP
> def_bool y
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index a1bfc44..0cf0d41 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -733,6 +733,187 @@ int kern_addr_valid(unsigned long addr)
>
> return pfn_valid(pte_pfn(pte));
> }
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_hotplug_page_range(struct page *page, ssize_t size)
> +{
> + WARN_ON(PageReserved(page));
> + free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void free_hotplug_pgtable_page(struct page *page)
> +{
> + free_hotplug_page_range(page, PAGE_SIZE);
> +}
> +
> +static void free_pte_table(pte_t *ptep, pmd_t *pmdp, unsigned long addr)
> +{
> + struct page *page;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + if (!pte_none(ptep[i]))
> + return;
> + }
> +
> + page = pmd_page(READ_ONCE(*pmdp));
> + pmd_clear(pmdp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> +}
> +
> +static void free_pmd_table(pmd_t *pmdp, pud_t *pudp, unsigned long addr)
> +{
> + struct page *page;
> + int i;
> +
> + if (CONFIG_PGTABLE_LEVELS <= 2)
> + return;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + if (!pmd_none(pmdp[i]))
> + return;
> + }
> +
> + page = pud_page(READ_ONCE(*pudp));
> + pud_clear(pudp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> +}
> +
> +static void free_pud_table(pud_t *pudp, pgd_t *pgdp, unsigned long addr)
> +{
> + struct page *page;
> + int i;
> +
> + if (CONFIG_PGTABLE_LEVELS <= 3)
> + return;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + if (!pud_none(pudp[i]))
> + return;
> + }
> +
> + page = pgd_page(READ_ONCE(*pgdp));
> + pgd_clear(pgdp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> +}
> +
> +static void
> +remove_pte_table(pmd_t *pmdp, unsigned long addr,
> + unsigned long end, bool sparse_vmap)
> +{
> + struct page *page;
> + pte_t *ptep, pte;
> + unsigned long start = addr;
> +
> + for (; addr < end; addr += PAGE_SIZE) {
> + ptep = pte_offset_kernel(pmdp, addr);
> + pte = READ_ONCE(*ptep);
> +
> + if (pte_none(pte))
> + continue;
> +
> + WARN_ON(!pte_present(pte));
> + if (sparse_vmap) {
> + page = pte_page(pte);
> + free_hotplug_page_range(page, PAGE_SIZE);
> + }
> + pte_clear(&init_mm, addr, ptep);
> + }
> + flush_tlb_kernel_range(start, end);
> +}
> +
> +static void
> +remove_pmd_table(pud_t *pudp, unsigned long addr,
> + unsigned long end, bool sparse_vmap)
> +{
> + unsigned long next;
> + struct page *page;
> + pte_t *ptep_base;
> + pmd_t *pmdp, pmd;
> +
> + for (; addr < end; addr = next) {
> + next = pmd_addr_end(addr, end);
> + pmdp = pmd_offset(pudp, addr);
> + pmd = READ_ONCE(*pmdp);
> +
> + if (pmd_none(pmd))
> + continue;
> +
> + WARN_ON(!pmd_present(pmd));
> + if (pmd_sect(pmd)) {
> + if (sparse_vmap) {
> + page = pmd_page(pmd);
> + free_hotplug_page_range(page, PMD_SIZE);
> + }
> + pmd_clear(pmdp);
> + continue;
> + }
> + ptep_base = pte_offset_kernel(pmdp, 0UL);
> + remove_pte_table(pmdp, addr, next, sparse_vmap);
> + free_pte_table(ptep_base, pmdp, addr);
> + }
> +}
> +
> +static void
> +remove_pud_table(pgd_t *pgdp, unsigned long addr,
> + unsigned long end, bool sparse_vmap)
> +{
> + unsigned long next;
> + struct page *page;
> + pmd_t *pmdp_base;
> + pud_t *pudp, pud;
> +
> + for (; addr < end; addr = next) {
> + next = pud_addr_end(addr, end);
> + pudp = pud_offset(pgdp, addr);
> + pud = READ_ONCE(*pudp);
> +
> + if (pud_none(pud))
> + continue;
> +
> + WARN_ON(!pud_present(pud));
> + if (pud_sect(pud)) {
> + if (sparse_vmap) {
> + page = pud_page(pud);
> + free_hotplug_page_range(page, PUD_SIZE);
> + }
> + pud_clear(pudp);
> + continue;
> + }
> + pmdp_base = pmd_offset(pudp, 0UL);
> + remove_pmd_table(pudp, addr, next, sparse_vmap);
> + free_pmd_table(pmdp_base, pudp, addr);
> + }
> +}
> +
> +static void
> +remove_pagetable(unsigned long start, unsigned long end, bool sparse_vmap)
> +{
> + unsigned long addr, next;
> + pud_t *pudp_base;
> + pgd_t *pgdp, pgd;
> +
> + spin_lock(&init_mm.page_table_lock);
> + for (addr = start; addr < end; addr = next) {
> + next = pgd_addr_end(addr, end);
> + pgdp = pgd_offset_k(addr);
> + pgd = READ_ONCE(*pgdp);
> +
> + if (pgd_none(pgd))
> + continue;
> +
> + WARN_ON(!pgd_present(pgd));
> + pudp_base = pud_offset(pgdp, 0UL);
> + remove_pud_table(pgdp, addr, next, sparse_vmap);
> + free_pud_table(pudp_base, pgdp, addr);
> + }
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +#endif
> +
> #ifdef CONFIG_SPARSEMEM_VMEMMAP
> #if !ARM64_SWAPPER_USES_SECTION_MAPS
> int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> @@ -780,6 +961,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
> void vmemmap_free(unsigned long start, unsigned long end,
> struct vmem_altmap *altmap)
> {
> +#ifdef CONFIG_MEMORY_HOTPLUG
> + remove_pagetable(start, end, true);
> +#endif
> }
> #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>
> @@ -1070,10 +1254,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
> +{
> + WARN_ON(pgdir != init_mm.pgd);
> + remove_pagetable(start, start + size, false);
> +}
> +
> int arch_add_memory(int nid, u64 start, u64 size,
> struct mhp_restrictions *restrictions)
> {
> - int flags = 0;
> + int ret, flags = 0;
>
> if (rodata_full || debug_pagealloc_enabled())
> flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -1081,7 +1271,25 @@ int arch_add_memory(int nid, u64 start, u64 size,
> __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
> size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>
> - return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> + ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> restrictions);
> + if (ret)
> + __remove_pgd_mapping(swapper_pg_dir,
> + __phys_to_virt(start), size);
Nit: Indentation of the parameters looks really weird.
> + return ret;
> +}
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +void arch_remove_memory(int nid, u64 start, u64 size,
> + struct vmem_altmap *altmap)
> +{
> + unsigned long start_pfn = start >> PAGE_SHIFT;
> + unsigned long nr_pages = size >> PAGE_SHIFT;
> + struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +
> + __remove_pages(zone, start_pfn, nr_pages, altmap);
> + __remove_pgd_mapping(swapper_pg_dir,
> + __phys_to_virt(start), size);
Dito, indentation of the parameters.
For these two changes (arch_*_memory)
Acked-by: David Hildenbrand <david@xxxxxxxxxx>
--
Thanks,
David / dhildenb