Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

From: Alex Ghiti
Date: Thu Jul 09 2020 - 07:11:28 EST


Hi Palmer,

Le 7/9/20 Ã 1:05 AM, Palmer Dabbelt a ÃcritÂ:
On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@xxxxxxxx wrote:
This is a preparatory patch for relocatable kernel.

The kernel used to be linked at PAGE_OFFSET address and used to be loaded
physically at the beginning of the main memory. Therefore, we could use
the linear mapping for the kernel mapping.

But the relocated kernel base address will be different from PAGE_OFFSET
and since in the linear mapping, two different virtual addresses cannot
point to the same physical address, the kernel mapping needs to lie outside
the linear mapping.

I know it's been a while, but I keep opening this up to review it and just
can't get over how ugly it is to put the kernel's linear map in the vmalloc
region.

I guess I don't understand why this is necessary at all. Specifically: why
can't we just relocate the kernel within the linear map? That would let the
bootloader put the kernel wherever it wants, modulo the physical memory size we
support. We'd need to handle the regions that are coupled to the kernel's
execution address, but we could just put them in an explicit memory region
which is what we should probably be doing anyway.

Virtual relocation in the linear mapping requires to move the kernel physically too. Zong implemented this physical move in its KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range.

In addition, having the kernel mapping in the linear mapping prevents the use of hugepage for the linear mapping resulting in performance loss (at least for the GB that encompasses the kernel).

Why do you find this "ugly" ? The vmalloc region is just a bunch of available virtual addresses to whatever purpose we want, and as noted by Zong, arm64 uses the same scheme.


In addition, because modules and BPF must be close to the kernel (inside
+-2GB window), the kernel is placed at the end of the vmalloc zone minus
2GB, which leaves room for modules and BPF. The kernel could not be
placed at the beginning of the vmalloc zone since other vmalloc
allocations from the kernel could get all the +-2GB window around the
kernel which would prevent new modules and BPF programs to be loaded.

Well, that's not enough to make sure this doesn't happen -- it's just enough to
make sure it doesn't happen very quickily. That's the same boat we're already
in, though, so it's not like it's worse.

Indeed, that's not worse, I haven't found a way to reserve vmalloc area without actually allocating it.


Signed-off-by: Alexandre Ghiti <alex@xxxxxxxx>
Reviewed-by: Zong Li <zong.li@xxxxxxxxxx>
---
Âarch/riscv/boot/loader.lds.SÂÂÂÂ |Â 3 +-
Âarch/riscv/include/asm/page.hÂÂÂ | 10 +++++-
Âarch/riscv/include/asm/pgtable.h | 38 ++++++++++++++-------
Âarch/riscv/kernel/head.SÂÂÂÂÂÂÂÂ |Â 3 +-
Âarch/riscv/kernel/module.cÂÂÂÂÂÂ |Â 4 +--
Âarch/riscv/kernel/vmlinux.lds.SÂ |Â 3 +-
Âarch/riscv/mm/init.cÂÂÂÂÂÂÂÂÂÂÂÂ | 58 +++++++++++++++++++++++++-------
Âarch/riscv/mm/physaddr.cÂÂÂÂÂÂÂÂ |Â 2 +-
Â8 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
index 47a5003c2e28..62d94696a19c 100644
--- a/arch/riscv/boot/loader.lds.S
+++ b/arch/riscv/boot/loader.lds.S
@@ -1,13 +1,14 @@
Â/* SPDX-License-Identifier: GPL-2.0 */

Â#include <asm/page.h>
+#include <asm/pgtable.h>

ÂOUTPUT_ARCH(riscv)
ÂENTRY(_start)

ÂSECTIONS
Â{
-ÂÂÂ . = PAGE_OFFSET;
+ÂÂÂ . = KERNEL_LINK_ADDR;

ÂÂÂÂ .payload : {
ÂÂÂÂÂÂÂÂ *(.payload)
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 2d50f76efe48..48bb09b6a9b7 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -90,18 +90,26 @@ typedef struct page *pgtable_t;

Â#ifdef CONFIG_MMU
Âextern unsigned long va_pa_offset;
+extern unsigned long va_kernel_pa_offset;
Âextern unsigned long pfn_base;
Â#define ARCH_PFN_OFFSETÂÂÂÂÂÂÂ (pfn_base)
Â#else
Â#define va_pa_offsetÂÂÂÂÂÂÂ 0
+#define va_kernel_pa_offsetÂÂÂ 0
Â#define ARCH_PFN_OFFSETÂÂÂÂÂÂÂ (PAGE_OFFSET >> PAGE_SHIFT)
Â#endif /* CONFIG_MMU */

Âextern unsigned long max_low_pfn;
Âextern unsigned long min_low_pfn;
+extern unsigned long kernel_virt_addr;

Â#define __pa_to_va_nodebug(x)ÂÂÂ ((void *)((unsigned long) (x) + va_pa_offset))
-#define __va_to_pa_nodebug(x)ÂÂÂ ((unsigned long)(x) - va_pa_offset)
+#define linear_mapping_va_to_pa(x)ÂÂÂ ((unsigned long)(x) - va_pa_offset)
+#define kernel_mapping_va_to_pa(x)ÂÂÂ \
+ÂÂÂ ((unsigned long)(x) - va_kernel_pa_offset)
+#define __va_to_pa_nodebug(x)ÂÂÂÂÂÂÂ \
+ÂÂÂ (((x) >= PAGE_OFFSET) ?ÂÂÂÂÂÂÂ \
+ÂÂÂÂÂÂÂ linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))

Â#ifdef CONFIG_DEBUG_VIRTUAL
Âextern phys_addr_t __virt_to_phys(unsigned long x);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 35b60035b6b0..94ef3b49dfb6 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -11,23 +11,29 @@

Â#include <asm/pgtable-bits.h>

-#ifndef __ASSEMBLY__
-
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include <linux/mm_types.h>
-
-#ifdef CONFIG_MMU
+#ifndef CONFIG_MMU
+#define KERNEL_VIRT_ADDRÂÂÂ PAGE_OFFSET
+#define KERNEL_LINK_ADDRÂÂÂ PAGE_OFFSET
+#else
+/*
+ * Leave 2GB for modules and BPF that must lie within a 2GB range around
+ * the kernel.
+ */
+#define KERNEL_VIRT_ADDRÂÂÂ (VMALLOC_END - SZ_2G + 1)
+#define KERNEL_LINK_ADDRÂÂÂ KERNEL_VIRT_ADDR

At a bare minimum this is going to make a mess of the 32-bit port, as
non-relocatable kernels are now going to get linked at 1GiB which is where user
code is supposed to live. That's an easy fix, though, as the 32-bit stuff
doesn't need any module address restrictions.

Indeed, I will take a look at that.


Â#define VMALLOC_SIZEÂÂÂÂ (KERN_VIRT_SIZE >> 1)
Â#define VMALLOC_ENDÂÂÂÂÂ (PAGE_OFFSET - 1)
Â#define VMALLOC_STARTÂÂÂ (PAGE_OFFSET - VMALLOC_SIZE)

Â#define BPF_JIT_REGION_SIZEÂÂÂ (SZ_128M)
-#define BPF_JIT_REGION_STARTÂÂÂ (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
-#define BPF_JIT_REGION_ENDÂÂÂ (VMALLOC_END)
+#define BPF_JIT_REGION_STARTÂÂÂ PFN_ALIGN((unsigned long)&_end)
+#define BPF_JIT_REGION_ENDÂÂÂ (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+
+#ifdef CONFIG_64BIT
+#define VMALLOC_MODULE_STARTÂÂÂ BPF_JIT_REGION_END
+#define VMALLOC_MODULE_ENDÂÂÂ (((unsigned long)&_start & PAGE_MASK) + SZ_2G)
+#endif

Â/*
 * Roughly size the vmemmap space to be large enough to fit enough
@@ -57,9 +63,16 @@
Â#define FIXADDR_SIZEÂÂÂÂ PGDIR_SIZE
Â#endif
Â#define FIXADDR_STARTÂÂÂ (FIXADDR_TOP - FIXADDR_SIZE)
-
Â#endif

+#ifndef __ASSEMBLY__
+
+/* Page Upper Directory not used in RISC-V */
+#include <asm-generic/pgtable-nopud.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <linux/mm_types.h>
+
Â#ifdef CONFIG_64BIT
Â#include <asm/pgtable-64.h>
Â#else
@@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl

Â#define kern_addr_valid(addr)ÂÂ (1) /* FIXME */

+extern char _start[];
Âextern void *dtb_early_va;
Âvoid setup_bootmem(void);
Âvoid paging_init(void);
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 98a406474e7d..8f5bb7731327 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -49,7 +49,8 @@ ENTRY(_start)
Â#ifdef CONFIG_MMU
Ârelocate:
ÂÂÂÂ /* Relocate return address */
-ÂÂÂ li a1, PAGE_OFFSET
+ÂÂÂ la a1, kernel_virt_addr
+ÂÂÂ REG_L a1, 0(a1)
ÂÂÂÂ la a2, _start
ÂÂÂÂ sub a1, a1, a2
ÂÂÂÂ add ra, ra, a1
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 8bbe5dbe1341..1a8fbe05accf 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
Â}

Â#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-ÂÂÂÂ max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
Âvoid *module_alloc(unsigned long size)
Â{
ÂÂÂÂ return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ VMALLOC_END, GFP_KERNEL,
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ VMALLOC_MODULE_END, GFP_KERNEL,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ __builtin_return_address(0));
Â}
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 0339b6bbe11a..a9abde62909f 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -4,7 +4,8 @@
 * Copyright (C) 2017 SiFive
 */

-#define LOAD_OFFSET PAGE_OFFSET
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
Â#include <asm/vmlinux.lds.h>
Â#include <asm/page.h>
Â#include <asm/cache.h>
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 736de6c8739f..71da78914645 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -22,6 +22,9 @@

Â#include "../kernel/head.h"

+unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
+EXPORT_SYMBOL(kernel_virt_addr);
+
Âunsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ __page_aligned_bss;
ÂEXPORT_SYMBOL(empty_zero_page);
@@ -178,8 +181,12 @@ void __init setup_bootmem(void)
Â}

Â#ifdef CONFIG_MMU
+/* Offset between linear mapping virtual address and kernel load address */
Âunsigned long va_pa_offset;
ÂEXPORT_SYMBOL(va_pa_offset);
+/* Offset between kernel mapping virtual address and kernel load address */
+unsigned long va_kernel_pa_offset;
+EXPORT_SYMBOL(va_kernel_pa_offset);
Âunsigned long pfn_base;
ÂEXPORT_SYMBOL(pfn_base);

@@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
ÂÂÂÂ if (mmu_enabled)
ÂÂÂÂÂÂÂÂ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

-ÂÂÂ pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
+ÂÂÂ pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
ÂÂÂÂ BUG_ON(pmd_num >= NUM_EARLY_PMDS);
ÂÂÂÂ return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
Â}
@@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
Â#error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
Â#endif

+static uintptr_t load_pa, load_sz;
+
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+ÂÂÂ uintptr_t va, end_va;
+
+ÂÂÂ end_va = kernel_virt_addr + load_sz;
+ÂÂÂ for (va = kernel_virt_addr; va < end_va; va += map_size)
+ÂÂÂÂÂÂÂ create_pgd_mapping(pgdir, va,
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ load_pa + (va - kernel_virt_addr),
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ map_size, PAGE_KERNEL_EXEC);
+}
+
Âasmlinkage void __init setup_vm(uintptr_t dtb_pa)
Â{
ÂÂÂÂ uintptr_t va, end_va;
-ÂÂÂ uintptr_t load_pa = (uintptr_t)(&_start);
-ÂÂÂ uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
ÂÂÂÂ uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);

+ÂÂÂ load_pa = (uintptr_t)(&_start);
+ÂÂÂ load_sz = (uintptr_t)(&_end) - load_pa;
+
ÂÂÂÂ va_pa_offset = PAGE_OFFSET - load_pa;
+ÂÂÂ va_kernel_pa_offset = kernel_virt_addr - load_pa;
+
ÂÂÂÂ pfn_base = PFN_DOWN(load_pa);

ÂÂÂÂ /*
@@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
ÂÂÂÂ create_pmd_mapping(fixmap_pmd, FIXADDR_START,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
ÂÂÂÂ /* Setup trampoline PGD and PMD */
-ÂÂÂ create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+ÂÂÂ create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
-ÂÂÂ create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+ÂÂÂ create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
Â#else
ÂÂÂÂ /* Setup trampoline PGD */
-ÂÂÂ create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+ÂÂÂ create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
Â#endif

ÂÂÂÂ /*
-ÂÂÂÂ * Setup early PGD covering entire kernel which will allows
+ÂÂÂÂ * Setup early PGD covering entire kernel which will allow
ÂÂÂÂÂ * us to reach paging_init(). We map all memory banks later
ÂÂÂÂÂ * in setup_vm_final() below.
ÂÂÂÂÂ */
-ÂÂÂ end_va = PAGE_OFFSET + load_sz;
-ÂÂÂ for (va = PAGE_OFFSET; va < end_va; va += map_size)
-ÂÂÂÂÂÂÂ create_pgd_mapping(early_pg_dir, va,
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ load_pa + (va - PAGE_OFFSET),
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ map_size, PAGE_KERNEL_EXEC);
+ÂÂÂ create_kernel_page_table(early_pg_dir, map_size);

ÂÂÂÂ /* Create fixed mapping for early FDT parsing */
ÂÂÂÂ end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
@@ -441,6 +460,7 @@ static void __init setup_vm_final(void)
ÂÂÂÂ uintptr_t va, map_size;
ÂÂÂÂ phys_addr_t pa, start, end;
ÂÂÂÂ struct memblock_region *reg;
+ÂÂÂ static struct vm_struct vm_kernel = { 0 };

ÂÂÂÂ /* Set mmu_enabled flag */
ÂÂÂÂ mmu_enabled = true;
@@ -467,10 +487,22 @@ static void __init setup_vm_final(void)
ÂÂÂÂÂÂÂÂ for (pa = start; pa < end; pa += map_size) {
ÂÂÂÂÂÂÂÂÂÂÂÂ va = (uintptr_t)__va(pa);
ÂÂÂÂÂÂÂÂÂÂÂÂ create_pgd_mapping(swapper_pg_dir, va, pa,
-ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ map_size, PAGE_KERNEL_EXEC);
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ map_size, PAGE_KERNEL);
ÂÂÂÂÂÂÂÂ }
ÂÂÂÂ }

+ÂÂÂ /* Map the kernel */
+ÂÂÂ create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
+
+ÂÂÂ /* Reserve the vmalloc area occupied by the kernel */
+ÂÂÂ vm_kernel.addr = (void *)kernel_virt_addr;
+ÂÂÂ vm_kernel.phys_addr = load_pa;
+ÂÂÂ vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
+ÂÂÂ vm_kernel.flags = VM_MAP | VM_NO_GUARD;
+ÂÂÂ vm_kernel.caller = __builtin_return_address(0);
+
+ÂÂÂ vm_area_add_early(&vm_kernel);
+
ÂÂÂÂ /* Clear fixmap PTE and PMD mappings */
ÂÂÂÂ clear_fixmap(FIX_PTE);
ÂÂÂÂ clear_fixmap(FIX_PMD);
diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
index e8e4dcd39fed..35703d5ef5fd 100644
--- a/arch/riscv/mm/physaddr.c
+++ b/arch/riscv/mm/physaddr.c
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);

Âphys_addr_t __phys_addr_symbol(unsigned long x)
Â{
-ÂÂÂ unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
+ÂÂÂ unsigned long kernel_start = (unsigned long)kernel_virt_addr;
ÂÂÂÂ unsigned long kernel_end = (unsigned long)_end;

ÂÂÂÂ /*

Alex