Re: [PATCH RESEND v2] riscv: Introduce CONFIG_RELOCATABLE

From: Alex Ghiti
Date: Sat Mar 07 2020 - 05:20:51 EST


Hi Palmer,

On 3/6/20 12:58 PM, Palmer Dabbelt wrote:
On Mon, 02 Mar 2020 21:44:37 PST (-0800), alex@xxxxxxxx wrote:
This config allows the kernel to be compiled as a PIE and relocated at
any virtual address at runtime: this paves the way to KASLR and to
4-level page table folding at runtime. Runtime relocation is possible
since the relocation metadata is embedded into the kernel.

Note that relocating at runtime introduces an overhead even if the
kernel is loaded at the same address it was linked at. The compiler
options are those used by arm64, which uses the same RELA relocation
format.

Signed-off-by: Alexandre Ghiti <alex@xxxxxxxx>
Reviewed-by: Zong Li <zong.li@xxxxxxxxxx>
Reviewed-by: Anup Patel <anup@xxxxxxxxxxxxxx>
Tested-by: Zong Li <zong.li@xxxxxxxxxx>
---
Changes in v2:
- Make RELOCATABLE depend on MMU as suggested by Anup
- Rename kernel_load_addr into kernel_virt_addr as suggested by Anup
- Use __pa_symbol instead of __pa, as suggested by Zong
- Rebased on top of v5.6-rc3
- Tested with sv48 patchset
- Add Reviewed/Tested-by from Zong and Anup

 arch/riscv/Kconfig              | 12 +++++
 arch/riscv/Makefile             |  5 +-
 arch/riscv/boot/loader.lds.S    |  2 +-
 arch/riscv/include/asm/page.h   |  5 +-
 arch/riscv/kernel/head.S        |  3 +-
 arch/riscv/kernel/vmlinux.lds.S | 10 ++--
 arch/riscv/mm/Makefile          |  4 ++
 arch/riscv/mm/init.c            | 92 ++++++++++++++++++++++++++++-----
 8 files changed, 111 insertions(+), 22 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 73f029eae0cc..f5f3d474504d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -163,6 +163,18 @@ config PGTABLE_LEVELS
 	default 3 if 64BIT
 	default 2

+config RELOCATABLE
+	bool
+	depends on MMU
+	help
+	  This builds a kernel as a Position Independent Executable (PIE),
+	  which retains all relocation metadata required to relocate the
+	  kernel binary at runtime to a different virtual address than the
+	  address it was linked at.
+	  Since RISC-V uses the RELA relocation format, this requires a
+	  relocation pass at runtime even if the kernel is loaded at the
+	  same address it was linked at.
+
 source "arch/riscv/Kconfig.socs"

 menu "Platform type"
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index b9009a2fbaf5..5a115cf6a9c1 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -9,7 +9,10 @@
 #

 OBJCOPYFLAGS	:= -O binary
-LDFLAGS_vmlinux :=
+ifeq ($(CONFIG_RELOCATABLE),y)
+LDFLAGS_vmlinux := -shared -Bsymbolic -z notext -z norelro
+KBUILD_CFLAGS += -fPIE
+endif
 ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
 	LDFLAGS_vmlinux := --no-relax
 endif
diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
index 47a5003c2e28..a9ed218171aa 100644
--- a/arch/riscv/boot/loader.lds.S
+++ b/arch/riscv/boot/loader.lds.S
@@ -7,7 +7,7 @@ ENTRY(_start)

 SECTIONS
 {
-	. = PAGE_OFFSET;
+	. = CONFIG_PAGE_OFFSET;

 	.payload : {
 		*(.payload)
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8ca1930caa44..af5810f9aebd 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,9 +31,9 @@
  * When not using MMU this corresponds to the first free page in
  * physical memory (aligned on a page boundary).
  */
-#define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
+#define PAGE_OFFSET		kernel_virt_addr

I assume we want to keep PAGE_OFFSET a constant for the non-relocatable
systems. As it currently stands this is imposing a performance hit even
when CONFIG_RELOCATABLE is disabled.

Ok I can do that, thanks.
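
Something like this, for instance (just a sketch, the final form may
differ):

#ifdef CONFIG_RELOCATABLE
#define PAGE_OFFSET		kernel_virt_addr
#else
#define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
#endif

so that non-relocatable kernels keep the compile-time constant and do
not pay the extra memory load.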

-#define KERN_VIRT_SIZE (-PAGE_OFFSET)
+#define KERN_VIRT_SIZE		(-_AC(CONFIG_PAGE_OFFSET, UL))

This seems like it would cause issues if the kernel is relocated to high enough
addresses that "kernel_virt_addr+KERN_VIRT_SIZE" overflows.

KERN_VIRT_SIZE is only used to define the size of the VMALLOC zone, and the VMALLOC zone must be aligned on 2MB (the size of a PMD page): so I simply used CONFIG_PAGE_OFFSET, which guarantees that (since it is aligned on 1GB).

But I should define KERN_VIRT_SIZE using kernel_virt_addr and make sure the VMALLOC zone is aligned on 2MB.

Ok I'll fix that in v3, thanks.
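
Something along these lines (a rough sketch, assuming the generic
ALIGN_DOWN and SZ_2M helpers):

/*
 * Size of the virtual area above kernel_virt_addr, rounded down to a
 * 2MB (PMD size) boundary so that the vmalloc zone stays PMD-aligned
 * even when kernel_virt_addr itself is not 1GB-aligned.
 */
#define KERN_VIRT_SIZE	ALIGN_DOWN(-kernel_virt_addr, SZ_2M)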


 #ifndef __ASSEMBLY__

@@ -97,6 +97,7 @@ extern unsigned long pfn_base;
 #define ARCH_PFN_OFFSET		(PAGE_OFFSET >> PAGE_SHIFT)
 #endif /* CONFIG_MMU */

+extern unsigned long kernel_virt_addr;
 extern unsigned long max_low_pfn;
 extern unsigned long min_low_pfn;

diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 271860fc2c3f..d792912c2da3 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -131,7 +131,8 @@ clear_bss_done:
 #ifdef CONFIG_MMU
 relocate:
 	/* Relocate return address */
-	li a1, PAGE_OFFSET
+	la a1, kernel_virt_addr
+	REG_L a1, 0(a1)
 	la a2, _start
 	sub a1, a1, a2
 	add ra, ra, a1
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 1e0193ded420..5bf69e9b91e6 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -4,7 +4,7 @@
 * Copyright (C) 2017 SiFive
 */

-#define LOAD_OFFSET PAGE_OFFSET
+#define LOAD_OFFSET CONFIG_PAGE_OFFSET
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/page.h>
 #include <asm/cache.h>
@@ -71,9 +71,11 @@ SECTIONS

 	EXCEPTION_TABLE(0x10)

-	.rel.dyn : {
-		*(.rel.dyn*)
-	}
+        .rela.dyn : ALIGN(8) {
+        __rela_dyn_start = .;
+                *(.rela .rela*)
+        __rela_dyn_end = .;
+        }

It looks like the indentation is screwed up here: I see a mix of tabs/spaces
that doesn't match the rest of the file.


Thanks, I'll fix it.
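
For reference, the fixed version will simply use tabs like the rest of
the file, i.e. something like:

+	.rela.dyn : ALIGN(8) {
+		__rela_dyn_start = .;
+		*(.rela .rela*)
+		__rela_dyn_end = .;
+	}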


 	_end = .;

diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index 50b7af58c566..27593d362248 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only

 CFLAGS_init.o := -mcmodel=medany
+ifdef CONFIG_RELOCATABLE
+CFLAGS_init.o += -fno-pie
+endif
+
 ifdef CONFIG_FTRACE
 CFLAGS_REMOVE_init.o = -pg
 endif
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 965a8cf4829c..428aee2669aa 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -12,6 +12,9 @@
 #include <linux/sizes.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#ifdef CONFIG_RELOCATABLE
+#include <linux/elf.h>
+#endif

 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
@@ -28,6 +31,9 @@ EXPORT_SYMBOL(empty_zero_page);
 extern char _start[];
 void *dtb_early_va;

+unsigned long kernel_virt_addr = _AC(CONFIG_PAGE_OFFSET, UL);
+EXPORT_SYMBOL(kernel_virt_addr);
+
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, };
@@ -132,7 +138,8 @@ void __init setup_bootmem(void)
 		phys_addr_t end = reg->base + reg->size;

 		if (reg->base <= vmlinux_end && vmlinux_end <= end) {
-			mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
+			mem_size = min(reg->size,
+				       (phys_addr_t)-kernel_virt_addr);

PAGE_OFFSET is kernel_virt_addr, so I don't see any reason to change these --
they account for a significant fraction of the diff.

You are totally right, those are leftovers from a previous version. Thanks for catching that, that greatly simplifies this patch.


 			/*
 			 * Remove memblock from the end of usable area to the
@@ -269,7 +276,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
 	if (mmu_enabled)
 		return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

-	pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
+	pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
 	BUG_ON(pmd_num >= NUM_EARLY_PMDS);
 	return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
 }
@@ -370,6 +377,54 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
 #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
 #endif

+#ifdef CONFIG_RELOCATABLE
+extern unsigned long __rela_dyn_start, __rela_dyn_end;
+
+#ifdef CONFIG_64BIT
+#define Elf_Rela Elf64_Rela
+#define Elf_Addr Elf64_Addr
+#else
+#define Elf_Rela Elf32_Rela
+#define Elf_Addr Elf32_Addr
+#endif
+
+void __init relocate_kernel(uintptr_t load_pa)
+{
+	Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start;
+	uintptr_t link_addr = _AC(CONFIG_PAGE_OFFSET, UL);
+	/*
+	 * This holds the offset between the linked virtual address and the
+	 * relocated virtual address.
+	 */
+	uintptr_t reloc_offset = kernel_virt_addr - link_addr;
+	/*
+	 * This holds the offset between linked virtual address and physical
+	 * address whereas va_pa_offset holds the offset between relocated
+	 * virtual address and physical address.
+	 */
+	uintptr_t va_link_pa_offset = link_addr - load_pa;
+
+	for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) {
+		Elf_Addr addr = (rela->r_offset - va_link_pa_offset);
+		Elf_Addr relocated_addr = rela->r_addend;
+
+		if (rela->r_info != R_RISCV_RELATIVE)
+			continue;

This should at least provide a warning when it encounters an unresolvable
relocation. As it currently stands this just ignores all other runtime
relocations, and while I can buy the argument that there shouldn't be any
(though I'd expect R_RISCV_{32,64} to show up?), we certainly shouldn't
just silently skip them.


It's hard to warn that early in the boot process: what about doing what powerpc does and warning at compile time? In addition, it would be better that way since we can eliminate false positives as in 43e76cd368fb (because the only R_RISCV_64 relocations there are indeed false positives).
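
If a runtime diagnostic is still wanted, one option (just a sketch, the
names are made up) would be to count the unexpected relocations in
relocate_kernel() and report them once the console is up:

/* Hypothetical counter, filled in by relocate_kernel(). */
static int __initdata skipped_relocs;

/* In the relocation loop, instead of silently continuing: */
if (rela->r_info != R_RISCV_RELATIVE) {
	skipped_relocs++;
	continue;
}

/* Later, e.g. from setup_arch() once printk() works: */
if (skipped_relocs)
	pr_warn("%d unexpected relocations skipped at early boot\n",
		skipped_relocs);

But I agree a compile-time check in the powerpc style is nicer since it
catches the problem before the kernel ever boots.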

+
+		/*
+		 * Make sure to not relocate vdso symbols like rt_sigreturn
+		 * which are linked from the address 0 in vmlinux since
+		 * vdso symbol addresses are actually used as an offset from
+		 * mm->context.vdso in VDSO_OFFSET macro.
+		 */
+		if (relocated_addr >= link_addr)
+			relocated_addr += reloc_offset;
+
+		*(Elf_Addr *)addr = relocated_addr;
+	}
+}
+#endif
+
 asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 {
 	uintptr_t va, end_va;
@@ -377,9 +432,20 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
 	uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);

-	va_pa_offset = PAGE_OFFSET - load_pa;
+	va_pa_offset = kernel_virt_addr - load_pa;
 	pfn_base = PFN_DOWN(load_pa);

+#ifdef CONFIG_RELOCATABLE
+	/*
+	 * Early page table uses only one PGDIR, which makes it possible
+	 * to map 1GB aligned on 1GB: if the relocation offset makes the kernel
+	 * cross over a 1G boundary, raise a bug since a part of the kernel
+	 * would not get mapped.
+	 */
+	BUG_ON(SZ_1G - (kernel_virt_addr & (SZ_1G - 1)) < load_sz);
+	relocate_kernel(load_pa);
+#endif
+
 	/*
 	 * Enforce boot alignment requirements of RV32 and
 	 * RV64 by only allowing PMD or PGD mappings.
@@ -387,7 +453,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	BUG_ON(map_size == PAGE_SIZE);

 	/* Sanity check alignment and size */
-	BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
+	BUILD_BUG_ON((_AC(CONFIG_PAGE_OFFSET, UL) % PGDIR_SIZE) != 0);
 	BUG_ON((load_pa % map_size) != 0);
 	BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);

@@ -400,13 +466,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	create_pmd_mapping(fixmap_pmd, FIXADDR_START,
 			   (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
 	/* Setup trampoline PGD and PMD */
-	create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+	create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
 			   (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
-	create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+	create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
 			   load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
 #else
 	/* Setup trampoline PGD */
-	create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+	create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
 			   load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
 #endif

@@ -415,10 +481,10 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 	 * us to reach paging_init(). We map all memory banks later
 	 * in setup_vm_final() below.
 	 */
-	end_va = PAGE_OFFSET + load_sz;
-	for (va = PAGE_OFFSET; va < end_va; va += map_size)
+	end_va = kernel_virt_addr + load_sz;
+	for (va = kernel_virt_addr; va < end_va; va += map_size)
 		create_pgd_mapping(early_pg_dir, va,
-				   load_pa + (va - PAGE_OFFSET),
+				   load_pa + (va - kernel_virt_addr),
 				   map_size, PAGE_KERNEL_EXEC);

 	/* Create fixed mapping for early FDT parsing */
@@ -457,9 +523,9 @@ static void __init setup_vm_final(void)
 			break;
 		if (memblock_is_nomap(reg))
 			continue;
-		if (start <= __pa(PAGE_OFFSET) &&
-		    __pa(PAGE_OFFSET) < end)
-			start = __pa(PAGE_OFFSET);
+		if (start <= __pa_symbol(kernel_virt_addr) &&
+		    __pa(kernel_virt_addr) < end)
+			start = __pa_symbol(kernel_virt_addr);

 		map_size = best_map_size(start, end - start);
 		for (pa = start; pa < end; pa += map_size) {

Thanks Palmer,

Alex