[PATCH v3 4/4] RISC-V: Allow booting kernel from any 4KB aligned address

From: Anup Patel
Date: Mon Mar 25 2019 - 05:23:24 EST


Currently, we have to boot RISCV64 kernel from a 2MB aligned physical
address and RISCV32 kernel from a 4MB aligned physical address. This
constraint is because initial pagetable setup (i.e. setup_vm()) maps
entire RAM using hugepages (i.e. 2MB for 3-level pagetable and 4MB for
2-level pagetable).

Further, the above booting constraint also results in memory wastage
because if we boot kernel from some <xyz> address (which is not same as
RAM start address) then RISCV kernel will map PAGE_OFFSET virtual address
linearly to <xyz> physical address and memory between RAM start and <xyz>
will be reserved/unusable.

For example, RISCV64 kernel booted from 0x80200000 will waste 2MB of RAM
and RISCV32 kernel booted from 0x80400000 will waste 4MB of RAM.

This patch re-writes the initial pagetable setup code to allow booting
RISCV32 and RISCV64 kernel from any 4KB (i.e. PAGE_SIZE) aligned address.

To achieve this:
1. We add kconfig option BOOT_PAGE_ALIGNED. When it is enabled, we use
4KB mappings in initial page table setup; otherwise, we use 2MB/4MB
mappings.
2. We map kernel and dtb (few MBs) in setup_vm() (called from head.S)
3. Once we reach paging_init() (called from setup_arch()) after
memblock setup, we map all available memory banks.

With this patch in-place, the booting constraint for RISCV32 and RISCV64
kernel is much more relaxed when CONFIG_BOOT_PAGE_ALIGNED=y and we can
now boot kernel very close to RAM start thereby minimizing memory wastage.

Signed-off-by: Anup Patel <anup.patel@xxxxxxx>
---
arch/riscv/Kconfig | 12 +
arch/riscv/include/asm/fixmap.h | 5 +
arch/riscv/include/asm/pgtable-64.h | 5 +
arch/riscv/include/asm/pgtable.h | 5 +
arch/riscv/kernel/head.S | 1 +
arch/riscv/kernel/setup.c | 4 +-
arch/riscv/mm/init.c | 351 ++++++++++++++++++++++++----
7 files changed, 335 insertions(+), 48 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index eb56c82d8aa1..d7812b1f7c7e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -172,6 +172,18 @@ config SMP

If you don't know what to do here, say N.

+config BOOT_PAGE_ALIGNED
+ bool "Allow booting from page aligned address"
+ default n
+ help
+ This enables support for booting the kernel from any page aligned
+ address (i.e. 4KB aligned). This option is particularly useful on
+ systems with a very small RAM (few MBs) as we can boot the kernel
+ closer to the RAM start address, thereby reducing the amount of
+ unusable RAM below the kernel.
+
+ If you don't know what to do here, say N.
+
config NR_CPUS
int "Maximum number of CPUs (2-32)"
range 2 32
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index c207f6634b91..9c66033c3a54 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -21,6 +21,11 @@
*/
enum fixed_addresses {
FIX_HOLE,
+#define FIX_FDT_SIZE SZ_1M
+ FIX_FDT_END,
+ FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
+ FIX_PTE,
+ FIX_PMD,
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
};
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 7aa0ea9bd8bb..56ecc3dc939d 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -78,6 +78,11 @@ static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
}

+static inline unsigned long _pmd_pfn(pmd_t pmd)
+{
+ return pmd_val(pmd) >> _PAGE_PFN_SHIFT;
+}
+
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 1141364d990e..c4968b47c37d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -127,6 +127,11 @@ static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot)
return __pgd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
}

+static inline unsigned long _pgd_pfn(pgd_t pgd)
+{
+ return pgd_val(pgd) >> _PAGE_PFN_SHIFT;
+}
+
#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

/* Locate an entry in the page global directory */
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 3449671ec867..61e253ae38b4 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -62,6 +62,7 @@ clear_bss_done:

/* Initialize page tables and relocate to virtual addresses */
la sp, init_thread_union + THREAD_SIZE
+ mv a0, s1
call setup_vm
call relocate

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 540a331d1376..79670458527d 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -30,6 +30,7 @@
#include <linux/sched/task.h>
#include <linux/swiotlb.h>

+#include <asm/fixmap.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -54,7 +55,8 @@ unsigned long boot_cpu_hartid;

void __init parse_dtb(unsigned int hartid, void *dtb)
{
- if (early_init_dt_scan(__va(dtb)))
+ dtb = (void *)fix_to_virt(FIX_FDT) + ((uintptr_t)dtb & ~PAGE_MASK);
+ if (early_init_dt_scan(dtb))
return;

pr_err("No DTB passed to the kernel\n");
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f9add4381c73..56970dab3727 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1,14 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2012 Regents of the University of California
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
*/

#include <linux/init.h>
@@ -49,13 +42,6 @@ void setup_zero_page(void)
memset((void *)empty_zero_page, 0, PAGE_SIZE);
}

-void __init paging_init(void)
-{
- setup_zero_page();
- local_flush_tlb_all();
- zone_sizes_init();
-}
-
void __init mem_init(void)
{
#ifdef CONFIG_FLATMEM
@@ -152,16 +138,28 @@ EXPORT_SYMBOL(va_pa_offset);
unsigned long pfn_base;
EXPORT_SYMBOL(pfn_base);

+#define MAX_EARLY_MAPPING_SIZE SZ_128M
+
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;

#ifndef __PAGETABLE_PMD_FOLDED
-#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
-pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
+#if MAX_EARLY_MAPPING_SIZE < PGDIR_SIZE
+#define NUM_SWAPPER_PMDS 1UL
+#else
+#define NUM_SWAPPER_PMDS (MAX_EARLY_MAPPING_SIZE/PGDIR_SIZE)
+#endif
+pmd_t swapper_pmd[PTRS_PER_PMD*NUM_SWAPPER_PMDS] __page_aligned_bss;
pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
+#define NUM_SWAPPER_PTES (MAX_EARLY_MAPPING_SIZE/PMD_SIZE)
+#else
+#define NUM_SWAPPER_PTES (MAX_EARLY_MAPPING_SIZE/PGDIR_SIZE)
#endif

+pte_t swapper_pte[PTRS_PER_PTE*NUM_SWAPPER_PTES] __page_aligned_bss;
pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;

+static uintptr_t map_size;
+
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
{
unsigned long addr = __fix_to_virt(idx);
@@ -179,6 +177,201 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
}
}

+struct mapping_ops {
+ pte_t *(*get_pte_virt)(phys_addr_t pa);
+ phys_addr_t (*alloc_pte)(uintptr_t va);
+ pmd_t *(*get_pmd_virt)(phys_addr_t pa);
+ phys_addr_t (*alloc_pmd)(uintptr_t va);
+};
+
+static phys_addr_t __init final_alloc_pgtable(void)
+{
+ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static pte_t *__init early_get_pte_virt(phys_addr_t pa)
+{
+ return (pte_t *)((uintptr_t)pa);
+}
+
+static pte_t *__init final_get_pte_virt(phys_addr_t pa)
+{
+ clear_fixmap(FIX_PTE);
+
+ return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
+}
+
+static phys_addr_t __init early_alloc_pte(uintptr_t va)
+{
+ pte_t *base = swapper_pte;
+ uintptr_t pte_num = ((va - PAGE_OFFSET) >> PMD_SHIFT);
+
+ BUG_ON(pte_num >= NUM_SWAPPER_PTES);
+
+ return (uintptr_t)&base[pte_num * PTRS_PER_PTE];
+}
+
+static phys_addr_t __init final_alloc_pte(uintptr_t va)
+{
+ return final_alloc_pgtable();
+}
+
+static void __init create_pte_mapping(pte_t *ptep,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ uintptr_t pte_index = pte_index(va);
+
+ BUG_ON(sz != PAGE_SIZE);
+
+ if (pte_none(ptep[pte_index]))
+ ptep[pte_index] = pfn_pte(PFN_DOWN(pa), prot);
+}
+
+#ifndef __PAGETABLE_PMD_FOLDED
+static pmd_t *__init early_get_pmd_virt(phys_addr_t pa)
+{
+ return (pmd_t *)((uintptr_t)pa);
+}
+
+static pmd_t *__init final_get_pmd_virt(phys_addr_t pa)
+{
+ clear_fixmap(FIX_PMD);
+
+ return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
+}
+
+static phys_addr_t __init early_alloc_pmd(uintptr_t va)
+{
+ pmd_t *base = swapper_pmd;
+ uintptr_t pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
+
+ BUG_ON(pmd_num >= NUM_SWAPPER_PMDS);
+
+ return (uintptr_t)&base[pmd_num * PTRS_PER_PMD];
+}
+
+static phys_addr_t __init final_alloc_pmd(uintptr_t va)
+{
+ return final_alloc_pgtable();
+}
+
+static void __init create_pmd_mapping(pmd_t *pmdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot,
+ struct mapping_ops *ops)
+{
+ pte_t *ptep;
+ phys_addr_t pte_phys;
+ uintptr_t pmd_index = pmd_index(va);
+
+ if (sz == PMD_SIZE) {
+ if (pmd_none(pmdp[pmd_index]))
+ pmdp[pmd_index] = pfn_pmd(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pmd_none(pmdp[pmd_index])) {
+ pte_phys = ops->alloc_pte(va);
+ pmdp[pmd_index] = pfn_pmd(PFN_DOWN(pte_phys),
+ __pgprot(_PAGE_TABLE));
+ ptep = ops->get_pte_virt(pte_phys);
+ memset(ptep, 0, PAGE_SIZE);
+ } else {
+ pte_phys = PFN_PHYS(_pmd_pfn(pmdp[pmd_index]));
+ ptep = ops->get_pte_virt(pte_phys);
+ }
+
+ create_pte_mapping(ptep, va, pa, sz, prot);
+}
+
+
+static void __init create_pgd_mapping(pgd_t *pgdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot,
+ struct mapping_ops *ops)
+{
+ pmd_t *pmdp;
+ phys_addr_t pmd_phys;
+ uintptr_t pgd_index = pgd_index(va);
+
+ if (sz == PGDIR_SIZE) {
+ if (pgd_val(pgdp[pgd_index]) == 0)
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pgd_val(pgdp[pgd_index]) == 0) {
+ pmd_phys = ops->alloc_pmd(va);
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pmd_phys),
+ __pgprot(_PAGE_TABLE));
+ pmdp = ops->get_pmd_virt(pmd_phys);
+ memset(pmdp, 0, PAGE_SIZE);
+ } else {
+ pmd_phys = PFN_PHYS(_pgd_pfn(pgdp[pgd_index]));
+ pmdp = ops->get_pmd_virt(pmd_phys);
+ }
+
+ create_pmd_mapping(pmdp, va, pa, sz, prot, ops);
+}
+#else
+static void __init create_pgd_mapping(pgd_t *pgdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot,
+ struct mapping_ops *ops)
+{
+ pte_t *ptep;
+ phys_addr_t pte_phys;
+ uintptr_t pgd_index = pgd_index(va);
+
+ if (sz == PGDIR_SIZE) {
+ if (pgd_val(pgdp[pgd_index]) == 0)
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pgd_val(pgdp[pgd_index]) == 0) {
+ pte_phys = ops->alloc_pte(va);
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pte_phys),
+ __pgprot(_PAGE_TABLE));
+ ptep = ops->get_pte_virt(pte_phys);
+ memset(ptep, 0, PAGE_SIZE);
+ } else {
+ pte_phys = PFN_PHYS(_pgd_pfn(pgdp[pgd_index]));
+ ptep = ops->get_pte_virt(pte_phys);
+ }
+
+ create_pte_mapping(ptep, va, pa, sz, prot);
+}
+#endif
+
+static uintptr_t __init best_map_size(uintptr_t load_pa, phys_addr_t size)
+{
+#ifdef CONFIG_BOOT_PAGE_ALIGNED
+ uintptr_t map_sz = PAGE_SIZE;
+#else
+#ifndef __PAGETABLE_PMD_FOLDED
+ uintptr_t map_sz = PMD_SIZE;
+#else
+ uintptr_t map_sz = PGDIR_SIZE;
+#endif
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ if (!(load_pa & (PMD_SIZE - 1)) &&
+ (size >= PMD_SIZE) &&
+ (map_sz < PMD_SIZE))
+ map_sz = PMD_SIZE;
+#endif
+
+ if (!(load_pa & (PGDIR_SIZE - 1)) &&
+ (size >= PGDIR_SIZE) &&
+ (map_sz < PGDIR_SIZE))
+ map_sz = PGDIR_SIZE;
+
+ return map_sz;
+}
+
/*
* The setup_vm() is called from head.S with MMU-off.
*
@@ -192,46 +385,110 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
* Currently, the above requirements are honoured by using custom CFLAGS
* for init.o in mm/Makefile.
*/
-asmlinkage void __init setup_vm(void)
+asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
- uintptr_t i;
- uintptr_t pa = (uintptr_t) &_start;
+ uintptr_t va, end_va;
+ uintptr_t load_pa = (uintptr_t)(&_start);
+ uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
+ pgprot_t tableprot = __pgprot(_PAGE_TABLE);
pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
+ struct mapping_ops ops;

- va_pa_offset = PAGE_OFFSET - pa;
- pfn_base = PFN_DOWN(pa);
+ va_pa_offset = PAGE_OFFSET - load_pa;
+ pfn_base = PFN_DOWN(load_pa);
+ map_size = best_map_size(load_pa, PGDIR_SIZE);

/* Sanity check alignment and size */
BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
- BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
+ BUG_ON((load_pa % map_size) != 0);
+ BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
+
+ /* Setup swapper ops */
+ ops.get_pte_virt = early_get_pte_virt;
+ ops.alloc_pte = early_alloc_pte;
+ ops.get_pmd_virt = NULL;
+ ops.alloc_pmd = NULL;

#ifndef __PAGETABLE_PMD_FOLDED
- for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
- size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+ /* Update mapping ops for PMD */
+ ops.get_pmd_virt = early_get_pmd_virt;
+ ops.alloc_pmd = early_alloc_pmd;
+
+ /* Setup swapper PGD and PMD for fixmap */
+ create_pgd_mapping(swapper_pg_dir, FIXADDR_START,
+ (uintptr_t)fixmap_pmd, PGDIR_SIZE, tableprot, &ops);
+ create_pmd_mapping(fixmap_pmd, FIXADDR_START,
+ (uintptr_t)fixmap_pte, PMD_SIZE, tableprot, &ops);
+#else
+ /* Setup swapper PGD for fixmap */
+ create_pgd_mapping(swapper_pg_dir, FIXADDR_START,
+ (uintptr_t)fixmap_pte, PGDIR_SIZE, tableprot, &ops);
+#endif

- swapper_pg_dir[o] =
- pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
- __pgprot(_PAGE_TABLE));
- }
- for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
- swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
-
- swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pmd),
- __pgprot(_PAGE_TABLE));
- fixmap_pmd[(FIXADDR_START >> PMD_SHIFT) % PTRS_PER_PMD] =
- pfn_pmd(PFN_DOWN((uintptr_t)fixmap_pte),
- __pgprot(_PAGE_TABLE));
+ /*
+ * Setup swapper PGD covering entire kernel which will allow
+ * us to reach paging_init(). We map all memory banks later in
+ * setup_vm_final() below.
+ */
+ end_va = PAGE_OFFSET + load_sz;
+ for (va = PAGE_OFFSET; va < end_va; va += map_size)
+ create_pgd_mapping(swapper_pg_dir, va,
+ load_pa + (va - PAGE_OFFSET),
+ map_size, prot, &ops);
+
+ /* Create fixed mapping for early FDT parsing */
+ end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
+ for (va = __fix_to_virt(FIX_FDT); va < end_va; va += PAGE_SIZE)
+ create_pte_mapping(fixmap_pte, va,
+ dtb_pa + (va - __fix_to_virt(FIX_FDT)),
+ PAGE_SIZE, prot);
+}
+
+static void __init setup_vm_final(void)
+{
+ phys_addr_t pa, start, end;
+ struct memblock_region *reg;
+ struct mapping_ops ops;
+ pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
+
+ /* Setup mapping ops */
+ ops.get_pte_virt = final_get_pte_virt;
+ ops.alloc_pte = final_alloc_pte;
+#ifndef __PAGETABLE_PMD_FOLDED
+ ops.get_pmd_virt = final_get_pmd_virt;
+ ops.alloc_pmd = final_alloc_pmd;
#else
- for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
- size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+ ops.get_pmd_virt = NULL;
+ ops.alloc_pmd = NULL;
+#endif

- swapper_pg_dir[o] =
- pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
+ /* Map all memory banks */
+ for_each_memblock(memory, reg) {
+ start = reg->base;
+ end = start + reg->size;
+
+ if (start >= end)
+ break;
+ if (memblock_is_nomap(reg))
+ continue;
+ if (start <= __pa(PAGE_OFFSET) &&
+ __pa(PAGE_OFFSET) < end)
+ start = __pa(PAGE_OFFSET);
+
+ for (pa = start; pa < end; pa += map_size)
+ create_pgd_mapping(swapper_pg_dir,
+ (uintptr_t)__va(pa), pa,
+ map_size, prot, &ops);
}

- swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pte),
- __pgprot(_PAGE_TABLE));
-#endif
+ clear_fixmap(FIX_PTE);
+ clear_fixmap(FIX_PMD);
+}
+
+void __init paging_init(void)
+{
+ setup_vm_final();
+ setup_zero_page();
+ local_flush_tlb_all();
+ zone_sizes_init();
}
--
2.17.1