Re: [PATCH v2 5/8] riscv: Implement sv48 support

From: Alex Ghiti
Date: Sat Jun 27 2020 - 10:26:43 EST


Hi Nick,

Le 6/27/20 à 8:30 AM, Nick Kossifidis a écrit :
Στις 2020-06-03 11:11, Alexandre Ghiti έγραψε:
By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that roughly
offers ~160TB of virtual address space to userspace and allows up to 64TB
of physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level into
PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alex@xxxxxxxx>
Reviewed-by: Anup Patel <anup@xxxxxxxxxxxxxx>
---
 arch/riscv/Kconfig                  |   6 +-
 arch/riscv/include/asm/csr.h        |   3 +-
 arch/riscv/include/asm/fixmap.h     |   1 +
 arch/riscv/include/asm/page.h       |  15 +++
 arch/riscv/include/asm/pgalloc.h    |  36 +++++++
 arch/riscv/include/asm/pgtable-64.h |  97 ++++++++++++++++-
 arch/riscv/include/asm/pgtable.h    |  10 +-
 arch/riscv/kernel/head.S            |   3 +-
 arch/riscv/mm/context.c             |   2 +-
 arch/riscv/mm/init.c                | 158 +++++++++++++++++++++++++---
 10 files changed, 307 insertions(+), 24 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e167f16131f4..3f73f60e9732 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -68,6 +68,7 @@ config RISCV
ÂÂÂÂ select ARCH_HAS_GCOV_PROFILE_ALL
ÂÂÂÂ select HAVE_COPY_THREAD_TLS
ÂÂÂÂ select HAVE_ARCH_KASAN if MMU && 64BIT
+ÂÂÂ select RELOCATABLE if 64BIT

Âconfig ARCH_MMAP_RND_BITS_MIN
ÂÂÂÂ default 18 if 64BIT
@@ -106,7 +107,7 @@ config PAGE_OFFSET
ÂÂÂÂ default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
ÂÂÂÂ default 0x80000000 if 64BIT && !MMU
ÂÂÂÂ default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
-ÂÂÂ default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
+ÂÂÂ default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB

Âconfig ARCH_FLATMEM_ENABLE
ÂÂÂÂ def_bool y
@@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
Âconfig FIX_EARLYCON_MEM
ÂÂÂÂ def_bool MMU

+# On a 64BIT relocatable kernel, the 4-level page table is at runtime folded
+# on a 3-level page table when sv48 is not supported.
Âconfig PGTABLE_LEVELS
ÂÂÂÂ int
+ÂÂÂ default 4 if 64BIT && RELOCATABLE
ÂÂÂÂ default 3 if 64BIT
ÂÂÂÂ default 2

diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index cec462e198ce..d41536c3f8d4 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -40,11 +40,10 @@
Â#ifndef CONFIG_64BIT
Â#define SATP_PPNÂÂÂ _AC(0x003FFFFF, UL)
Â#define SATP_MODE_32ÂÂÂ _AC(0x80000000, UL)
-#define SATP_MODEÂÂÂ SATP_MODE_32
Â#else
Â#define SATP_PPNÂÂÂ _AC(0x00000FFFFFFFFFFF, UL)
Â#define SATP_MODE_39ÂÂÂ _AC(0x8000000000000000, UL)
-#define SATP_MODEÂÂÂ SATP_MODE_39
+#define SATP_MODE_48ÂÂÂ _AC(0x9000000000000000, UL)
Â#endif

Â/* Exception cause high bit - is an interrupt if set */
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 2368d49eb4ef..d891cf9c73c5 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -27,6 +27,7 @@ enum fixed_addresses {
ÂÂÂÂ FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
ÂÂÂÂ FIX_PTE,
ÂÂÂÂ FIX_PMD,
+ÂÂÂ FIX_PUD,
ÂÂÂÂ FIX_TEXT_POKE1,
ÂÂÂÂ FIX_TEXT_POKE0,
ÂÂÂÂ FIX_EARLYCON_MEM_BASE,
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 48bb09b6a9b7..5e77fe7f0d6d 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,7 +31,19 @@
 * When not using MMU this corresponds to the first free page in
 * physical memory (aligned on a page boundary).
 */
+#ifdef CONFIG_RELOCATABLE
+#define PAGE_OFFSETÂÂÂÂÂÂÂ __page_offset
+
+#ifdef CONFIG_64BIT
+/*
+ * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
+ * define the PAGE_OFFSET value for SV39.
+ */
+#define PAGE_OFFSET_L3ÂÂÂÂÂÂÂ 0xffffffe000000000
+#endif /* CONFIG_64BIT */
+#else
Â#define PAGE_OFFSETÂÂÂÂÂÂÂ _AC(CONFIG_PAGE_OFFSET, UL)
+#endif /* CONFIG_RELOCATABLE */

Â#define KERN_VIRT_SIZE (-PAGE_OFFSET)

@@ -102,6 +114,9 @@ extern unsigned long pfn_base;
Âextern unsigned long max_low_pfn;
Âextern unsigned long min_low_pfn;
Âextern unsigned long kernel_virt_addr;
+#ifdef CONFIG_RELOCATABLE
+extern unsigned long __page_offset;
+#endif

Â#define __pa_to_va_nodebug(x)ÂÂÂ ((void *)((unsigned long) (x) + va_pa_offset))
Â#define linear_mapping_va_to_pa(x)ÂÂÂ ((unsigned long)(x) - va_pa_offset)
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 3f601ee8233f..540eaa5a8658 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)

ÂÂÂÂ set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
Â}
+
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
+{
+ÂÂÂ if (pgtable_l4_enabled) {
+ÂÂÂÂÂÂÂ unsigned long pfn = virt_to_pfn(pud);
+
+ÂÂÂÂÂÂÂ set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ÂÂÂ }
+}
+
+static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ pud_t *pud)
+{
+ÂÂÂ if (pgtable_l4_enabled) {
+ÂÂÂÂÂÂÂ unsigned long pfn = virt_to_pfn(pud);
+
+ÂÂÂÂÂÂÂ set_p4d_safe(p4d,
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ÂÂÂ }
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return (pud_t *)__get_free_page(
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
+ÂÂÂ return NULL;
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ free_page((unsigned long)pud);
+}
+
+#define __pud_free_tlb(tlb, pud, addr)Â pud_free((tlb)->mm, pud)
Â#endif /* __PAGETABLE_PMD_FOLDED */

Â#define pmd_pgtable(pmd)ÂÂÂ pmd_page(pmd)
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index b15f70a1fdfa..c84c31fbf8da 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -8,16 +8,32 @@

Â#include <linux/const.h>

-#define PGDIR_SHIFTÂÂÂÂ 30
+extern bool pgtable_l4_enabled;
+
+#define PGDIR_SHIFTÂÂÂÂ (pgtable_l4_enabled ? 39 : 30)
Â/* Size of region mapped by a page global directory */
Â#define PGDIR_SIZEÂÂÂÂÂ (_AC(1, UL) << PGDIR_SHIFT)
Â#define PGDIR_MASKÂÂÂÂÂ (~(PGDIR_SIZE - 1))

+/* pud is folded into pgd in case of 3-level page table */
+#define PUD_SHIFTÂÂÂ 30
+#define PUD_SIZEÂÂÂ (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASKÂÂÂ (~(PUD_SIZE - 1))
+
Â#define PMD_SHIFTÂÂÂÂÂÂ 21
Â/* Size of region mapped by a page middle directory */
Â#define PMD_SIZEÂÂÂÂÂÂÂ (_AC(1, UL) << PMD_SHIFT)
Â#define PMD_MASKÂÂÂÂÂÂÂ (~(PMD_SIZE - 1))

+/* Page Upper Directory entry */
+typedef struct {
+ÂÂÂ unsigned long pud;
+} pud_t;
+
+#define pud_val(x)ÂÂÂÂÂ ((x).pud)
+#define __pud(x)ÂÂÂÂÂÂÂ ((pud_t) { (x) })
+#define PTRS_PER_PUDÂÂÂ (PAGE_SIZE / sizeof(pud_t))
+
Â/* Page Middle Directory entry */
Âtypedef struct {
ÂÂÂÂ unsigned long pmd;
@@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
ÂÂÂÂ set_pud(pudp, __pud(0));
Â}

+static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
+{
+ÂÂÂ return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
+}
+
+static inline unsigned long _pud_pfn(pud_t pud)
+{
+ÂÂÂ return pud_val(pud) >> _PAGE_PFN_SHIFT;
+}
+
Âstatic inline unsigned long pud_page_vaddr(pud_t pud)
Â{
ÂÂÂÂ return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
@@ -70,6 +96,15 @@ static inline struct page *pud_page(pud_t pud)
ÂÂÂÂ return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
Â}

+#define mm_pud_foldedÂÂÂ mm_pud_folded
+static inline bool mm_pud_folded(struct mm_struct *mm)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return false;
+
+ÂÂÂ return true;
+}
+
Â#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))

Âstatic inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
@@ -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
Â#define pmd_ERROR(e) \
ÂÂÂÂ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))

+#define pud_ERROR(e)ÂÂÂ \
+ÂÂÂ pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ *p4dp = p4d;
+ÂÂÂ else
+ÂÂÂÂÂÂÂ set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
+}
+
+static inline int p4d_none(p4d_t p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return (p4d_val(p4d) == 0);
+
+ÂÂÂ return 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return (p4d_val(p4d) & _PAGE_PRESENT);
+
+ÂÂÂ return 1;
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return !p4d_present(p4d);
+
+ÂÂÂ return 0;
+}
+
+static inline void p4d_clear(p4d_t *p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ set_p4d(p4d, __p4d(0));
+}
+
+static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return (unsigned long)pfn_to_virt(
+ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ p4d_val(p4d) >> _PAGE_PFN_SHIFT);
+
+ÂÂÂ return pud_page_vaddr((pud_t) { p4d_val(p4d) });
+}
+
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+ÂÂÂ if (pgtable_l4_enabled)
+ÂÂÂÂÂÂÂ return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+
+ÂÂÂ return (pud_t *)p4d;
+}
+

In my test I had to put
#define pud_offset pud_offset
here or else I got a compilation error due to pud_offset being redefined on include/linux/pgtable.h:

#ifndef pud_offset
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
}
#define pud_offset pud_offset
#endif

Yes, the rebase on 5.8-rc2 requires that, as well as removing the pmd_offset definition.

Alex