[RFC PATCH 10/14] x86: Create virtual memory region for SLUB

From: Matteo Rizzo
Date: Fri Sep 15 2023 - 07:00:45 EST


From: Jann Horn <jannh@xxxxxxxxxx>

SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses it for both
struct slab and the actual slab memory. The pointers returned by
kmem_cache_alloc will point into this range. (A worked sketch of the
resulting layout follows after the --- separator below.)

Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
Co-developed-by: Matteo Rizzo <matteorizzo@xxxxxxxxxx>
Signed-off-by: Matteo Rizzo <matteorizzo@xxxxxxxxxx>
---
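Note for reviewers (below the fold, so not part of the commit message):
the new macros put the region in the former -1.5 TB hole. This userspace
sketch mirrors the arithmetic, assuming 4 KiB pages and P4D_SHIFT == 39
(true for both 4- and 5-level paging); it is an illustration only, not
kernel code.

#include <stdio.h>

/* Userspace mirror of the new SLAB_* macros in pgtable_64_types.h. */
#define P4D_SHIFT	39
#define P4D_SIZE	(1UL << P4D_SHIFT)
#define SLAB_PGD_ENTRY	(-3UL)
#define SLAB_BASE_ADDR	(SLAB_PGD_ENTRY << P4D_SHIFT)
#define SLAB_END_ADDR	(SLAB_BASE_ADDR + P4D_SIZE)

int main(void)
{
	/* Prints fffffe8000000000..ffffff0000000000 (512 GiB), matching
	 * the mm.rst row changed below. */
	printf("SLUB region: %016lx..%016lx (%lu GiB)\n",
	       SLAB_BASE_ADDR, SLAB_END_ADDR, P4D_SIZE >> 30);
	return 0;
}
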
 Documentation/arch/x86/x86_64/mm.rst    |  4 ++--
 arch/x86/include/asm/pgtable_64_types.h | 16 ++++++++++++++++
 arch/x86/mm/init_64.c                   | 19 +++++++++++++++----
 arch/x86/mm/kaslr.c                     |  9 +++++++++
 arch/x86/mm/mm_internal.h               |  4 ++++
 mm/slub.c                               |  4 ++++
 security/Kconfig.hardening              |  2 ++
 7 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/Documentation/arch/x86/x86_64/mm.rst b/Documentation/arch/x86/x86_64/mm.rst
index 35e5e18c83d0..121179537175 100644
--- a/Documentation/arch/x86/x86_64/mm.rst
+++ b/Documentation/arch/x86/x86_64/mm.rst
@@ -57,7 +57,7 @@ Complete virtual memory map with 4-level page tables
 fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
                  |            |                  |         | vaddr_end for KASLR
 fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
-fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | ... unused hole
+fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | SLUB virtual memory
 ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
 ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
 ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
@@ -116,7 +116,7 @@ Complete virtual memory map with 5-level page tables
 fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
                  |            |                  |         | vaddr_end for KASLR
 fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
-fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | ... unused hole
+fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | SLUB virtual memory
 ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
 ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
 ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 38b54b992f32..e1a91eb084c4 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -6,6 +6,7 @@

 #ifndef __ASSEMBLY__
 #include <linux/types.h>
+#include <linux/align.h>
 #include <asm/kaslr.h>
 
 /*
@@ -199,6 +200,21 @@ extern unsigned int ptrs_per_p4d;
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)
 #define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT)
 
+#ifdef CONFIG_SLAB_VIRTUAL
+#define SLAB_PGD_ENTRY		_AC(-3, UL)
+#define SLAB_BASE_ADDR		(SLAB_PGD_ENTRY << P4D_SHIFT)
+#define SLAB_END_ADDR		(SLAB_BASE_ADDR + P4D_SIZE)
+
+/*
+ * STRUCT_SLAB_SIZE is defined here because SLAB_META_SIZE needs it, and
+ * including slab.h here would create a dependency cycle.
+ */
+#define STRUCT_SLAB_SIZE	(32 * sizeof(void *))
+#define SLAB_VPAGES		((SLAB_END_ADDR - SLAB_BASE_ADDR) / PAGE_SIZE)
+#define SLAB_META_SIZE		ALIGN(SLAB_VPAGES * STRUCT_SLAB_SIZE, PAGE_SIZE)
+#define SLAB_DATA_BASE_ADDR	(SLAB_BASE_ADDR + SLAB_META_SIZE)
+#endif /* CONFIG_SLAB_VIRTUAL */
+
 #define CPU_ENTRY_AREA_PGD	_AC(-4, UL)
 #define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT)

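For concreteness, the metadata math above works out like this (my
arithmetic, assuming 4 KiB pages): the region covers 2^39 / 2^12 = 2^27
virtual pages, and at STRUCT_SLAB_SIZE == 32 * sizeof(void *) == 256
bytes per page that is 2^35 bytes == 32 GiB of struct slab metadata, so
SLAB_DATA_BASE_ADDR ends up 32 GiB into the region. A small userspace
check of the same expressions:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define STRUCT_SLAB_SIZE	(32 * sizeof(void *))
#define SLAB_VPAGES		((1UL << 39) / PAGE_SIZE)
#define SLAB_META_SIZE		ALIGN(SLAB_VPAGES * STRUCT_SLAB_SIZE, PAGE_SIZE)

int main(void)
{
	/* Prints 32 GiB of metadata, data base at 0xfffffe8800000000. */
	printf("meta: %lu GiB, data base: %016lx\n", SLAB_META_SIZE >> 30,
	       0xfffffe8000000000UL + SLAB_META_SIZE);
	return 0;
}
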
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a190aae8ceaf..d716ddfd9880 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1279,16 +1279,19 @@ static void __init register_page_bootmem_info(void)
 }
 
 /*
- * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
+ * Pre-allocates page-table pages for the vmalloc and SLUB areas in the kernel
+ * page-table.
  * Only the level which needs to be synchronized between all page-tables is
  * allocated because the synchronization can be expensive.
  */
-static void __init preallocate_vmalloc_pages(void)
+static void __init preallocate_top_level_entries_range(unsigned long start,
+						       unsigned long end)
 {
 	unsigned long addr;
 	const char *lvl;
 
-	for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+
+	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
 		pgd_t *pgd = pgd_offset_k(addr);
 		p4d_t *p4d;
 		pud_t *pud;
@@ -1328,6 +1331,14 @@ static void __init preallocate_vmalloc_pages(void)
 	panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
 }
 
+static void __init preallocate_top_level_entries(void)
+{
+	preallocate_top_level_entries_range(VMALLOC_START, VMEMORY_END);
+#ifdef CONFIG_SLAB_VIRTUAL
+	preallocate_top_level_entries_range(SLAB_BASE_ADDR, SLAB_END_ADDR - 1);
+#endif
+}
+
 void __init mem_init(void)
 {
 	pci_iommu_alloc();
@@ -1351,7 +1362,7 @@ void __init mem_init(void)
 	if (get_gate_vma(&init_mm))
 		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
 
-	preallocate_vmalloc_pages();
+	preallocate_top_level_entries();
 }
 
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
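
Reviewer note on the bounds: with 4-level paging PGDIR_SIZE is 512 GiB,
so the SLUB range is exactly one top-level entry, and passing
SLAB_END_ADDR - 1 matters because the loop bound is inclusive
(addr <= end). A userspace sketch of the same stepping logic, with
4-level constants assumed:

#include <stdio.h>

#define PGDIR_SHIFT	39	/* 4-level paging */
#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start = 0xfffffe8000000000UL;	/* SLAB_BASE_ADDR */
	unsigned long end = start + PGDIR_SIZE - 1;	/* SLAB_END_ADDR - 1 */
	unsigned long addr, n = 0;

	/* Same stepping as preallocate_top_level_entries_range(). */
	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE))
		n++;

	/* Prints 1: the region needs a single top-level entry, and the
	 * inclusive bound keeps the loop off the next entry. */
	printf("top-level entries: %lu\n", n);
	return 0;
}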
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..7b297d372a8c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -136,6 +136,15 @@ void __init kernel_randomize_memory(void)
 		vaddr = round_up(vaddr + 1, PUD_SIZE);
 		remain_entropy -= entropy;
 	}
+
+#ifdef CONFIG_SLAB_VIRTUAL
+	/*
+	 * slub_addr_base is initialized separately from the kaslr_regions
+	 * array because it comes after CPU_ENTRY_AREA_BASE.
+	 */
+	prandom_bytes_state(&rand_state, &rand, sizeof(rand));
+	slub_addr_base += (rand & ((1UL << 36) - PAGE_SIZE));
+#endif
 }
 
void __meminit init_trampoline_kaslr(void)
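
A sanity check on the mask above (my reading): (1UL << 36) - PAGE_SIZE
keeps bits 12..35 of rand, i.e. a page-aligned offset below 64 GiB, so
the randomized slub_addr_base stays well inside the region's 480 GiB
data area:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long mask = (1UL << 36) - PAGE_SIZE;
	/* Worst case: a rand value with every bit set. */
	unsigned long max_off = ~0UL & mask;

	/* mask = 0000000ffffff000: page-aligned, < 2^36 (64 GiB). */
	printf("mask %016lx, max offset %lu MiB, page aligned: %d\n",
	       mask, max_off >> 20, (max_off & (PAGE_SIZE - 1)) == 0);
	return 0;
}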
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 3f37b5c80bb3..fafb79b7e019 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -25,4 +25,8 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);

 extern unsigned long tlb_single_page_flush_ceiling;
 
+#ifdef CONFIG_SLAB_VIRTUAL
+extern unsigned long slub_addr_base;
+#endif
+
 #endif /* __X86_MM_INTERNAL_H */
diff --git a/mm/slub.c b/mm/slub.c
index 4f77e5d4fe6c..a731fdc79bff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -166,6 +166,10 @@
  * the fast path and disables lockless freelists.
  */
 
+#ifdef CONFIG_SLAB_VIRTUAL
+unsigned long slub_addr_base = SLAB_DATA_BASE_ADDR;
+#endif /* CONFIG_SLAB_VIRTUAL */
+
 /*
  * We could simply use migrate_disable()/enable() but as long as it's a
  * function call even on !PREEMPT_RT, use inline preempt_disable() there.
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 9f4e6e38aa76..f4a0af424149 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -357,6 +357,8 @@ config GCC_PLUGIN_RANDSTRUCT

 config SLAB_VIRTUAL
 	bool "Allocate slab objects from virtual memory"
+	# For virtual memory region allocation
+	depends on X86_64
 	depends on SLUB && !SLUB_TINY
 	# If KFENCE support is desired, it could be implemented on top of our
 	# virtual memory allocation facilities
--
2.42.0.459.ge4e396fd5e-goog