[PATCH] x86/mm/KASLR: Do not adapt the size of the direct mapping section for SGI UV system

From: Baoquan He
Date: Thu May 18 2017 - 02:53:23 EST


On SGI UV system, kernel casually hang with kaslr enabled.

The back trace is:

kernel BUG at arch/x86/mm/init_64.c:311!
invalid opcode: 0000 [#1] SMP
[...]
RIP: 0010:__init_extra_mapping+0x188/0x196
[...]
Call Trace:
init_extra_mapping_uc+0x13/0x15
map_high+0x67/0x75
map_mmioh_high_uv3+0x20a/0x219
uv_system_init_hub+0x12d9/0x1496
uv_system_init+0x27/0x29
native_smp_prepare_cpus+0x28d/0x2d8
kernel_init_freeable+0xdd/0x253
? rest_init+0x80/0x80
kernel_init+0xe/0x110
ret_from_fork+0x2c/0x40

The root cause is SGI UV system needs map its MMIOH region to direct
mapping section.

When kaslr disabled, there are 64TB space for system RAM to do direct
mapping. Both system RAM and SGI UV MMIOH region share this 64TB space.
However with kaslr enabled, mm KASLR only reserves the actual size of
system RAM plus 10TB for direct mapping usage. Then MMIOH mapping of
SGI UV could go beyond the upper bound of direct mapping section to step
into VMALLOC or VMEMMAP area. Then the BUG_ON() in __init_extra_mapping()
will be triggered.

E.g on the SGI UV3 machine where this bug is reported , there are two MMIOH
regions:

[ 1.519001] UV: Map MMIOH0_HI 0xffc00000000 - 0x100000000000
[ 1.523001] UV: Map MMIOH1_HI 0x100000000000 - 0x200000000000

They are [16TB-16G, 16TB) and [16TB, 32TB). On this machine, 512G ram are
spread out to 1TB regions. Then above two SGI MMIOH regions also will be
mapped into the direct mapping section. The point is that SGI UV maps its
MMIOH regions to the direct mapping section in uv_system_init() which is
called during kernel_init_freeable(). It's much later than
kernel_randomize_memory(). Mm KASLR has no chance to take it into
consideration.

To fix it, we need detect if it's SGI UV system early, then do not adapt
the size of the direct mapping section in kernel_randomize_memory() if yes.
According to discussion with SGI developer, "The SGI bios adds UVsystab.
Only systems running SGI bios (and now HPE Hawks2) will have UVsystab."

Hence check if UVsystab is present, if yes, do not adapt the size of the
direct mapping section in kernel_randomize_memory()

Signed-off-by: Baoquan He <bhe@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: x86@xxxxxxxxxx
Cc: Matt Fleming <matt@xxxxxxxxxxxxxxxxxxx>
Cc: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
Cc: "travis@xxxxxxx" <travis@xxxxxxx>
Cc: Dimitri Sivanich <sivanich@xxxxxxx>
Cc: Thomas Garnier <thgarnie@xxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Masahiro Yamada <yamada.masahiro@xxxxxxxxxxxxx>
Cc: Russ Anderson <rja@xxxxxxx>
Cc: Frank Ramsay <frank.ramsay@xxxxxxx>
linux-efi@xxxxxxxxxxxxxxx
---
arch/x86/include/asm/uv/uv.h | 2 ++
arch/x86/mm/kaslr.c | 4 +++-
arch/x86/platform/efi/efi.c | 7 +++++++
3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820..0275776 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
extern enum uv_system_type get_uv_system_type(void);
extern int is_uv_system(void);
extern int is_uv_hubless(void);
+extern int is_early_uv_system(void);
extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
@@ -25,6 +26,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
static inline int is_uv_system(void) { return 0; }
static inline int is_uv_hubless(void) { return 0; }
+static inline int is_early_uv_system(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed2064..b75e1f5 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -22,11 +22,13 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/efi.h>

#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/kaslr.h>
+#include <asm/uv/uv.h>

#include "mm_internal.h"

@@ -123,7 +125,7 @@ void __init kernel_randomize_memory(void)
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;

/* Adapt phyiscal memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
+ if (memory_tb < kaslr_regions[0].size_tb && !is_early_uv_system())
kaslr_regions[0].size_tb = memory_tb;

/* Calculate entropy available between regions */
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 7e76a4d..b71e01d 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -74,6 +74,13 @@ static int __init setup_add_efi_memmap(char *arg)
}
early_param("add_efi_memmap", setup_add_efi_memmap);

+#ifdef CONFIG_X86_UV
+int is_early_uv_system(void)
+{
+ return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab);
+}
+#endif
+
static efi_status_t __init phys_efi_set_virtual_address_map(
unsigned long memory_map_size,
unsigned long descriptor_size,
--
2.5.5