[PATCH v2 3/3] x86/devmem: Restrict /dev/mem access for potentially unaccepted memory by default
From: Dan Williams
Date: Thu Apr 10 2025 - 21:24:22 EST
Nikolay reports [1] that accessing BIOS data (first 1MB of the physical
address space) via /dev/mem results in an SEPT violation.
The cause is that ioremap() (via xlate_dev_mem_ptr()) establishes an
unencrypted mapping where the kernel had previously established an
encrypted mapping.
An initial attempt to fix this revealed that TDX and SEV-SNP have
different expectations about which address ranges can be mapped via
/dev/mem, and when.
Rather than develop a precise set of allowed /dev/mem capable TVM
address ranges, teach devmem_is_allowed() to always restrict access to
the BIOS data space. This means read() returns zeros, write() is
dropped, and mmap() fails with -EPERM. This can still be relaxed later
as specific needs arise, but in the meantime, close off this source of
mismatched IORES_MAP_ENCRYPTED expectations.
Cc: <x86@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Vishal Annapurve <vannapurve@xxxxxxxxxx>
Cc: Kirill Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Reported-by: Nikolay Borisov <nik.borisov@xxxxxxxx>
Closes: http://lore.kernel.org/20250318113604.297726-1-nik.borisov@xxxxxxxx [1]
Reviewed-by: Nikolay Borisov <nik.borisov@xxxxxxxx>
Fixes: 9aa6ea69852c ("x86/tdx: Make pages shared in ioremap()")
Cc: <stable@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
arch/x86/Kconfig | 2 ++
arch/x86/include/asm/x86_init.h | 2 ++
arch/x86/kernel/x86_init.c | 6 ++++++
arch/x86/mm/init.c | 23 +++++++++++++++++------
4 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b9f378e05f6..12a1b5acd55b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -891,6 +891,7 @@ config INTEL_TDX_GUEST
depends on X86_X2APIC
depends on EFI_STUB
depends on PARAVIRT
+ depends on STRICT_DEVMEM
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
@@ -1510,6 +1511,7 @@ config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
depends on EFI_STUB
+ depends on STRICT_DEVMEM
select DMA_COHERENT_POOL
select ARCH_USE_MEMREMAP_PROT
select INSTRUCTION_DECODER
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 213cf5379a5a..0ae436b34b88 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -305,6 +305,7 @@ struct x86_hyper_runtime {
* semantics.
* @realmode_reserve: reserve memory for realmode trampoline
* @realmode_init: initialize realmode trampoline
+ * @devmem_is_allowed: restrict /dev/mem and PCI sysfs resource access
* @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
@@ -323,6 +324,7 @@ struct x86_platform_ops {
void (*set_legacy_features)(void);
void (*realmode_reserve)(void);
void (*realmode_init)(void);
+ bool (*devmem_is_allowed)(unsigned long pfn);
struct x86_hyper_runtime hyper;
struct x86_guest guest;
};
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 0a2bbd674a6d..346301375bd4 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -143,6 +143,11 @@ static void enc_kexec_begin_noop(void) {}
static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
+static bool platform_devmem_is_allowed(unsigned long pfn) /* default x86_platform.devmem_is_allowed hook */
+{
+ return !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT); /* pfn is ignored; result depends only on CC_ATTR_GUEST_MEM_ENCRYPT */
+}
+
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu_early,
.calibrate_tsc = native_calibrate_tsc,
@@ -156,6 +161,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.restore_sched_clock_state = tsc_restore_sched_clock_state,
.realmode_reserve = reserve_real_mode,
.realmode_init = init_real_mode,
+ .devmem_is_allowed = platform_devmem_is_allowed,
.hyper.pin_vcpu = x86_op_int_noop,
.hyper.is_private_mmio = is_private_mmio_noop,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bfa444a7dbb0..df5435c8dbea 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -861,18 +861,23 @@ void __init poking_init(void)
* area traditionally contains BIOS code and data regions used by X, dosemu,
* and similar apps. Since they map the entire memory range, the whole range
* must be allowed (for mapping), but any areas that would otherwise be
- * disallowed are flagged as being "zero filled" instead of rejected.
+ * disallowed are flagged as being "zero filled" instead of rejected, for
+ * read()/write().
+ *
* Access has to be given to non-kernel-ram areas as well, these contain the
* PCI mmio resources as well as potential bios/acpi data regions.
*/
int devmem_is_allowed(unsigned long pagenr)
{
+ bool platform_allowed = x86_platform.devmem_is_allowed(pagenr);
+
if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
!= REGION_DISJOINT) {
/*
- * For disallowed memory regions in the low 1MB range,
- * request that the page be shown as all zeros.
+ * For disallowed memory regions in the low 1MB range, request
+ * that the page be shown as all zeros for read()/write(), and
+ * fail mmap().
*/
if (pagenr < 256)
return 2;
@@ -885,14 +890,20 @@ int devmem_is_allowed(unsigned long pagenr)
* restricted resource under CONFIG_STRICT_DEVMEM.
*/
if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
- /* Low 1MB bypasses iomem restrictions. */
- if (pagenr < 256)
+ /*
+ * Low 1MB bypasses iomem restrictions unless the platform says
+ * the physical address is not suitable for direct access.
+ */
+ if (pagenr < 256) {
+ if (!platform_allowed)
+ return 2;
return 1;
+ }
return 0;
}
- return 1;
+ return platform_allowed;
}
void free_init_pages(const char *what, unsigned long begin, unsigned long end)