Re: [PATCH v2 4/8] x86, efi: Reserve UEFI 2.8 Specific Purpose Memory for dax

From: Mike Rapoport
Date: Mon Jun 03 2019 - 01:47:17 EST


Hi Dan,

On Thu, May 30, 2019 at 03:59:43PM -0700, Dan Williams wrote:
> UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the
> interpretation of the EFI Memory Types as "reserved for a special
> purpose".
>
> The proposed Linux behavior for specific purpose memory is that it is
> reserved for direct-access (device-dax) by default and not available for
> any kernel usage, not even as an OOM fallback. Later, through udev
> scripts or another init mechanism, these device-dax claimed ranges can
> be reconfigured and hot-added to the available System-RAM with a unique
> node identifier.
>
> This patch introduces 3 new concepts at once given the entanglement
> between early boot enumeration relative to memory that can optionally be
> reserved from the kernel page allocator by default. The new concepts
> are:
>
> - E820_TYPE_SPECIFIC: Upon detecting the EFI_MEMORY_SP attribute on
> EFI_CONVENTIONAL memory, update the E820 map with this new type. Only
> perform this classification if the CONFIG_EFI_SPECIFIC_DAX=y policy is
> enabled, otherwise treat it as typical ram.
>
> - IORES_DESC_APPLICATION_RESERVED: Add a new I/O resource descriptor for
> a device driver to search iomem resources for application specific
> memory. Teach the iomem code to identify such ranges as "Application
> Reserved".
>
> - MEMBLOCK_APP_SPECIFIC: Given the memory ranges can fallback to the
> traditional System RAM pool the expectation is that they will have
> typical SRAT entries. In order to support a policy of device-dax by
> default with the option to hotplug later, the numa initialization code
> is taught to avoid marking online MEMBLOCK_APP_SPECIFIC regions.

I'd appreciate a more elaborate description of how this flag is going to be
used.

> A follow-on change integrates parsing of the ACPI HMAT to identify the
> node and sub-range boundaries of EFI_MEMORY_SP designated memory. For
> now, just identify and reserve memory of this type.
>
> Cc: <x86@xxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Cc: Darren Hart <dvhart@xxxxxxxxxxxxx>
> Cc: Andy Shevchenko <andy@xxxxxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
> Reported-by: kbuild test robot <lkp@xxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
> arch/x86/Kconfig | 20 ++++++++++++++++++++
> arch/x86/boot/compressed/eboot.c | 5 ++++-
> arch/x86/boot/compressed/kaslr.c | 2 +-
> arch/x86/include/asm/e820/types.h | 9 +++++++++
> arch/x86/kernel/e820.c | 9 +++++++--
> arch/x86/kernel/setup.c | 1 +
> arch/x86/platform/efi/efi.c | 37 +++++++++++++++++++++++++++++++++----
> drivers/acpi/numa.c | 15 ++++++++++++++-
> include/linux/efi.h | 14 ++++++++++++++
> include/linux/ioport.h | 1 +
> include/linux/memblock.h | 7 +++++++
> mm/memblock.c | 4 ++++
> 12 files changed, 115 insertions(+), 9 deletions(-)

...

> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 08a5f4a131f5..ddde1c7b1f9a 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -1109,6 +1109,7 @@ void __init setup_arch(char **cmdline_p)
>
> if (efi_enabled(EFI_MEMMAP)) {
> efi_fake_memmap();
> + efi_find_app_specific();
> efi_find_mirror();
> efi_esrt_init();
>
> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> index e1cb01a22fa8..899f1305c77a 100644
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -123,10 +123,15 @@ void __init efi_find_mirror(void)
> * more than the max 128 entries that can fit in the e820 legacy
> * (zeropage) memory map.
> */
> +enum add_efi_mode {
> + ADD_EFI_ALL,
> + ADD_EFI_APP_SPECIFIC,
> +};
>
> -static void __init do_add_efi_memmap(void)
> +static void __init do_add_efi_memmap(enum add_efi_mode mode)
> {
> efi_memory_desc_t *md;
> + int add = 0;
>
> for_each_efi_memory_desc(md) {
> unsigned long long start = md->phys_addr;
> @@ -139,7 +144,9 @@ static void __init do_add_efi_memmap(void)
> case EFI_BOOT_SERVICES_CODE:
> case EFI_BOOT_SERVICES_DATA:
> case EFI_CONVENTIONAL_MEMORY:
> - if (md->attribute & EFI_MEMORY_WB)
> + if (is_efi_dax(md))
> + e820_type = E820_TYPE_SPECIFIC;
> + else if (md->attribute & EFI_MEMORY_WB)
> e820_type = E820_TYPE_RAM;
> else
> e820_type = E820_TYPE_RESERVED;
> @@ -165,9 +172,24 @@ static void __init do_add_efi_memmap(void)
> e820_type = E820_TYPE_RESERVED;
> break;
> }
> +
> + if (e820_type == E820_TYPE_SPECIFIC) {
> + memblock_remove(start, size);
> + memblock_add_range(&memblock.reserved, start, size,
> + MAX_NUMNODES, MEMBLOCK_APP_SPECIFIC);

Why can't this happen in e820__memblock_setup()?
Then the memblock_remove() call would not be required, as nothing would
memblock_add() the region.

> + } else if (mode != ADD_EFI_APP_SPECIFIC)
> + continue;
> +
> + add++;
> e820__range_add(start, size, e820_type);
> }
> - e820__update_table(e820_table);
> + if (add)
> + e820__update_table(e820_table);
> +}
> +
> +void __init efi_find_app_specific(void)
> +{
> + do_add_efi_memmap(ADD_EFI_APP_SPECIFIC);
> }
>
> int __init efi_memblock_x86_reserve_range(void)
> @@ -200,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void)
> return rv;
>
> if (add_efi_memmap)
> - do_add_efi_memmap();
> + do_add_efi_memmap(ADD_EFI_ALL);
>
> WARN(efi.memmap.desc_version != 1,
> "Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
> @@ -753,6 +775,13 @@ static bool should_map_region(efi_memory_desc_t *md)
> if (IS_ENABLED(CONFIG_X86_32))
> return false;
>
> + /*
> + * Specific purpose memory assigned to device-dax is
> + * not mapped by default.
> + */
> + if (is_efi_dax(md))
> + return false;
> +
> /*
> * Map all of RAM so that we can access arguments in the 1:1
> * mapping when making EFI runtime calls.
> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
> index 30995834ad70..9083bb8f611b 100644
> --- a/drivers/acpi/numa.c
> +++ b/drivers/acpi/numa.c
> @@ -260,7 +260,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
> int __init
> acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
> {
> - u64 start, end;
> + u64 start, end, i, a_start, a_end;
> u32 hotpluggable;
> int node, pxm;
>
> @@ -283,6 +283,19 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
> if (acpi_srat_revision <= 1)
> pxm &= 0xff;
>
> + /* Clamp Application Specific Memory */
> + for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE,
> + MEMBLOCK_APP_SPECIFIC, &a_start, &a_end, NULL) {
> + pr_debug("%s: SP: %#llx %#llx SRAT: %#llx %#llx\n", __func__,
> + a_start, a_end, start, end);
> + if (a_start <= start && a_end >= end)
> + goto out_err;
> + if (a_start >= start && a_start < end)
> + start = a_start;
> + if (a_end <= end && end > start)
> + end = a_end;
> + }
> +
> node = acpi_map_pxm_to_node(pxm);
> if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) {
> pr_err("SRAT: Too many proximity domains.\n");
> diff --git a/include/linux/efi.h b/include/linux/efi.h
> index 91368f5ce114..b57b123cbdf9 100644
> --- a/include/linux/efi.h
> +++ b/include/linux/efi.h
> @@ -129,6 +129,19 @@ typedef struct {
> u64 attribute;
> } efi_memory_desc_t;
>
> +#ifdef CONFIG_EFI_SPECIFIC_DAX
> +static inline bool is_efi_dax(efi_memory_desc_t *md)
> +{
> + return md->type == EFI_CONVENTIONAL_MEMORY
> + && (md->attribute & EFI_MEMORY_SP);
> +}
> +#else
> +static inline bool is_efi_dax(efi_memory_desc_t *md)
> +{
> + return false;
> +}
> +#endif
> +
> typedef struct {
> efi_guid_t guid;
> u32 headersize;
> @@ -1043,6 +1056,7 @@ extern efi_status_t efi_query_variable_store(u32 attributes,
> unsigned long size,
> bool nonblocking);
> extern void efi_find_mirror(void);
> +extern void efi_find_app_specific(void);
> #else
>
> static inline efi_status_t efi_query_variable_store(u32 attributes,
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index da0ebaec25f0..2d79841ee9b9 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -133,6 +133,7 @@ enum {
> IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
> IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
> IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
> + IORES_DESC_APPLICATION_RESERVED = 8,
> };
>
> /* helpers to define resources */
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 676d3900e1bd..58c29180f2cd 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -35,12 +35,14 @@ extern unsigned long long max_possible_pfn;
> * @MEMBLOCK_HOTPLUG: hotpluggable region
> * @MEMBLOCK_MIRROR: mirrored region
> * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
> + * @MEMBLOCK_APP_SPECIFIC: reserved / application specific range
> */
> enum memblock_flags {
> MEMBLOCK_NONE = 0x0, /* No special request */
> MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
> MEMBLOCK_MIRROR = 0x2, /* mirrored region */
> MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
> + MEMBLOCK_APP_SPECIFIC = 0x8, /* reserved / application specific range */
> };
>
> /**
> @@ -215,6 +217,11 @@ static inline bool memblock_is_mirror(struct memblock_region *m)
> return m->flags & MEMBLOCK_MIRROR;
> }
>
> +static inline bool memblock_is_app_specific(struct memblock_region *m)
> +{
> + return m->flags & MEMBLOCK_APP_SPECIFIC;
> +}
> +
> static inline bool memblock_is_nomap(struct memblock_region *m)
> {
> return m->flags & MEMBLOCK_NOMAP;
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 6bbad46f4d2c..654fecb52ba5 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -982,6 +982,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags)
> if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
> return true;
>
> + /* if we want specific memory skip non-specific memory regions */
> + if ((flags & MEMBLOCK_APP_SPECIFIC) && !memblock_is_app_specific(m))
> + return true;
> +

With this, MEMBLOCK_APP_SPECIFIC regions won't be skipped by traversals that
don't set memblock_flags explicitly. Is this the intention?

> /* skip nomap memory unless we were asked for it explicitly */
> if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
> return true;
>

--
Sincerely yours,
Mike.