[PATCH v2 16/18] memblock: make HugeTLB bootmem allocation work with KHO

From: Pratyush Yadav

Date: Fri Jun 05 2026 - 14:41:04 EST

From: "Pratyush Yadav (Google)" <pratyush@xxxxxxxxxx>

Gigantic huge page allocation is somewhat broken currently when KHO is
used.

Firstly, these allocations break KHO scratch size accounting. RSRV_KERN
is used to track how much memory is reserved for use by the kernel.
Since alloc_bootmem() calls the memblock_alloc*() APIs, the hugepages
allocated also get marked as RSRV_KERN.

Allocations marked RSRV_KERN are used by KHO to calculate how much
scratch space it should reserve to make sure the next kernel has enough
memory to boot when it is in scratch-only phase. Counting hugepages in
that blows up scratch size, and can lead to the scratch allocation
failing, making KHO unusable. This will show up when huge pages make up
more than 50% of the system, which is a fairly common use case.

Secondly, while not supported right now, huge pages are user memory and
can be preserved via KHO. The scratch spaces should not have any
preserved memory. Allocating hugepages from scratch (on a KHO boot) can
lead to them being un-preservable.

Introduce memblock_alloc_hugetlb(). This lets memblock cater to the
needs of hugetlb without exposing those details to the general allocation
routines.

First, it does not use mirrored memory for hugetlb. Mirrored memory is a
limited resource that is best saved for kernel data structures, not user
memory.

Second, if the memory found overlaps with KHO scratch areas, it discards
the memory and retries.

Third, it simplifies the argument list by baking in some hugetlb
assumptions like alignment and exact_nid. This also simplifies
allocation logic in alloc_bootmem().

Also introduce MEMBLOCK_RSRV_HUGETLB to mark reservations made for
HugeTLB. This will be used by KHO in future patches to correctly
calculate scratch sizes.

Refactor some of the preparation logic like kmemleak tracking and
accepting memory into a separate helper memblock_prep_allocation(), and
use it from both memblock_alloc_hugetlb() and the usual
memblock_alloc_range_nid().

Signed-off-by: Pratyush Yadav (Google) <pratyush@xxxxxxxxxx>
---
include/linux/memblock.h | 3 ++
mm/hugetlb.c | 22 +++-----
mm/memblock.c | 112 +++++++++++++++++++++++++++++++--------
3 files changed, 100 insertions(+), 37 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 546d7ef798b8..b3b4a6145fad 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -52,6 +52,7 @@ extern unsigned long long max_possible_pfn;
* memory reservations yet, so we get scratch memory from the previous
* kernel that we know is good to use. It is the only memory that
* allocations may happen from in this phase.
+ * @MEMBLOCK_RSRV_HUGETLB: memory is reserved for hugetlb pages
*/
enum memblock_flags {
MEMBLOCK_NONE = 0x0, /* No special request */
@@ -62,6 +63,7 @@ enum memblock_flags {
MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */
MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */
MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */
+ MEMBLOCK_RSRV_HUGETLB = 0x80, /* memory reserved for hugetlb pages */
};

/**
@@ -421,6 +423,7 @@ void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid);
+void *memblock_alloc_hugetlb(phys_addr_t size, int nid, bool exact_nid);

static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4b80b167cc9c..fadcfa267ceb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3029,29 +3029,21 @@ static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
if (hugetlb_early_cma(h))
m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
else {
- if (node_exact)
- m = memblock_alloc_exact_nid_raw(huge_page_size(h),
- huge_page_size(h), 0,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
- else {
- m = memblock_alloc_try_nid_raw(huge_page_size(h),
- huge_page_size(h), 0,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ m = memblock_alloc_hugetlb(huge_page_size(h), nid, node_exact);
+ if (m) {
+ m->flags = 0;
+ m->cma = NULL;
+
/*
* For pre-HVO to work correctly, pages need to be on
* the list for the node they were actually allocated
* from. That node may be different in the case of
- * fallback by memblock_alloc_try_nid_raw. So,
+ * fallback by memblock_alloc_hugetlb. So,
* extract the actual node first.
*/
- if (m)
+ if (!node_exact)
listnode = early_pfn_to_nid(PHYS_PFN(__pa(m)));
}
-
- if (m) {
- m->flags = 0;
- m->cma = NULL;
- }
}

if (m) {
diff --git a/mm/memblock.c b/mm/memblock.c
index 6349c48154f4..131e54dd5d8d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1506,6 +1506,32 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}

+/*
+ * Finish a boot allocation of @size bytes at @start: register it with
+ * kmemleak when @leaktrace is true, then accept the memory.
+ */
+static void __init memblock_prep_allocation(phys_addr_t start, phys_addr_t size,
+					    bool leaktrace)
+{
+	/*
+	 * Callers such as kasan_init() and early_pgtable_alloc() skip
+	 * kmemleak due to high volume. Memblock allocated blocks are never
+	 * reported as leaks, because many of them are only referred to via
+	 * the physical address, which is not looked up by kmemleak.
+	 */
+	if (leaktrace)
+		kmemleak_alloc_phys(start, size, 0);
+
+	/*
+	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
+	 * require memory to be accepted before it can be used by the
+	 * guest.
+	 *
+	 * Accept the memory of the allocated buffer.
+	 */
+	accept_memory(start, size);
+}
+
/**
* memblock_alloc_range_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
@@ -1580,28 +1606,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
return 0;

done:
- /*
- * Skip kmemleak for those places like kasan_init() and
- * early_pgtable_alloc() due to high volume.
- */
- if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
- /*
- * Memblock allocated blocks are never reported as
- * leaks. This is because many of these blocks are
- * only referred via the physical address which is
- * not looked up by kmemleak.
- */
- kmemleak_alloc_phys(found, size, 0);
-
- /*
- * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
- * require memory to be accepted before it can be used by the
- * guest.
- *
- * Accept the memory of the allocated buffer.
- */
- accept_memory(found, size);
-
+ memblock_prep_allocation(found, size, end != MEMBLOCK_ALLOC_NOLEAKTRACE);
return found;
}

@@ -1756,6 +1761,69 @@ void * __init memblock_alloc_try_nid_raw(
false);
}

+/**
+ * memblock_alloc_hugetlb - allocate boot memory for HugeTLB pages
+ * @size: size of the memory to be allocated in bytes
+ * @nid: nid of the free memory to find, %NUMA_NO_NODE for any node
+ * @exact_nid: only allocate from the specified nid. If %false, the specified
+ *             nid is tried first, and then all nodes are tried as fallback.
+ *
+ * HugeTLB pages are always aligned by their size, so the alignment matches
+ * @size. Since the memory is for userspace, mirrored memory is not used. The
+ * memory is not zeroed. Does not panic if request cannot be satisfied.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, %NULL on failure.
+ */
+void * __init memblock_alloc_hugetlb(phys_addr_t size, int nid, bool exact_nid)
+{
+	enum memblock_flags flags = choose_memblock_flags();
+	phys_addr_t addr, start = 0, end = MEMBLOCK_ALLOC_ACCESSIBLE;
+
+	memblock_dbg("%s: %llu bytes, nid=%d, exact_nid=%d %pS\n", __func__,
+		     (u64)size, nid, exact_nid, (void *)_RET_IP_);
+
+	/* Don't waste mirrored memory on HugeTLB pages. */
+	flags &= ~MEMBLOCK_MIRROR;
+retry:
+	/* HugeTLB pages are always aligned by their size. */
+	addr = memblock_find_in_range_node(size, size, start, end, nid, flags);
+	if (addr)
+		goto found;
+
+	/* Try all nodes if allowed, restarting from the full range. */
+	if (numa_valid_node(nid) && !exact_nid) {
+		nid = NUMA_NO_NODE;
+		start = 0;
+		end = MEMBLOCK_ALLOC_ACCESSIBLE;
+		goto retry;
+	}
+
+	return NULL;
+
+found:
+	/*
+	 * HugeTLB pages can be preserved with KHO and no preserved memory can
+	 * be in scratch. So retry if found address overlaps with scratch,
+	 * moving the bound the search direction starts from past the overlap:
+	 * raise start for bottom-up, lower end for top-down. Scratch areas are
+	 * normally not very large, so this shouldn't take too many retries.
+	 */
+	if (kho_scratch_overlap(addr, size)) {
+		if (memblock_bottom_up())
+			start = addr + size;
+		else
+			end = addr;
+		goto retry;
+	}
+
+	if (__memblock_reserve(addr, size, nid, MEMBLOCK_RSRV_KERN | MEMBLOCK_RSRV_HUGETLB))
+		return NULL;
+
+	memblock_prep_allocation(addr, size, true);
+	return phys_to_virt(addr);
+}
+
/**
* memblock_alloc_try_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
--
2.54.0.1032.g2f8565e1d1-goog