[PATCH 6/8] hugetlb: vmemmap optimizations when demoting hugetlb pages

From: Mike Kravetz
Date: Mon Aug 16 2021 - 18:50:48 EST


When demoting a hugetlb page with optimized vmemmap, we allocate vmemmap
for the entire address range represented by the huge page. Then, we
split the huge page into huge pages of a smaller size. When preparing
these smaller huge pages for use, we optimize vmemmap for their
associated address range and free vmemmap pages.

It is sub-optimal to allocate all the vmemmap pages associated with the
original page and then free most of those pages when preparing the
smaller huge pages after the split. Instead, calculate the number of
vmemmap pages needed after the split and just allocate that number of
pages. Then, only populate those areas of the vmemmap required for the
smaller huge pages.

Introduce two new routines:
- demote_huge_page_vmemmap - This has knowledge of the hugetlb demote
process and will calculate the number of vmemmap pages needed for the
smaller huge pages after the split. It also creates a 'demote_mask'
that is used to indicate where within the address range of the
original huge page the smaller huge pages will reside. This mask is
used to map vmemmap pages within the range.
- vmemmap_remap_demote - This is the routine which actually allocates
the vmemmap pages and performs vmemmap manipulations based on the
passed demote_mask.

These routines will be used in a subsequent patch.

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
include/linux/mm.h | 4 ++
mm/hugetlb_vmemmap.c | 60 +++++++++++++++++++++
mm/hugetlb_vmemmap.h | 6 +++
mm/sparse-vmemmap.c | 123 +++++++++++++++++++++++++++++++++++++++++--
4 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4df7b0a437a8..5302ab4aa260 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3172,6 +3172,10 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse);
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask);
+int vmemmap_remap_demote(unsigned long start, unsigned long end,
+ unsigned long reuse, unsigned long demote_nr_pages,
+ unsigned long demote_mask,
+ unsigned long demote_map_pages, gfp_t gfp_mask);

void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c540c21e26f5..c82d60398c16 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -264,6 +264,66 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
SetHPageVmemmapOptimized(head);
}

+/*
+ * Allocate and map vmemmap pages so that a range which represented one
+ * huge page now represents a set of smaller huge pages. Returns 0 if @head
+ * is not vmemmap optimized, else the result of vmemmap_remap_demote().
+ */
+int demote_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ int ret;
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+ unsigned long demote_mask;
+ unsigned long demote_nr_pages;
+ struct hstate *target;
+
+ VM_BUG_ON(!h->demote_order);
+ if (!HPageVmemmapOptimized(head))
+ return 0;
+
+ target = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+ /* Number of vmemmap pages to allocate in order to demote the page */
+ demote_nr_pages = pages_per_huge_page(h) / pages_per_huge_page(target);
+ demote_nr_pages *= RESERVE_VMEMMAP_NR;
+ demote_nr_pages -= RESERVE_VMEMMAP_NR; /* pages currently present */
+
+ /*
+ * mask to identify where within the range new smaller pages will
+ * reside. This will be used to map new vmemmap pages.
+ */
+ demote_mask = ((unsigned long) pages_per_huge_page(target) *
+ sizeof(struct page)) - 1;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ /*
+ * The range [@vmemmap_addr, @vmemmap_end) represents a single huge
+ * page of size h->order. It is vmemmap optimized and is only mapped
+ * with RESERVE_VMEMMAP_NR pages. The huge page will be split into
+ * multiple pages of a smaller size (h->demote_order). vmemmap pages
+ * must be allocated for each of these smaller size pages and
+ * appropriately mapped.
+ */
+ ret = vmemmap_remap_demote(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+ demote_nr_pages, demote_mask,
+ RESERVE_VMEMMAP_NR,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+
+ if (!ret) { /* flag each new head page; @head itself already is */
+ int i;
+
+ for (i = pages_per_huge_page(target);
+ i < pages_per_huge_page(h);
+ i += pages_per_huge_page(target))
+ SetHPageVmemmapOptimized(head + i);
+ }
+
+ return ret;
+}
+
void __init hugetlb_vmemmap_init(struct hstate *h)
{
unsigned int nr_pages = pages_per_huge_page(h);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index cb2bef8f9e73..44382504efc3 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -13,6 +13,7 @@
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+int demote_huge_page_vmemmap(struct hstate *h, struct page *head);
void hugetlb_vmemmap_init(struct hstate *h);

/*
@@ -33,6 +34,11 @@ static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
{
}

+static inline int demote_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ return 0; /* no-op stub; presumably the !CONFIG_HUGETLB_PAGE_FREE_VMEMMAP case */
+}
+
static inline void hugetlb_vmemmap_init(struct hstate *h)
{
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bdce883f9286..ac2681bf006b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,6 +43,15 @@
* @reuse_addr: the virtual address of the @reuse_page page.
* @vmemmap_pages: the list head of the vmemmap pages that can be freed
* or is mapped from.
+ * @demote_mask: demote specific. mask to know virtual address of
+ * where to start mapping pages during a demote operation.
+ * @demote_map_pages: demote specific. number of pages which are mapped
+ * for each demoted page.
+ * @demote_tmp_count: demote specific. number of pages still to be
+ * mapped for the current demoted page.
+ * @demote_tmp_addr: demote specific. when more than one page must be
+ * mapped for each demoted size page, virtual address
+ * of the next page to be mapped.
*/
struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -51,6 +60,10 @@ struct vmemmap_remap_walk {
struct page *reuse_page;
unsigned long reuse_addr;
struct list_head *vmemmap_pages;
+ unsigned long demote_mask;
+ unsigned long demote_map_pages;
+ unsigned long demote_tmp_count;
+ unsigned long demote_tmp_addr;
};

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
@@ -262,6 +275,51 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

+/* Per-PTE callback for vmemmap_remap_demote(); see the branch comments. */
+static void vmemmap_demote_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ if (!(addr & walk->demote_mask)) {
+ /* head page: first vmemmap page of a demoted page */
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+ /*
+ * After mapping the head page, arm demote_tmp_addr for the
+ * following tail page to be mapped (if any).
+ * NOTE(review): with demote_map_pages == 1 walk->reuse_page
+ * is not updated for this demoted page - confirm intended.
+ */
+ walk->demote_tmp_count = walk->demote_map_pages;
+ if (--walk->demote_tmp_count)
+ walk->demote_tmp_addr = addr + PAGE_SIZE;
+ } else if (addr == walk->demote_tmp_addr) {
+ /* tail page that gets its own newly allocated vmemmap page */
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+ if (--walk->demote_tmp_count) {
+ walk->demote_tmp_addr = addr + PAGE_SIZE;
+ } else {
+ walk->demote_tmp_addr = 0UL;
+ /* remaining tail pages mapped to this page */
+ walk->reuse_page = page;
+ }
+ } else {
+ /* remaining tail pages: map read-only to walk->reuse_page */
+ set_pte_at(&init_mm, addr, pte,
+ mk_pte(walk->reuse_page, PAGE_KERNEL_RO));
+ }
+}
+
/**
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
* to the page which @reuse is mapped to, then free vmemmap
@@ -327,11 +385,9 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
return ret;
}

-static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+static int __alloc_vmemmap_pages(unsigned long nr_pages, int nid,
gfp_t gfp_mask, struct list_head *list)
{
- unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
- int nid = page_to_nid((struct page *)start);
struct page *page, *next;

while (nr_pages--) {
@@ -348,6 +404,21 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
return -ENOMEM;
}

+static int alloc_vmemmap_pages(unsigned long nr_pages, int nid,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ return __alloc_vmemmap_pages(nr_pages, nid, gfp_mask, list); /* thin wrapper */
+}
+
+static int alloc_vmemmap_range(unsigned long start, unsigned long end,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT; /* page count */
+ int nid = page_to_nid((struct page *)start); /* node of @start */
+
+ return __alloc_vmemmap_pages(nr_pages, nid, gfp_mask, list);
+}
+
/**
* vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
* to the page which is from the @vmemmap_pages
@@ -374,7 +445,51 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE);

- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ if (alloc_vmemmap_range(start, end, gfp_mask, &vmemmap_pages))
+ return -ENOMEM;
+
+ mmap_read_lock(&init_mm);
+ vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return 0;
+}
+
+/**
+ * vmemmap_remap_demote - remap the optimized vmemmap virtual address range
+ * for a huge page to accommodate splitting that huge
+ * page into pages of a smaller size. That smaller size
+ * is specified by demote_mask.
+ * @start: start address of the vmemmap virtual address range to remap
+ * for smaller pages.
+ * @end: end address of the vmemmap virtual address range to remap.
+ * @reuse: reuse address.
+ * @demote_nr_pages: number of vmemmap pages to allocate for remapping.
+ * @demote_mask: mask specifying where to perform remapping within the passed
+ * range.
+ * @demote_map_pages: number of pages to map for each demoted page
+ * @gfp_mask: GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vmemmap_remap_demote(unsigned long start, unsigned long end,
+ unsigned long reuse, unsigned long demote_nr_pages,
+ unsigned long demote_mask,
+ unsigned long demote_map_pages, gfp_t gfp_mask)
+{
+ LIST_HEAD(vmemmap_pages);
+ int nid = page_to_nid((struct page *)start);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_demote_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ .demote_mask = demote_mask,
+ .demote_map_pages = demote_map_pages,
+ };
+
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+ if (alloc_vmemmap_pages(demote_nr_pages, nid, gfp_mask, &vmemmap_pages))
return -ENOMEM;

mmap_read_lock(&init_mm);
--
2.31.1