[RFC PATCH 19/26] hugetlb: add HGM support for copy_hugetlb_page_range

From: James Houghton
Date: Fri Jun 24 2022 - 13:39:01 EST


This allows fork() to work with high-granularity mappings. The page
table structure is copied such that partially mapped regions will remain
partially mapped in the same way for the new process.

Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
mm/hugetlb.c | 74 +++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aadfcee947cf..0ec2f231524e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4851,7 +4851,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
- struct page *ptepage;
+ struct hugetlb_pte src_hpte, dst_hpte;
+ struct page *ptepage, *hpage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
@@ -4878,17 +4879,44 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}

- for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+ addr = src_vma->vm_start;
+ while (addr < src_vma->vm_end) {
spinlock_t *src_ptl, *dst_ptl;
+ unsigned long hpte_sz;
src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
+ if (!src_pte) {
+ addr += sz;
continue;
+ }
dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
}

+ hugetlb_pte_populate(&src_hpte, src_pte, huge_page_shift(h));
+ hugetlb_pte_populate(&dst_hpte, dst_pte, huge_page_shift(h));
+
+ if (hugetlb_hgm_enabled(src_vma)) {
+ BUG_ON(!hugetlb_hgm_enabled(dst_vma));
+ ret = hugetlb_walk_to(src, &src_hpte, addr,
+ PAGE_SIZE, /*stop_at_none=*/true);
+ if (ret)
+ break;
+ ret = huge_pte_alloc_high_granularity(
+ &dst_hpte, dst, dst_vma, addr,
+ hugetlb_pte_shift(&src_hpte),
+ HUGETLB_SPLIT_NONE,
+ /*write_locked=*/false);
+ if (ret)
+ break;
+
+ src_pte = src_hpte.ptep;
+ dst_pte = dst_hpte.ptep;
+ }
+
+ hpte_sz = hugetlb_pte_size(&src_hpte);
+
/*
* If the pagetables are shared don't copy or take references.
* dst_pte == src_pte is the common case of src/dest sharing.
@@ -4899,16 +4927,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* after taking the lock below.
*/
dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+ if ((dst_pte == src_pte) || !hugetlb_pte_none(&dst_hpte)) {
+ addr += hugetlb_pte_size(&src_hpte);
continue;
+ }

- dst_ptl = huge_pte_lock(h, dst, dst_pte);
- src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+ dst_ptl = hugetlb_pte_lock(dst, &dst_hpte);
+ src_ptl = hugetlb_pte_lockptr(src, &src_hpte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
again:
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ if (hugetlb_pte_none(&src_hpte) ||
+ !hugetlb_pte_none(&dst_hpte)) {
/*
* Skip if src entry none. Also, skip in the
* unlikely case dst entry !none as this implies
@@ -4931,11 +4962,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (userfaultfd_wp(src_vma) && uffd_wp)
entry = huge_pte_mkuffd_wp(entry);
set_huge_swap_pte_at(src, addr, src_pte,
- entry, sz);
+ entry, hpte_sz);
}
if (!userfaultfd_wp(dst_vma) && uffd_wp)
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ set_huge_swap_pte_at(dst, addr, dst_pte, entry,
+ hpte_sz);
} else if (unlikely(is_pte_marker(entry))) {
/*
* We copy the pte marker only if the dst vma has
@@ -4946,7 +4978,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
- get_page(ptepage);
+ hpage = compound_head(ptepage);
+ get_page(hpage);

/*
* Failing to duplicate the anon rmap is a rare case
@@ -4959,9 +4992,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* sleep during the process.
*/
if (!PageAnon(ptepage)) {
- page_dup_file_rmap(ptepage, true);
+ /* Only dup_rmap once for a page */
+ if (IS_ALIGNED(addr, sz))
+ page_dup_file_rmap(hpage, true);
} else if (page_try_dup_anon_rmap(ptepage, true,
src_vma)) {
+ if (hugetlb_hgm_enabled(src_vma)) {
+ ret = -EINVAL;
+ break;
+ }
+ BUG_ON(!IS_ALIGNED(addr, hugetlb_pte_size(&src_hpte)));
pte_t src_pte_old = entry;
struct page *new;

@@ -4970,13 +5010,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/* Do not use reserve as it's private owned */
new = alloc_huge_page(dst_vma, addr, 1);
if (IS_ERR(new)) {
- put_page(ptepage);
+ put_page(hpage);
ret = PTR_ERR(new);
break;
}
- copy_user_huge_page(new, ptepage, addr, dst_vma,
+ copy_user_huge_page(new, hpage, addr, dst_vma,
npages);
- put_page(ptepage);
+ put_page(hpage);

/* Install the new huge page if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -4994,6 +5034,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
hugetlb_install_page(dst_vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
continue;
}

@@ -5010,10 +5051,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}

set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(npages, dst);
+ hugetlb_count_add(
+ hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+ dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
}

if (cow) {
--
2.37.0.rc0.161.g10f37bed90-goog