[PATCH v6 18/23] mm/hugetlb: Handle uffd-wp during fork()

From: Peter Xu
Date: Mon Nov 15 2021 - 03:04:56 EST


First, we need to pass dst_vma into copy_hugetlb_page_range(), because for
uffd-wp it is the dst vma that decides how uffd-wp protected ptes should be
treated.

We should also recognize pte markers during fork, and copy them into the child
only when the dst vma has uffd-wp enabled.

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
include/linux/hugetlb.h | 7 +++++--
mm/hugetlb.c | 41 +++++++++++++++++++++++++++--------------
mm/memory.c | 2 +-
3 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4c3ea7ee8ce8..6935b02f1081 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -137,7 +137,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr, unsigned long new_addr,
unsigned long len);
-int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
+int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
+ struct vm_area_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
struct page **, struct vm_area_struct **,
unsigned long *, unsigned long *, long, unsigned int,
@@ -268,7 +269,9 @@ static inline struct page *follow_huge_addr(struct mm_struct *mm,
}

static inline int copy_hugetlb_page_range(struct mm_struct *dst,
- struct mm_struct *src, struct vm_area_struct *vma)
+ struct mm_struct *src,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
BUG();
return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 16fb9cd8d9c5..cf9a0e8c32ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4690,23 +4690,24 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
- bool cow = is_cow_mapping(vma->vm_flags);
- struct hstate *h = hstate_vma(vma);
+ bool cow = is_cow_mapping(src_vma->vm_flags);
+ struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;

if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
- vma->vm_start,
- vma->vm_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+ src_vma->vm_start,
+ src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
} else {
/*
@@ -4718,12 +4719,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}

- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+ dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -4758,6 +4759,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
+ bool uffd_wp = huge_pte_uffd_wp(entry);

if (is_writable_migration_entry(swp_entry) && cow) {
/*
@@ -4767,10 +4769,21 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
swp_entry = make_readable_migration_entry(
swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
+ if (userfaultfd_wp(src_vma) && uffd_wp)
+ entry = huge_pte_mkuffd_wp(entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
}
+ if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ entry = huge_pte_clear_uffd_wp(entry);
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ } else if (unlikely(is_pte_marker(entry))) {
+ /*
+ * We copy the pte marker only if the dst vma has
+ * uffd-wp enabled.
+ */
+ if (userfaultfd_wp(dst_vma))
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
@@ -4785,20 +4798,20 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ if (unlikely(page_needs_cow_for_dma(src_vma, ptepage))) {
pte_t src_pte_old = entry;
struct page *new;

spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new = alloc_huge_page(vma, addr, 1);
+ new = alloc_huge_page(dst_vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
- copy_user_huge_page(new, ptepage, addr, vma,
+ copy_user_huge_page(new, ptepage, addr, dst_vma,
npages);
put_page(ptepage);

@@ -4808,13 +4821,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
- restore_reserve_on_error(h, vma, addr,
+ restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
- hugetlb_install_page(vma, dst_pte, addr, new);
+ hugetlb_install_page(dst_vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
diff --git a/mm/memory.c b/mm/memory.c
index 69a73d47513b..89715d1ec956 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1284,7 +1284,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return 0;

if (is_vm_hugetlb_page(src_vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+ return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
--
2.32.0