[PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY

From: Mina Almasry
Date: Fri May 21 2021 - 03:44:41 EST


The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
an index for which we already have a page in the cache. When this
happens, we allocate a second page, double consuming the reservation,
and then fail to insert the page into the cache and return -EEXIST.

To fix this, we first if there exists a page in the cache which already
consumed the reservation, and return -EEXIST immediately if so.

Secondly, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller
after allocating a page that consumed a reservation. In this case there
may be a fault that double consumes the reservation. To handle this, we
free the allocated page, fix the reservations, and allocate a temporary
hugetlb page and return that to the caller. When the caller does the
copy outside of the lock, we again check the cache, and allocate a page
consuming the reservation, and copy over the contents.

Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings. After the test runs
number of free/resv hugepages is correct.

Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>
Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx

---
include/linux/hugetlb.h | 4 ++
mm/hugetlb.c | 103 ++++++++++++++++++++++++++++++++++++----
mm/migrate.c | 39 +++------------
3 files changed, 103 insertions(+), 43 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b92f25ccef58..427974510965 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -194,6 +194,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

+void hugetlb_copy_page(struct page *dst, struct page *src);
+
#else /* !CONFIG_HUGETLB_PAGE */

static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -379,6 +381,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

+static inline void hugetlb_copy_page(struct page *dst, struct page *src);
+
#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..cb041c97a558 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -81,6 +81,45 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page. We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+ int nr_pages)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < nr_pages;) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void hugetlb_copy_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ int nr_pages = pages_per_huge_page(h);
+
+ if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+ __copy_gigantic_page(dst, src, nr_pages);
+ return;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
if (spool->count)
@@ -4868,19 +4907,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;
-
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+ int node = huge_node(dst_vma, dst_addr, gfp_mask, &mpol, &nodemask);

if (is_continue) {
ret = -EFAULT;
@@ -4888,7 +4928,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page))
goto out;
@@ -4900,12 +4947,48 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
+ /* Free the allocated page which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!HPageRestoreReserve(page)) {
+ if (unlikely(hugetlb_unreserve_pages(
+ mapping->host, idx, idx + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ mapping->host);
+ }
+ put_page(page);
+
+ /* Allocate a temporary page to hold the copied
+ * contents.
+ */
+ page = alloc_migrate_huge_page(h, gfp_mask, node,
+ nodemask);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- /* don't free the page */
+ /* Set the outparam pagep and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * page.
+ */
goto out;
}
} else {
- page = *pagep;
+ if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ put_page(*pagep);
+ ret = -EEXIST;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ __copy_gigantic_page(page, *pagep, pages_per_huge_page(h));
+ put_page(*pagep);
*pagep = NULL;
}

diff --git a/mm/migrate.c b/mm/migrate.c
index 6b37d00890ca..d3437f9a608d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -528,28 +528,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return MIGRATEPAGE_SUCCESS;
}

-/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page. We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
- int nr_pages)
-{
- int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < nr_pages; ) {
- cond_resched();
- copy_highpage(dst, src);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
static void copy_huge_page(struct page *dst, struct page *src)
{
int i;
@@ -557,19 +535,14 @@ static void copy_huge_page(struct page *dst, struct page *src)

if (PageHuge(src)) {
/* hugetlbfs page */
- struct hstate *h = page_hstate(src);
- nr_pages = pages_per_huge_page(h);
-
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
- __copy_gigantic_page(dst, src, nr_pages);
- return;
- }
- } else {
- /* thp page */
- BUG_ON(!PageTransHuge(src));
- nr_pages = thp_nr_pages(src);
+ hugetlb_copy_page(dst, src);
+ return;
}

+ /* thp page */
+ BUG_ON(!PageTransHuge(src));
+ nr_pages = thp_nr_pages(src);
+
for (i = 0; i < nr_pages; i++) {
cond_resched();
copy_highpage(dst + i, src + i);
--
2.31.1.818.g46aad6cb9e-goog