diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 57ff287caf6b..1e7e6543ebca 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has been already unlinked
+ * from the page table tree and this process cannot get
+ * additional pins on the page.
I'd recommend this wording instead, for the last two lines:
* from the page table tree. Therefore, this page will not
* normally receive any additional pins.
I guess I'm not clear enough.
The point is that the page cannot get any new pins from this process. It
can get a new pin from another process after the check. But that is fine because
if the page is mapped multiple times it has to be write-protected (CoW
after fork()), and we can rely on the page's content not changing under
us.
Does it make sense? Wording suggestions are welcome.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not for this process. It is fine. The other process
+ * cannot write to the page, only trigger CoW.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (total_mapcount(page) + PageSwapCache(page) !=
+ page_count(page)) {
I think it's time to put that logic ("does this page have any extra references")
into a small function. It's already duplicated once below. And the documentation is
duplicated as well.
Fair enough.
But the comments have to stay where they are, because the context is
different. The first time we check speculatively, before the page table is
unlinked from the page table tree, and this check is inherently racy,
unlike the second one.
I took a quick peek at this patch because, after adding pin_user_pages*() APIs earlier
to complement get_user_pages*(), I had a moment of doubt here: what if I'd done it in
a way that required additional logic here? Fortunately, that's not the case: all
pin_user_pages() calls on huge pages take a "primary/real" refcount, in addition
to scribbling into the compound_pincount_ptr() area. whew. :)
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
@@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
@@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- /*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
- */
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ /* Check if the page has any GUP (or other external) pins */
+ if (total_mapcount(page) + PageSwapCache(page) !=
+ page_count(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}