[PATCH 15/24] huge tmpfs: rework page_referenced_one and try_to_unmap_one

From: Hugh Dickins
Date: Fri Feb 20 2015 - 23:15:15 EST


page_referenced_one() currently decides whether to go the huge
pmd route or the small pte route by looking at PageTransHuge(page).
But with huge tmpfs pages simultaneously mappable as small and as huge,
it cannot be deduced from the page flags which of the two applies to a
given mapping. And the "helpers" page_check_address(),
page_check_address_pmd() and mm_find_pmd() are designed to hide the
information we need now, instead of helping.

Open-code the page table walk (as it once was) through pgd, pud, pmd
and pte: read *pmd speculatively, and if it appears to be pmd_trans_huge,
then acquire pmd_lock and recheck, retrying if the pmd has changed.
The same code is then valid for anon THP and for huge tmpfs, without
any page flag test.

Copy from this template in try_to_unmap_one(), to prepare for its
use on huge tmpfs pages (whereas anon THPs have already been split in
add_to_swap() before getting here); with a stub for unmap_team_by_pmd()
until a later patch implements it. But unlike page_referenced_one(),
here we must allow for hugetlbfs pages (including non-pmd-based ones),
so must still use huge_pte_offset instead of pmd_trans_huge for those.

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
include/linux/pageteam.h | 6 +
mm/rmap.c | 158 +++++++++++++++++++++++++++++--------
2 files changed, 133 insertions(+), 31 deletions(-)

--- thpfs.orig/include/linux/pageteam.h 2015-02-20 19:34:06.224004747 -0800
+++ thpfs/include/linux/pageteam.h 2015-02-20 19:34:37.851932430 -0800
@@ -29,4 +29,10 @@ static inline struct page *team_head(str
return head;
}

+/* Temporary stub for mm/rmap.c until implemented in mm/huge_memory.c */
+static inline void unmap_team_by_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd, struct page *page)
+{
+}
+
#endif /* _LINUX_PAGETEAM_H */
--- thpfs.orig/mm/rmap.c 2015-02-20 19:33:51.496038422 -0800
+++ thpfs/mm/rmap.c 2015-02-20 19:34:37.851932430 -0800
@@ -44,6 +44,7 @@

#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/pageteam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
@@ -607,7 +608,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
+ pmd_t pmdval;

pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -620,12 +621,12 @@ pmd_t *mm_find_pmd(struct mm_struct *mm,
pmd = pmd_offset(pud, address);
/*
* Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
+ * without locking out concurrent rmap lookups. So when looking for a
+ * pmd entry, in which to find a pte, test present and !THP together.
*/
- pmde = *pmd;
+ pmdval = *pmd;
barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ if (!pmd_present(pmdval) || pmd_trans_huge(pmdval))
pmd = NULL;
out:
return pmd;
@@ -718,22 +719,41 @@ static int page_referenced_one(struct pa
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pmd_t pmdval;
+ pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
struct page_referenced_arg *pra = arg;

- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;

- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+
+ pmd = pmd_offset(pud, address);
+again:
+ /* See comment in mm_find_pmd() for why we use pmdval+barrier here */
+ pmdval = *pmd;
+ barrier();
+ if (!pmd_present(pmdval))
+ return SWAP_AGAIN;
+
+ if (pmd_trans_huge(pmdval)) {
+ if (pmd_page(pmdval) != page)
return SWAP_AGAIN;

+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_same(*pmd, pmdval)) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
if (vma->vm_flags & VM_LOCKED) {
spin_unlock(ptl);
pra->vm_flags |= VM_LOCKED;
@@ -745,15 +765,22 @@ static int page_referenced_one(struct pa
referenced++;
spin_unlock(ptl);
} else {
- pte_t *pte;
+ pte = pte_offset_map(pmd, address);

- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
+ /* Make a quick check before getting the lock */
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
return SWAP_AGAIN;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+
+ if (!pte_present(*pte) ||
+ page_to_pfn(page) != pte_pfn(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }

if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
@@ -1179,15 +1206,84 @@ static int try_to_unmap_one(struct page
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pmd_t pmdval;
+ pte_t *pte = NULL;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;

- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- goto out;
+ if (unlikely(PageHuge(page))) {
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return ret;
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ goto check;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return ret;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return ret;
+
+ pmd = pmd_offset(pud, address);
+again:
+ /* See comment in mm_find_pmd() for why we use pmdval+barrier here */
+ pmdval = *pmd;
+ barrier();
+ if (!pmd_present(pmdval))
+ return ret;
+
+ if (pmd_trans_huge(pmdval)) {
+ if (pmd_page(pmdval) != page)
+ return ret;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_same(*pmd, pmdval)) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ if (!(flags & TTU_IGNORE_MLOCK)) {
+ if (vma->vm_flags & VM_LOCKED)
+ goto out_mlock;
+ if (flags & TTU_MUNLOCK)
+ goto out_unmap;
+ }
+ if (!(flags & TTU_IGNORE_ACCESS) &&
+ pmdp_clear_flush_young_notify(vma, address, pmd)) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+
+ spin_unlock(ptl);
+ unmap_team_by_pmd(vma, address, pmd, page);
+ return ret;
+ }
+
+ pte = pte_offset_map(pmd, address);
+
+ /* Make a quick check before getting the lock */
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return ret;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+check:
+ spin_lock(ptl);
+
+ if (!pte_present(*pte) ||
+ page_to_pfn(page) != pte_pfn(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+ }

/*
* If the page is mlock()d, we cannot swap it out.
@@ -1197,7 +1293,6 @@ static int try_to_unmap_one(struct page
if (!(flags & TTU_IGNORE_MLOCK)) {
if (vma->vm_flags & VM_LOCKED)
goto out_mlock;
-
if (flags & TTU_MUNLOCK)
goto out_unmap;
}
@@ -1287,16 +1382,17 @@ static int try_to_unmap_one(struct page
page_cache_release(page);

out_unmap:
- pte_unmap_unlock(pte, ptl);
+ spin_unlock(ptl);
+ if (pte)
+ pte_unmap(pte);
if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
mmu_notifier_invalidate_page(mm, address);
-out:
return ret;

out_mlock:
- pte_unmap_unlock(pte, ptl);
-
-
+ spin_unlock(ptl);
+ if (pte)
+ pte_unmap(pte);
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/