[PATCH 6/7] mm: defer freeing PTE page table for a grace period

From: Qi Zheng
Date: Sun Jul 18 2021 - 00:31:26 EST


With rcu_read_lock(), the release of the PTE page table
can be postponed. So we don't need to hold the pmd lock
anymore when we do pte_try_get()/pte_alloc_get(), which
can improve performance and simplify code logic.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
fs/proc/task_mmu.c | 8 ++++----
include/linux/pte_ref.h | 28 +++++++++++++++-------------
mm/gup.c | 2 +-
mm/hmm.c | 2 +-
mm/khugepaged.c | 4 ++--
mm/ksm.c | 2 +-
mm/madvise.c | 6 +++---
mm/memcontrol.c | 4 ++--
mm/memory.c | 14 +++++++-------
mm/mempolicy.c | 2 +-
mm/migrate.c | 2 +-
mm/mincore.c | 2 +-
mm/mprotect.c | 2 +-
mm/page_vma_mapped.c | 2 +-
mm/pagewalk.c | 2 +-
mm/pte_ref.c | 10 +++++++++-
mm/swapfile.c | 2 +-
17 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b3cf4b8a91d6..f3c9c984bc29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -584,7 +584,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
goto out;
/*
* The mmap_lock held all the way back in m_start() is what
@@ -1148,7 +1148,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
}

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;

pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1482,7 +1482,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
return 0;
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

- if (!pte_try_get(walk->mm, pmdp))
+ if (!pte_try_get(pmdp))
return 0;

/*
@@ -1824,7 +1824,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
#endif
- if (!pte_try_get(walk->mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 695fbe8b991b..f4d20faab684 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -74,16 +74,17 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
* i_mmap_lock or when parallel threads are excluded by other means
* which can make @pmdp entry stable.
*/
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
{
bool retval = true;
- spinlock_t *ptl;
+ pmd_t pmdval;

- ptl = pmd_lock(mm, pmdp);
- if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
- !pte_get_unless_zero(pmdp))
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmdval) || !pmd_present(pmdval) ||
+ !pte_get_unless_zero(&pmdval))
retval = false;
- spin_unlock(ptl);
+ rcu_read_unlock();

return retval;
}
@@ -129,21 +130,22 @@ static inline void pte_put_vmf(struct vm_fault *vmf)

static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
{
- if (!pte_try_get(mm, pmdp))
+ if (!pte_try_get(pmdp))
return __pte_alloc_try_get(mm, pmdp);
return 1;
}

static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
{
- spinlock_t *ptl;
+ pmd_t pmdval;

- ptl = pmd_lock(mm, pmdp);
- if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
- spin_unlock(ptl);
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pmdp);
+ if (pmd_none(pmdval) || !pte_get_unless_zero(&pmdval)) {
+ rcu_read_unlock();
return __pte_alloc_get(mm, pmdp);
}
- spin_unlock(ptl);
+ rcu_read_unlock();
return 0;
}
#else
@@ -173,7 +175,7 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
return true;
}

-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
{
return true;
}
diff --git a/mm/gup.c b/mm/gup.c
index 3e2a153cb18e..a5be18e349cd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -503,7 +503,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);

- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
return no_page_table(vma, flags);

ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index 29bb379510cc..d0e767c5fbb6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}

- if (!pte_try_get(walk->mm, pmdp))
+ if (!pte_try_get(pmdp))
goto again;

ptep = pte_offset_map(pmdp, addr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e6c4d1b7a12a..c653edd75345 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,7 +1240,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out;
}

- if (!pte_try_get(mm, pmd)) {
+ if (!pte_try_get(pmd)) {
result = SCAN_PMD_NULL;
goto out;
}
@@ -1469,7 +1469,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!pmd)
goto drop_hpage;

- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);

diff --git a/mm/ksm.c b/mm/ksm.c
index 2e106f58dad0..5671683890c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1133,7 +1133,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;

- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto out;

mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 4c4b35292212..0e849bbf268b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -193,7 +193,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;

for (index = start; index != end; index += PAGE_SIZE) {
@@ -396,7 +396,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
#endif
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -596,7 +596,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
goto next;

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
- pmd_trans_unstable(pmd)) || !pte_try_get(mm, pmd))
+ pmd_trans_unstable(pmd)) || !pte_try_get(pmd))
return 0;
nr_put++;

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f19e5f2cd18..f8c1cabdd259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5835,7 +5835,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
}

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6058,7 +6058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
retry:
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 242ed135bde4..c8ee0074c730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1143,7 +1143,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
if (pmd_none_or_clear_bad(src_pmd))
continue;

- if (!pte_try_get(src_mm, src_pmd))
+ if (!pte_try_get(src_pmd))
goto retry;
if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
addr, next)) {
@@ -1481,7 +1481,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
*/
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(tlb->mm, pmd))
+ !pte_try_get(pmd))
goto next;

next = zap_pte_range(tlb, vma, pmd, addr, next, details);
@@ -2608,7 +2608,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
continue;
pmd_clear_bad(pmd);
}
- if (!create && !pte_try_get(mm, pmd))
+ if (!create && !pte_try_get(pmd))
goto retry;
err = apply_to_pte_range(mm, pmd, addr, next,
fn, data, create, mask);
@@ -4078,7 +4078,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
}
} else if (pmd_devmap_trans_unstable(vmf->pmd)) { /* See comment in handle_pte_fault() */
return 0;
- } else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+ } else if (!pte_try_get(vmf->pmd)) {
goto retry;
}

@@ -4319,7 +4319,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
goto out;
} else {
- if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+ if (!pte_try_get(vmf->pmd)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
@@ -4579,7 +4579,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;

- if (!pte_try_get(vmf->vma->vm_mm, vmf->pmd))
+ if (!pte_try_get(vmf->pmd))
goto retry;

if (IS_ENABLED(CONFIG_FREE_USER_PTE))
@@ -5000,7 +5000,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
(address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cbb3640717ff..b19243d8fe56 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -520,7 +520,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
/* THP was split, fall through to pte walk */

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(walk->mm, pmd))
+ !pte_try_get(pmd))
return 0;

mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a94e8558b2c..e1a2169ab9e9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2265,7 +2265,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (unlikely(pmd_bad(*pmdp)))
return migrate_vma_collect_skip(start, end, walk);

- if (!pte_try_get(mm, pmdp))
+ if (!pte_try_get(pmdp))
goto again;
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
arch_enter_lazy_mmu_mode();
diff --git a/mm/mincore.c b/mm/mincore.c
index e21e271a7657..76eb31aaeef9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -115,7 +115,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}

if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(walk->mm, pmd)) {
+ !pte_try_get(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec);
goto out;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9cbd0848c5c5..8b387f8386c4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -319,7 +319,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
/* fall through, the trans huge pmd just split */
}
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
goto retry;
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
cp_flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb84fa5825c0..c49bbff7aa60 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -259,7 +259,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!pte_try_get(pvmw->vma->vm_mm, pvmw->pmd))
+ if (!pte_try_get(pvmw->pmd))
goto retry;
if (!map_pte(pvmw))
goto next_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4080a88d7852..c7439a2e85f7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -152,7 +152,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
} else {
if (!walk->no_vma) {
- if (!pte_try_get(walk->mm, pmd))
+ if (!pte_try_get(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
pte_put(walk->mm, pmd, addr);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 1b8d9828d513..7fd3d687a9cd 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -26,6 +26,14 @@ static inline void pte_free_debug(pmd_t pmd)
}
#endif

+static void pte_free_rcu(struct rcu_head *rcu)
+{
+ struct page *page = container_of(rcu, struct page, rcu_head);
+
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
+}
+
void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
{
struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
@@ -39,7 +47,7 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
pte_free_debug(pmd);
flush_tlb_range(&vma, addr, addr + PMD_SIZE);
mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(pmd));
+ call_rcu(&pmd_pgtable(pmd)->rcu_head, pte_free_rcu);
}

static inline void __pte_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6153283be500..fe6f7c6d2849 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2024,7 +2024,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, type,
frontswap, fs_pages_to_unuse);
--
2.11.0