Re: [RFC PATCH 13/18] mm: add try_to_free_user_pte() helper

From: Qi Zheng
Date: Sat Apr 30 2022 - 09:36:24 EST




On 2022/4/29 9:35 PM, Qi Zheng wrote:
Normally, the percpu_ref of the user PTE page table page is in
percpu mode. This patch adds try_to_free_user_pte(), which switches
the percpu_ref to atomic mode and checks whether it is 0. If the
percpu_ref is 0, no one is using the user PTE page table page, so
we can safely reclaim it.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
include/linux/pte_ref.h | 7 +++
mm/pte_ref.c | 99 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index bfe620038699..379c3b45a6ab 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -16,6 +16,8 @@ void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
bool pte_tryget(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
void __pte_put(pgtable_t page);
void pte_put(pte_t *ptep);
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ bool switch_back);
#else /* !CONFIG_FREE_USER_PTE */
@@ -47,6 +49,11 @@ static inline void pte_put(pte_t *ptep)
{
}
+static inline void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, bool switch_back)
+{
+}
+
#endif /* CONFIG_FREE_USER_PTE */
#endif /* _LINUX_PTE_REF_H */
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 5b382445561e..bf9629272c71 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -8,6 +8,9 @@
#include <linux/pte_ref.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#ifdef CONFIG_FREE_USER_PTE
@@ -44,8 +47,6 @@ void pte_ref_free(pgtable_t pte)
kfree(ref);
}
-void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) {}
-
/*
* pte_tryget - try to get the pte_ref of the user PTE page table page
* @mm: pointer the target address space
@@ -102,4 +103,98 @@ void pte_put(pte_t *ptep)
}
EXPORT_SYMBOL(pte_put);
+/*
+ * pte_free_debug - sanity-check a PTE page table page before it is freed
+ * @pmd: PMD entry value referring to the PTE page table page
+ *
+ * With CONFIG_DEBUG_VM, walk all PTRS_PER_PTE entries of the table and BUG
+ * if any of them is not pte_none(). Without it, this is an empty stub.
+ *
+ * Both variants are file-local (static) so the helper does not leak into
+ * the global namespace or trigger a missing-prototype warning.
+ */
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+ pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+ int i;
+
+ /* Index instead of post-incrementing inside the BUG_ON() argument. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ BUG_ON(!pte_none(ptep[i]));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+/*
+ * pte_free_rcu - RCU callback that releases a PTE page table page
+ * @rcu: rcu_head embedded in the struct page being released
+ *
+ * Recovers the page from @rcu, runs pgtable_pte_page_dtor() on it and then
+ * frees it.
+ */
+static inline void pte_free_rcu(struct rcu_head *rcu)
+{
+ struct page *pt_page;
+
+ pt_page = container_of(rcu, struct page, rcu_head);
+ pgtable_pte_page_dtor(pt_page);
+ __free_page(pt_page);
+}
+
+/*
+ * free_user_pte - free the user PTE page table page
+ * @mm: pointer to the target address space
+ * @pmd: pointer to a PMD
+ * @addr: start address of the tlb range to be flushed
+ *
+ * Under the PMD lock, read the PMD entry and return without doing anything
+ * if it is none or a leaf. Otherwise clear the entry and flush the TLB for
+ * [addr, addr + PMD_SIZE) before dropping the lock. After unlocking, run
+ * pte_free_debug() on the saved entry, call mm_dec_nr_ptes() and pass the
+ * page's rcu_head to call_rcu() with pte_free_rcu() as the callback.
+ *
+ * Context: The pmd range has been unmapped and TLB purged. And the user PTE
+ * page table page will be freed by rcu handler.
+ */
+void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ spinlock_t *ptl;
+ pmd_t pmdval;
+
+ ptl = pmd_lock(mm, pmd);
+ pmdval = *pmd;
+ if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+ spin_unlock(ptl);
+ return;
+ }
+ pmd_clear(pmd);
+ flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+ spin_unlock(ptl);
+
+ pte_free_debug(pmdval);
+ mm_dec_nr_ptes(mm);
+ call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
+}
+
+/*
+ * try_to_free_user_pte - try to free the user PTE page table page
+ * @mm: pointer to the target address space
+ * @pmd: pointer to a PMD
+ * @addr: virtual address associated with pmd
+ * @switch_back: indicates if switching back to percpu mode is required
+ *
+ * Does nothing for init_mm or when pte_tryget() fails. Otherwise the pte_ref
+ * is switched to atomic mode and the reference taken here is dropped under
+ * rcu_read_lock(). If the count is then zero, free_user_pte() is called with
+ * @addr & PMD_MASK. If it is non-zero and @switch_back is set, a new
+ * reference is taken with pte_tryget(), the pte_ref is switched back to
+ * percpu mode, and that reference is dropped again.
+ *
+ * NOTE(review): the switch_back path reuses the 'pte' value read before
+ * rcu_read_unlock() instead of re-reading *pmd after the second
+ * pte_tryget() - confirm the PMD cannot refer to a different page by then.
+ */
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ bool switch_back)
+{
+ pgtable_t pte;
+
+ if (&init_mm == mm)
+ return;
+
+ if (!pte_tryget(mm, pmd, addr))
+ return;
+ pte = pmd_pgtable(*pmd);
+ percpu_ref_switch_to_atomic_sync(pte->pte_ref);
+ rcu_read_lock();
+ /*
+ * Here we can safely put the pte_ref because we already hold the rcu
+ * lock, which guarantees that the user PTE page table page will not
+ * be released.
+ */
+ __pte_put(pte);
+ if (percpu_ref_is_zero(pte->pte_ref)) {
+ rcu_read_unlock();
+ free_user_pte(mm, pmd, addr & PMD_MASK);
+ return;
+ }
+ rcu_read_unlock();
+
+ if (switch_back) {
+ if (pte_tryget(mm, pmd, addr)) {
+ percpu_ref_switch_to_percpu(pte->pte_ref);
+ __pte_put(pte);
+ }
+ }

We shouldn't switch back to percpu mode here, it will drastically reduce
performance.

+}
+
#endif /* CONFIG_FREE_USER_PTE */

--
Thanks,
Qi