[RFC PATCH 1/1] powerpc/pgtable: Skip serialize_against_pte_lookup() when unmapping

From: Leonardo Bras
Date: Mon Jan 13 2020 - 16:12:18 EST


If a process (qemu) with a lot of CPUs (128) tries to munmap() a large
chunk of memory (496GB) mapped with THP, it takes an average of 275
seconds, which can cause a lot of problems for the workload (in the
qemu case, the guest will lock up for this time).

Trying to find the source of this bug, I found out that most of this
time is spent in serialize_against_pte_lookup(). This function will take
a lot of time in smp_call_function_many() if there are more than a
couple of CPUs running the user process. Since it has to happen for
every THP mapped, it will take a very long time for large amounts of
memory.

According to the docs, serialize_against_pte_lookup() is needed to
prevent the pmd_t to pte_t casting inside find_current_mm_pte(), or any
lockless pagetable walk, from happening concurrently with THP
splitting/collapsing.

In this case, as the page is being munmapped, there is no need to call
serialize_against_pte_lookup(), given that the page will not be used
during or after munmap.

This patch does so by adding an option to skip serialization in
radix__pmdp_huge_get_and_clear(). This option is used by the proxy
__pmdp_huge_get_and_clear(), which is called with 'unmap == true' from
a (new) arch version of pmdp_huge_get_and_clear_full(), and with
'unmap == false' from pmdp_huge_get_and_clear(), which is used by
generic code.

pmdp_huge_get_and_clear_full() is only called in zap_huge_pmd(), so
it's safe to assume it will always be called on memory that will be
unmapped.

On my workload (qemu: 128vcpus + 500GB), I saw munmap's time drop
from 275 seconds to 39ms.

Signed-off-by: Leonardo Bras <leonardo@xxxxxxxxxxxxx>
---
arch/powerpc/include/asm/book3s/64/pgtable.h | 25 +++++++++++++++++---
arch/powerpc/include/asm/book3s/64/radix.h | 3 ++-
arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +++--
3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..5e3e7c48624a 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1243,14 +1243,21 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);

#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
+static inline pmd_t __pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp,
+ bool unmap)
{
if (radix_enabled())
- return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+ return radix__pmdp_huge_get_and_clear(mm, addr, pmdp, !unmap);
return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
}

+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ return __pmdp_huge_get_and_clear(mm, addr, pmdp, false);
+}
+
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
@@ -1337,6 +1344,18 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
pte_t *, pte_t, pte_t);

+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp,
+ int full)
+{
+ /*
+ * Called only on unmapping
+ */
+ return __pmdp_huge_get_and_clear(mm, address, pmdp, true);
+}
+
/*
* Returns true for a R -> RW upgrade of pte
*/
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..148874aa5260 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -253,7 +253,8 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
+ unsigned long addr, pmd_t *pmdp,
+ bool serialize);
static inline int radix__has_transparent_hugepage(void)
{
/* For radix 2M at PMD level means thp */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 974109bb85db..eac8409cd316 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1007,7 +1007,8 @@ pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
+ unsigned long addr, pmd_t *pmdp,
+ bool serialize)
{
pmd_t old_pmd;
unsigned long old;
@@ -1024,7 +1025,8 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
* different code paths. So make sure we wait for the parallel
* find_current_mm_pte to finish.
*/
- serialize_against_pte_lookup(mm);
+ if (serialize)
+ serialize_against_pte_lookup(mm);
return old_pmd;
}

--
2.24.1