Re: [PATCH RFC/RFT 3/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings

From: Christophe Leroy
Date: Thu Dec 07 2023 - 11:37:16 EST


The subject says "riscv:" but it changes core part and several arch.
Maybe this commit should be split in two commits, one for API changes
that changes flush_tlb_fix_spurious_fault() to
flush_tlb_fix_spurious_write_fault() and adds
flush_tlb_fix_spurious_read_fault() including the change in memory.c,
then a second patch with the changes to riscv.

Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit :
> The preventive sfence.vma were emitted because new mappings must be made
> visible to the page table walker, either the uarch caches invalid
> entries or not.
>
> Actually, there is no need to preventively sfence.vma on new mappings for
> userspace, this should be handled only in the page fault path.
>
> This allows to drastically reduce the number of sfence.vma emitted:
>
> * Ubuntu boot to login:
> Before: ~630k sfence.vma
> After: ~200k sfence.vma
>
> * ltp - mmapstress01
> Before: ~45k
> After: ~6.3k
>
> * lmbench - lat_pagefault
> Before: ~665k
> After: 832 (!)
>
> * lmbench - lat_mmap
> Before: ~546k
> After: 718 (!)
>
> The only issue with the removal of sfence.vma in update_mmu_cache() is
> that on uarchs that cache invalid entries, those won't be invalidated
> until the process takes a fault: so that's an additional fault in those
> cases.
>
> Signed-off-by: Alexandre Ghiti <alexghiti@xxxxxxxxxxxx>
> ---
> arch/arm64/include/asm/pgtable.h | 2 +-
> arch/mips/include/asm/pgtable.h | 6 +--
> arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 ++--
> arch/riscv/include/asm/pgtable.h | 43 +++++++++++--------
> include/linux/pgtable.h | 8 +++-
> mm/memory.c | 12 +++++-
> 6 files changed, 48 insertions(+), 31 deletions(-)

Did you forget mm/pgtable-generic.c ?

>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 7f7d9b1df4e5..728f25f529a5 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
> * fault on one CPU which has been handled concurrently by another CPU
> * does not need to perform additional invalidation.
> */
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
> +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while (0)

Why do you need to do that change ? Nothing is explained about that in
the commit message.

>
> /*
> * ZERO_PAGE is a global shared page that is always zero: used
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 430b208c0130..84439fe6ed29 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
> return __pgprot(prot);
> }
>
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> - unsigned long address,
> - pte_t *ptep)
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> + unsigned long address,
> + pte_t *ptep)
> {
> }
>
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 1950c1b825b4..7166d56f90db 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
> #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr)
> #endif /* CONFIG_SMP */
>
> -#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> - unsigned long address,
> - pte_t *ptep)
> +#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct *vma,
> + unsigned long address,
> + pte_t *ptep)
> {
> /*
> * Book3S 64 does not require spurious fault flushes because the PTE
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index b2ba3f79cfe9..89aa5650f104 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
> struct vm_area_struct *vma, unsigned long address,
> pte_t *ptep, unsigned int nr)
> {
> - /*
> - * The kernel assumes that TLBs don't cache invalid entries, but
> - * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
> - * cache flush; it is necessary even after writing invalid entries.
> - * Relying on flush_tlb_fix_spurious_fault would suffice, but
> - * the extra traps reduce performance. So, eagerly SFENCE.VMA.
> - */
> - while (nr--)
> - local_flush_tlb_page(address + nr * PAGE_SIZE);
> }
> #define update_mmu_cache(vma, addr, ptep) \
> update_mmu_cache_range(NULL, vma, addr, ptep, 1)
>
> #define __HAVE_ARCH_UPDATE_MMU_TLB
> -#define update_mmu_tlb update_mmu_cache
> +static inline void update_mmu_tlb(struct vm_area_struct *vma,
> + unsigned long address, pte_t *ptep)
> +{
> + flush_tlb_range(vma, address, address + PAGE_SIZE);
> +}
>
> static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp)
> {
> - pte_t *ptep = (pte_t *)pmdp;
> -
> - update_mmu_cache(vma, address, ptep);
> }
>
> #define __HAVE_ARCH_PTE_SAME
> @@ -548,13 +540,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> unsigned long address, pte_t *ptep,
> pte_t entry, int dirty)
> {
> - if (!pte_same(*ptep, entry))
> + if (!pte_same(*ptep, entry)) {
> __set_pte_at(ptep, entry);
> - /*
> - * update_mmu_cache will unconditionally execute, handling both
> - * the case that the PTE changed and the spurious fault case.
> - */
> - return true;
> + /* Here only not svadu is impacted */
> + flush_tlb_page(vma, address);
> + return true;
> + }
> +
> + return false;
> +}
> +
> +extern u64 nr_sfence_vma_handle_exception;
> +extern bool tlb_caching_invalid_entries;
> +
> +#define flush_tlb_fix_spurious_read_fault flush_tlb_fix_spurious_read_fault
> +static inline void flush_tlb_fix_spurious_read_fault(struct vm_area_struct *vma,
> + unsigned long address,
> + pte_t *ptep)
> +{
> + if (tlb_caching_invalid_entries)
> + flush_tlb_page(vma, address);
> }
>
> #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index af7639c3b0a3..7abaf42ef612 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -931,8 +931,12 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> # define pte_accessible(mm, pte) ((void)(pte), 1)
> #endif
>
> -#ifndef flush_tlb_fix_spurious_fault
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
> +#ifndef flush_tlb_fix_spurious_write_fault
> +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) flush_tlb_page(vma, address)
> +#endif
> +
> +#ifndef flush_tlb_fix_spurious_read_fault
> +#define flush_tlb_fix_spurious_read_fault(vma, address, ptep)
> #endif
>
> /*
> diff --git a/mm/memory.c b/mm/memory.c
> index 517221f01303..5cb0ccf0c03f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5014,8 +5014,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> * with threads.
> */
> if (vmf->flags & FAULT_FLAG_WRITE)
> - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
> - vmf->pte);
> + flush_tlb_fix_spurious_write_fault(vmf->vma, vmf->address,
> + vmf->pte);
> + else
> + /*
> + * With the pte_same(ptep_get(vmf->pte), entry) check
> + * that calls update_mmu_tlb() above, multiple threads
> + * faulting at the same time won't get there.
> + */
> + flush_tlb_fix_spurious_read_fault(vmf->vma, vmf->address,
> + vmf->pte);
> }
> unlock:
> pte_unmap_unlock(vmf->pte, vmf->ptl);