Re: [PATCH 2/2] arm64: tlb: skip tlbi broadcast for single threaded TLB flushes

From: qi.fuli@xxxxxxxxxxx
Date: Wed Feb 12 2020 - 09:21:14 EST


On 2/4/20 5:17 AM, Andrea Arcangeli wrote:
> With multiple NUMA nodes and multiple sockets, the tlbi broadcast
> shall be delivered through the interconnects in turn increasing the
> interconnect traffic and the latency of the tlbi broadcast instruction.
>
> Even within a single NUMA node the latency of the tlbi broadcast
> instruction increases almost linearly with the number of CPUs trying to
> send tlbi broadcasts at the same time.
>
> When the process is single threaded however we can achieve full SMP
> scalability by skipping the tlbi broadcasting. Other arches already
> deploy this optimization.
>
> After the local TLB flush this however means the ASID context goes out
> of sync in all CPUs except the local one. This can be tracked in the
> mm_cpumask(mm): if the bit is set it means the asid context is stale
> for that CPU. This results in an extra local ASID TLB flush only if a
> single threaded process is migrated to a different CPU and only after a
> TLB flush. No extra local TLB flush is needed for the common case of
> single threaded processes context scheduling within the same CPU and for
> multithreaded processes.
>
> Skipping the tlbi instruction broadcasting is already implemented in
> local_flush_tlb_all(), this patch only extends it to flush_tlb_mm(),
> flush_tlb_range() and flush_tlb_page() too.
>
> Here's the result of 32 CPUs (ARMv8 Ampere) running mprotect at the same
> time from 32 single threaded processes before the patch:
>
> Performance counter stats for './loop' (3 runs):
>
> 0 dummy
>
> 2.121353 +- 0.000387 seconds time elapsed ( +- 0.02% )
>
> and with the patch applied:
>
> Performance counter stats for './loop' (3 runs):
>
> 0 dummy
>
> 0.1197750 +- 0.0000827 seconds time elapsed ( +- 0.07% )

Hi,

I have tested this patch on thunderX2 with Himeno benchmark[1] with
LARGE calculation size. Here are the results.

w/o patch: MFLOPS : 1149.480174
w/ patch: MFLOPS : 1110.653003

In order to validate the effectivness of the patch, I ran a
single-threded program, which calls mprotect() in a loop to issue the
tlbi broadcast instruction on a CPU core. At the same time, I ran Himeno
benchmark on another CPU core. The results are:

w/o patch: MFLOPS : 860.238792
w/ patch: MFLOPS : 1110.449666

Though Himeno benchmark is a microbenchmark, I hope it helps.

[1] http://accc.riken.jp/en/supercom/documents/himenobmt/

Thanks,
QI Fuli

>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> ---
> arch/arm64/include/asm/efi.h | 2 +-
> arch/arm64/include/asm/mmu.h | 3 +-
> arch/arm64/include/asm/mmu_context.h | 10 +--
> arch/arm64/include/asm/tlbflush.h | 91 +++++++++++++++++++++++++++-
> arch/arm64/mm/context.c | 13 +++-
> 5 files changed, 109 insertions(+), 10 deletions(-)
>
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index 44531a69d32b..5d9a1433d918 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -131,7 +131,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
>
> static inline void efi_set_pgd(struct mm_struct *mm)
> {
> - __switch_mm(mm);
> + __switch_mm(mm, smp_processor_id());
>
> if (system_uses_ttbr0_pan()) {
> if (mm != current->active_mm) {
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index e4d862420bb4..1f84289d3e44 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -27,7 +27,8 @@ typedef struct {
> * ASID change and therefore doesn't need to reload the counter using
> * atomic64_read.
> */
> -#define ASID(mm) ((mm)->context.id.counter & 0xffff)
> +#define __ASID(asid) ((asid) & 0xffff)
> +#define ASID(mm) __ASID((mm)->context.id.counter)
>
> extern bool arm64_use_ng_mappings;
>
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index 3827ff4040a3..5ec264297968 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -210,10 +210,8 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
> update_saved_ttbr0(tsk, &init_mm);
> }
>
> -static inline void __switch_mm(struct mm_struct *next)
> +static inline void __switch_mm(struct mm_struct *next, unsigned int cpu)
> {
> - unsigned int cpu = smp_processor_id();
> -
> /*
> * init_mm.pgd does not contain any user mappings and it is always
> * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -230,8 +228,12 @@ static inline void
> switch_mm(struct mm_struct *prev, struct mm_struct *next,
> struct task_struct *tsk)
> {
> + unsigned int cpu = smp_processor_id();
> +
> if (prev != next)
> - __switch_mm(next);
> + __switch_mm(next, cpu);
> + else if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(next)))
> + local_flush_tlb_asid(atomic64_read(&next->context.id));
>
> /*
> * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bc3949064725..283f97af3fc5 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -136,6 +136,15 @@ static inline void local_flush_tlb_all(void)
> isb();
> }
>
> +static inline void local_flush_tlb_asid(unsigned long asid)
> +{
> + asid = __TLBI_VADDR(0, __ASID(asid));
> + dsb(nshst);
> + __tlbi(aside1, asid);
> + __tlbi_user(aside1, asid);
> + dsb(nsh);
> +}
> +
> static inline void flush_tlb_all(void)
> {
> dsb(ishst);
> @@ -148,6 +157,27 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
> {
> unsigned long asid = __TLBI_VADDR(0, ASID(mm));
>
> + /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> + if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> + int cpu = get_cpu();
> +
> + cpumask_setall(mm_cpumask(mm));
> + cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> + smp_mb();
> +
> + if (atomic_read(&mm->mm_users) <= 1) {
> + dsb(nshst);
> + __tlbi(aside1, asid);
> + __tlbi_user(aside1, asid);
> + dsb(nsh);
> +
> + put_cpu();
> + return;
> + }
> + put_cpu();
> + }
> +
> dsb(ishst);
> __tlbi(aside1is, asid);
> __tlbi_user(aside1is, asid);
> @@ -167,7 +197,33 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
> static inline void flush_tlb_page(struct vm_area_struct *vma,
> unsigned long uaddr)
> {
> - flush_tlb_page_nosync(vma, uaddr);
> + struct mm_struct *mm = vma->vm_mm;
> + unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm));
> +
> + /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> + if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> + int cpu = get_cpu();
> +
> + cpumask_setall(mm_cpumask(mm));
> + cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> + smp_mb();
> +
> + if (atomic_read(&mm->mm_users) <= 1) {
> + dsb(nshst);
> + __tlbi(vale1, addr);
> + __tlbi_user(vale1, addr);
> + dsb(nsh);
> +
> + put_cpu();
> + return;
> + }
> + put_cpu();
> + }
> +
> + dsb(ishst);
> + __tlbi(vale1is, addr);
> + __tlbi_user(vale1is, addr);
> dsb(ish);
> }
>
> @@ -181,14 +237,15 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long stride, bool last_level)
> {
> - unsigned long asid = ASID(vma->vm_mm);
> + struct mm_struct *mm = vma->vm_mm;
> + unsigned long asid = ASID(mm);
> unsigned long addr;
>
> start = round_down(start, stride);
> end = round_up(end, stride);
>
> if ((end - start) >= (MAX_TLBI_OPS * stride)) {
> - flush_tlb_mm(vma->vm_mm);
> + flush_tlb_mm(mm);
> return;
> }
>
> @@ -198,6 +255,34 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
> start = __TLBI_VADDR(start, asid);
> end = __TLBI_VADDR(end, asid);
>
> + /* avoid TLB-i broadcast to remote NUMA nodes if it's a local flush */
> + if (current->mm == mm && atomic_read(&mm->mm_users) <= 1) {
> + int cpu = get_cpu();
> +
> + cpumask_setall(mm_cpumask(mm));
> + cpumask_clear_cpu(cpu, mm_cpumask(mm));
> +
> + smp_mb();
> +
> + if (atomic_read(&mm->mm_users) <= 1) {
> + dsb(nshst);
> + for (addr = start; addr < end; addr += stride) {
> + if (last_level) {
> + __tlbi(vale1, addr);
> + __tlbi_user(vale1, addr);
> + } else {
> + __tlbi(vae1, addr);
> + __tlbi_user(vae1, addr);
> + }
> + }
> + dsb(nsh);
> +
> + put_cpu();
> + return;
> + }
> + put_cpu();
> + }
> +
> dsb(ishst);
> for (addr = start; addr < end; addr += stride) {
> if (last_level) {
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index 8ef73e89d514..b44459d64dff 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,6 +198,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> {
> unsigned long flags;
> u64 asid, old_active_asid;
> + int need_flush = 0;
>
> if (system_supports_cnp())
> cpu_set_reserved_ttbr0();
> @@ -233,7 +234,8 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> atomic64_set(&mm->context.id, asid);
> }
>
> - if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
> + need_flush = cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending);
> + if (need_flush)
> local_flush_tlb_all();
>
> atomic64_set(&per_cpu(active_asids, cpu), asid);
> @@ -241,6 +243,15 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>
> switch_mm_fastpath:
>
> + /*
> + * Enforce CPU ordering between the mmget() in use_mm() and
> + * the below cpumask_test_and_clear_cpu().
> + */
> + smp_mb();
> + if (cpumask_test_and_clear_cpu(cpu, mm_cpumask(mm)) &&
> + likely(!need_flush))
> + local_flush_tlb_asid(asid);
> +
> arm64_apply_bp_hardening();
>
> /*
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>