Re: [PATCH 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently

From: Nadav Amit
Date: Tue Jun 25 2019 - 22:35:10 EST


> On Jun 25, 2019, at 2:29 PM, Dave Hansen <dave.hansen@xxxxxxxxx> wrote:
>
> On 6/12/19 11:48 PM, Nadav Amit wrote:
>> To improve TLB shootdown performance, flush the remote and local TLBs
>> concurrently. Introduce flush_tlb_multi() that does so. The current
>> flush_tlb_others() interface is kept, since paravirtual interfaces need
>> to be adapted first before it can be removed. This is left for future
>> work. In such PV environments, TLB flushes are not performed, at this
>> time, concurrently.
>>
>> Add a static key to tell whether this new interface is supported.
>>
>> Cc: "K. Y. Srinivasan" <kys@xxxxxxxxxxxxx>
>> Cc: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
>> Cc: Stephen Hemminger <sthemmin@xxxxxxxxxxxxx>
>> Cc: Sasha Levin <sashal@xxxxxxxxxx>
>> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
>> Cc: Borislav Petkov <bp@xxxxxxxxx>
>> Cc: x86@xxxxxxxxxx
>> Cc: Juergen Gross <jgross@xxxxxxxx>
>> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
>> Cc: Andy Lutomirski <luto@xxxxxxxxxx>
>> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>> Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
>> Cc: linux-hyperv@xxxxxxxxxxxxxxx
>> Cc: linux-kernel@xxxxxxxxxxxxxxx
>> Cc: virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
>> Cc: kvm@xxxxxxxxxxxxxxx
>> Cc: xen-devel@xxxxxxxxxxxxxxxxxxxx
>> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
>> ---
>> arch/x86/hyperv/mmu.c | 2 +
>> arch/x86/include/asm/paravirt.h | 8 +++
>> arch/x86/include/asm/paravirt_types.h | 6 +++
>> arch/x86/include/asm/tlbflush.h | 6 +++
>> arch/x86/kernel/kvm.c | 1 +
>> arch/x86/kernel/paravirt.c | 3 ++
>> arch/x86/mm/tlb.c | 71 ++++++++++++++++++++++-----
>> arch/x86/xen/mmu_pv.c | 2 +
>> 8 files changed, 87 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
>> index e65d7fe6489f..ca28b400c87c 100644
>> --- a/arch/x86/hyperv/mmu.c
>> +++ b/arch/x86/hyperv/mmu.c
>> @@ -233,4 +233,6 @@ void hyperv_setup_mmu_ops(void)
>> pr_info("Using hypercall for remote TLB flush\n");
>> pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
>> pv_ops.mmu.tlb_remove_table = tlb_remove_table;
>> +
>> + static_key_disable(&flush_tlb_multi_enabled.key);
>> }
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index c25c38a05c1c..192be7254457 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -47,6 +47,8 @@ static inline void slow_down_io(void)
>> #endif
>> }
>>
>> +DECLARE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
>> +
>> static inline void __flush_tlb(void)
>> {
>> PVOP_VCALL0(mmu.flush_tlb_user);
>> @@ -62,6 +64,12 @@ static inline void __flush_tlb_one_user(unsigned long addr)
>> PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
>> }
>>
>> +static inline void flush_tlb_multi(const struct cpumask *cpumask,
>> + const struct flush_tlb_info *info)
>> +{
>> + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
>> +}
>> +
>> static inline void flush_tlb_others(const struct cpumask *cpumask,
>> const struct flush_tlb_info *info)
>> {
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>> index 946f8f1f1efc..b93b3d90729a 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -211,6 +211,12 @@ struct pv_mmu_ops {
>> void (*flush_tlb_user)(void);
>> void (*flush_tlb_kernel)(void);
>> void (*flush_tlb_one_user)(unsigned long addr);
>> + /*
>> + * flush_tlb_multi() is the preferred interface, which is capable to
>> + * flush both local and remote CPUs.
>> + */
>> + void (*flush_tlb_multi)(const struct cpumask *cpus,
>> + const struct flush_tlb_info *info);
>> void (*flush_tlb_others)(const struct cpumask *cpus,
>> const struct flush_tlb_info *info);
>>
>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index dee375831962..79272938cf79 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -569,6 +569,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
>> flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
>> }
>>
>> +void native_flush_tlb_multi(const struct cpumask *cpumask,
>> + const struct flush_tlb_info *info);
>> +
>> void native_flush_tlb_others(const struct cpumask *cpumask,
>> const struct flush_tlb_info *info);
>>
>> @@ -593,6 +596,9 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
>> extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
>>
>> #ifndef CONFIG_PARAVIRT
>> +#define flush_tlb_multi(mask, info) \
>> + native_flush_tlb_multi(mask, info)
>> +
>> #define flush_tlb_others(mask, info) \
>> native_flush_tlb_others(mask, info)
>>
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 5169b8cc35bb..00d81e898717 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -630,6 +630,7 @@ static void __init kvm_guest_init(void)
>> kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
>> pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
>> pv_ops.mmu.tlb_remove_table = tlb_remove_table;
>> + static_key_disable(&flush_tlb_multi_enabled.key);
>> }
>>
>> if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 98039d7fb998..ac00afed5570 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -159,6 +159,8 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
>> return insn_len;
>> }
>>
>> +DEFINE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
>> +
>> static void native_flush_tlb(void)
>> {
>> __native_flush_tlb();
>> @@ -363,6 +365,7 @@ struct paravirt_patch_template pv_ops = {
>> .mmu.flush_tlb_user = native_flush_tlb,
>> .mmu.flush_tlb_kernel = native_flush_tlb_global,
>> .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
>> + .mmu.flush_tlb_multi = native_flush_tlb_multi,
>> .mmu.flush_tlb_others = native_flush_tlb_others,
>> .mmu.tlb_remove_table =
>> (void (*)(struct mmu_gather *, void *))tlb_remove_page,
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index c34bcf03f06f..db73d5f1dd43 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -551,7 +551,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>> * garbage into our TLB. Since switching to init_mm is barely
>> * slower than a minimal flush, just switch to init_mm.
>> *
>> - * This should be rare, with native_flush_tlb_others skipping
>> + * This should be rare, with native_flush_tlb_multi skipping
>> * IPIs to lazy TLB mode CPUs.
>> */
>
> Nit, since we're messing with this, it can now be
> "native_flush_tlb_multi()" since it is a function.

Sure.

>
>> switch_mm_irqs_off(NULL, &init_mm, NULL);
>> @@ -635,9 +635,12 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>> this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
>> }
>>
>> -static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
>> +static void flush_tlb_func_local(void *info)
>> {
>> const struct flush_tlb_info *f = info;
>> + enum tlb_flush_reason reason;
>> +
>> + reason = (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : TLB_LOCAL_MM_SHOOTDOWN;
>
> Should we just add the "reason" to flush_tlb_info? It's OK-ish to imply
> it like this, but seems like it would be nicer and easier to track down
> the origins of these things if we did this at the caller.

I prefer not to. Later, I want to inline flush_tlb_info into the same
cacheline that holds call_function_data. Increasing the size of
flush_tlb_info for no good reason will not help...

>> flush_tlb_func_common(f, true, reason);
>> }
>> @@ -655,14 +658,21 @@ static void flush_tlb_func_remote(void *info)
>> flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
>> }
>>
>> -static bool tlb_is_not_lazy(int cpu, void *data)
>> +static inline bool tlb_is_not_lazy(int cpu)
>> {
>> return !per_cpu(cpu_tlbstate.is_lazy, cpu);
>> }
>
> Nit: the compiler will probably inline this sucker anyway. So, for
> these kinds of patches, I'd resist the urge to do these kinds of tweaks,
> especially since it starts to hide the important change on the line.

Of course.

>
>> -void native_flush_tlb_others(const struct cpumask *cpumask,
>> - const struct flush_tlb_info *info)
>> +static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
>> +
>> +void native_flush_tlb_multi(const struct cpumask *cpumask,
>> + const struct flush_tlb_info *info)
>> {
>> + /*
>> + * Do accounting and tracing. Note that there are (and have always been)
>> + * cases in which a remote TLB flush will be traced, but eventually
>> + * would not happen.
>> + */
>> count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
>> if (info->end == TLB_FLUSH_ALL)
>> trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
>> @@ -682,10 +692,14 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>> * means that the percpu tlb_gen variables won't be updated
>> * and we'll do pointless flushes on future context switches.
>> *
>> - * Rather than hooking native_flush_tlb_others() here, I think
>> + * Rather than hooking native_flush_tlb_multi() here, I think
>> * that UV should be updated so that smp_call_function_many(),
>> * etc, are optimal on UV.
>> */
>> + local_irq_disable();
>> + flush_tlb_func_local((__force void *)info);
>> + local_irq_enable();
>> +
>> cpumask = uv_flush_tlb_others(cpumask, info);
>> if (cpumask)
>> smp_call_function_many(cpumask, flush_tlb_func_remote,
>> @@ -704,11 +718,39 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>> * doing a speculative memory access.
>> */
>> if (info->freed_tables)
>> - smp_call_function_many(cpumask, flush_tlb_func_remote,
>> - (void *)info, 1);
>> - else
>> - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
>> - (void *)info, 1, GFP_ATOMIC, cpumask);
>> + __smp_call_function_many(cpumask, flush_tlb_func_remote,
>> + flush_tlb_func_local, (void *)info, 1);
>> + else {
>
> I prefer brackets be added for 'if' blocks like this since it doesn't
> take up any meaningful space and makes it less prone to compile errors.

If you say so.

>
>> + /*
>> + * Although we could have used on_each_cpu_cond_mask(),
>> + * open-coding it has several performance advantages: (1) we can
>> + * use specialized functions for remote and local flushes; (2)
>> + * no need for indirect branch to test if TLB is lazy; (3) we
>> + * can use a designated cpumask for evaluating the condition
>> + * instead of allocating a new one.
>> + *
>> + * This works under the assumption that there are no nested TLB
>> + * flushes, an assumption that is already made in
>> + * flush_tlb_mm_range().
>> + */
>> + struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
>
> This is logically a stack-local variable, right? But, since we've got
> preempt off and cpumasks can be huge, we don't want to allocate it on
> the stack. That might be worth a comment somewhere.

I will add a comment here.

>
>> + int cpu;
>> +
>> + cpumask_clear(cond_cpumask);
>> +
>> + for_each_cpu(cpu, cpumask) {
>> + if (tlb_is_not_lazy(cpu))
>> + __cpumask_set_cpu(cpu, cond_cpumask);
>> + }
>
> FWIW, it's probably worth calling out in the changelog that this loop
> exists in on_each_cpu_cond_mask() too. It looks bad here, but it's no
> worse than what it replaces.

Added.

>
>> + __smp_call_function_many(cond_cpumask, flush_tlb_func_remote,
>> + flush_tlb_func_local, (void *)info, 1);
>> + }
>> +}
>
> There was a __force on an earlier 'info' cast. Could you talk about
> that for a minute and explain why that one is needed?

I have no idea where the __force came from. I'll remove it.

>
>> +void native_flush_tlb_others(const struct cpumask *cpumask,
>> + const struct flush_tlb_info *info)
>> +{
>> + native_flush_tlb_multi(cpumask, info);
>> }
>>
>> /*
>> @@ -774,10 +816,15 @@ static void flush_tlb_on_cpus(const cpumask_t *cpumask,
>> {
>> int this_cpu = smp_processor_id();
>>
>> + if (static_branch_likely(&flush_tlb_multi_enabled)) {
>> + flush_tlb_multi(cpumask, info);
>> + return;
>> + }
>
> Probably needs a comment for posterity above the if()^^:
>
> /* Use the optimized flush_tlb_multi() where we can. */

Right.

>
>> --- a/arch/x86/xen/mmu_pv.c
>> +++ b/arch/x86/xen/mmu_pv.c
>> @@ -2474,6 +2474,8 @@ void __init xen_init_mmu_ops(void)
>>
>> pv_ops.mmu = xen_mmu_ops;
>>
>> + static_key_disable(&flush_tlb_multi_enabled.key);
>> +
>> memset(dummy_mapping, 0xff, PAGE_SIZE);
>> }
>
> More comments, please. Perhaps:
>
> Existing paravirt TLB flushes are incompatible with
> flush_tlb_multi() because.... Disable it when they are
> in use.

There is no inherent reason for them to be incompatible. Someone needs to
adapt them. I will use my affiliation as an excuse for the question "why
don't you do it?" ;-)

Anyhow, I will add a comment.