Re: [PATCH] x86: Align TLB invalidation info
From: Andy Lutomirski
Date: Wed Jan 31 2018 - 15:24:53 EST
> On Jan 31, 2018, at 12:11 PM, Nadav Amit <namit@xxxxxxxxxx> wrote:
>
> The TLB invalidation info is allocated on the stack, which might cause
> it to be unaligned. Since this information may be transferred to
> different cores for TLB shootdown, this might result in additional
> cache-line bouncing between the cores.
>
> GCC provides a way to deal with it by using
> __builtin_alloca_with_align(). Use it to avoid the bouncing cache lines.
>
Eww. How about __aligned?
> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
>
> Cc: Andy Lutomirski <luto@xxxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> ---
> arch/x86/mm/tlb.c | 21 +++++++++++----------
> include/linux/compiler-gcc.h | 5 +++++
> include/linux/compiler_types.h | 4 ++++
> 3 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 5bfe61a5e8e3..bab7bb5d982f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -574,37 +574,38 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
> void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
> unsigned long end, unsigned long vmflag)
> {
> + struct flush_tlb_info *info;
> int cpu;
>
> - struct flush_tlb_info info = {
> - .mm = mm,
> - };
> + info = __alloca_with_align(sizeof(*info),
> + SMP_CACHE_BYTES * BITS_PER_BYTE);
> + info->mm = mm;
>
> cpu = get_cpu();
>
> /* This is also a barrier that synchronizes with switch_mm(). */
> - info.new_tlb_gen = inc_mm_tlb_gen(mm);
> + info->new_tlb_gen = inc_mm_tlb_gen(mm);
>
> /* Should we flush just the requested range? */
> if ((end != TLB_FLUSH_ALL) &&
> !(vmflag & VM_HUGETLB) &&
> ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
> - info.start = start;
> - info.end = end;
> + info->start = start;
> + info->end = end;
> } else {
> - info.start = 0UL;
> - info.end = TLB_FLUSH_ALL;
> + info->start = 0UL;
> + info->end = TLB_FLUSH_ALL;
> }
>
> if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
> VM_WARN_ON(irqs_disabled());
> local_irq_disable();
> - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
> + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
> local_irq_enable();
> }
>
> if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
> - flush_tlb_others(mm_cpumask(mm), &info);
> + flush_tlb_others(mm_cpumask(mm), info);
>
> put_cpu();
> }
> diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
> index 631354acfa72..aea9a2e69417 100644
> --- a/include/linux/compiler-gcc.h
> +++ b/include/linux/compiler-gcc.h
> @@ -314,6 +314,11 @@
> #define __designated_init __attribute__((designated_init))
> #endif
>
> +#if GCC_VERSION >= 60100
> +#define __alloca_with_align(size, alignment) \
> + __builtin_alloca_with_align(size, alignment)
> +#endif
> +
> #endif /* gcc version >= 40000 specific checks */
>
> #if !defined(__noclone)
> diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
> index 6b79a9bba9a7..c71297d95c74 100644
> --- a/include/linux/compiler_types.h
> +++ b/include/linux/compiler_types.h
> @@ -271,4 +271,8 @@ struct ftrace_likely_data {
> # define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
> #endif
>
> +#ifndef __alloca_with_align
> +#define __alloca_with_align(size, alignment) __builtin_alloca(size)
> +#endif
> +
> #endif /* __LINUX_COMPILER_TYPES_H */
> --
> 2.14.1
>