Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
From: Chuyi Zhou
Date: Fri Mar 20 2026 - 10:35:34 EST
Hello,
On 2026-03-19 1:21 a.m., Sebastian Andrzej Siewior wrote:
> +Nadav, org post https://lore.kernel.org/all/20260318045638.1572777-11-zhouchuyi@xxxxxxxxxxxxx/
>
> On 2026-03-18 12:56:36 [+0800], Chuyi Zhou wrote:
>> Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
>> stack") converted flush_tlb_info from stack variable to per-CPU variable.
>> This brought about a performance improvement of around 3% in extreme test.
>> However, it also required that all flush_tlb* operations keep preemption
>> disabled entirely to prevent concurrent modifications of flush_tlb_info.
>> flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
>> all remote CPUs to complete their local TLB flushes. The process could
>> take tens of milliseconds when interrupts are disabled or with a large
>> number of remote CPUs.
> …
> PeterZ wasn't too happy to reverse this.
> The snippet below results in the following assembly:
>
> | 0000000000001ab0 <flush_tlb_kernel_range>:
> …
> | 1ac9: 48 89 e5 mov %rsp,%rbp
> | 1acc: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
> | 1ad0: 48 83 ec 40 sub $0x40,%rsp
>
> so it would align it properly which should result in the same cache-line
> movement. I'm not sure about the virtual-to-physical translation of the
> variables as in TLB misses since here we have a virtual mapped stack and
> there we have virtual mapped per-CPU memory.
>
> Here the below is my quick hack. Does this work, or is it still a no? I have
> no numbers so…
>
I applied this patch on tip sched/core: fe7171d0d5df ("sched/fair:
Simplify SIS_UTIL handling in select_idle_cpu()") and retested the
microbenchmark in [PATCH v3 10/12], pinning the tasks to CPUs using
cpuset, and disabling pti and mitigations.
base on-stack-aligned on-stack
---- --------- -----------
avg (usec/op) 2.5278 2.5261 2.5508
stddev 0.0007 0.0027 0.0023
Based on the data above, there is no performance regression.
Does this make sense? Do you have any other testing suggestions?
Thanks
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 5a3cdc439e38d..4a7f40c7f939a 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -227,7 +227,7 @@ struct flush_tlb_info {
> u8 stride_shift;
> u8 freed_tables;
> u8 trim_cpumask;
> -};
> +} __aligned(SMP_CACHE_BYTES);
>
> void flush_tlb_local(void);
> void flush_tlb_one_user(unsigned long addr);
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 621e09d049cb9..99b70e94ec281 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1394,28 +1394,12 @@ void flush_tlb_multi(const struct cpumask *cpumask,
> */
> unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>
> -static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
> -
> -#ifdef CONFIG_DEBUG_VM
> -static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
> -#endif
> -
> -static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> - unsigned long start, unsigned long end,
> - unsigned int stride_shift, bool freed_tables,
> - u64 new_tlb_gen)
> +static void get_flush_tlb_info(struct flush_tlb_info *info,
> + struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + unsigned int stride_shift, bool freed_tables,
> + u64 new_tlb_gen)
> {
> - struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
> -
> -#ifdef CONFIG_DEBUG_VM
> - /*
> - * Ensure that the following code is non-reentrant and flush_tlb_info
> - * is not overwritten. This means no TLB flushing is initiated by
> - * interrupt handlers and machine-check exception handlers.
> - */
> - BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> -#endif
> -
> /*
> * If the number of flushes is so large that a full flush
> * would be faster, do a full flush.
> @@ -1433,8 +1417,6 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> info->new_tlb_gen = new_tlb_gen;
> info->initiating_cpu = smp_processor_id();
> info->trim_cpumask = 0;
> -
> - return info;
> }
>
> static void put_flush_tlb_info(void)
> @@ -1450,15 +1432,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
> unsigned long end, unsigned int stride_shift,
> bool freed_tables)
> {
> - struct flush_tlb_info *info;
> + struct flush_tlb_info _info;
> + struct flush_tlb_info *info = &_info;
> int cpu = get_cpu();
> u64 new_tlb_gen;
>
> /* This is also a barrier that synchronizes with switch_mm(). */
> new_tlb_gen = inc_mm_tlb_gen(mm);
>
> - info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
> - new_tlb_gen);
> + get_flush_tlb_info(&_info, mm, start, end, stride_shift, freed_tables,
> + new_tlb_gen);
>
> /*
> * flush_tlb_multi() is not optimized for the common case in which only
> @@ -1548,17 +1531,15 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
>
> void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> {
> - struct flush_tlb_info *info;
> + struct flush_tlb_info info;
>
> - guard(preempt)();
> + get_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
> + TLB_GENERATION_INVALID);
>
> - info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
> - TLB_GENERATION_INVALID);
> -
> - if (info->end == TLB_FLUSH_ALL)
> - kernel_tlb_flush_all(info);
> + if (info.end == TLB_FLUSH_ALL)
> + kernel_tlb_flush_all(&info);
> else
> - kernel_tlb_flush_range(info);
> + kernel_tlb_flush_range(&info);
>
> put_flush_tlb_info();
> }
> @@ -1728,12 +1709,11 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
>
> void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> {
> - struct flush_tlb_info *info;
> + struct flush_tlb_info info;
>
> int cpu = get_cpu();
> -
> - info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
> - TLB_GENERATION_INVALID);
> + get_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
> + TLB_GENERATION_INVALID);
> /*
> * flush_tlb_multi() is not optimized for the common case in which only
> * a local TLB flush is needed. Optimize this use-case by calling
> @@ -1743,11 +1723,11 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> invlpgb_flush_all_nonglobals();
> batch->unmapped_pages = false;
> } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> - flush_tlb_multi(&batch->cpumask, info);
> + flush_tlb_multi(&batch->cpumask, &info);
> } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
> lockdep_assert_irqs_enabled();
> local_irq_disable();
> - flush_tlb_func(info);
> + flush_tlb_func(&info);
> local_irq_enable();
> }
>