Re: [PATCH v3 10/12] x86/mm: Move flush_tlb_info back to the stack
From: Sebastian Andrzej Siewior
Date: Wed Mar 18 2026 - 14:22:28 EST
+Nadav, orig post https://lore.kernel.org/all/20260318045638.1572777-11-zhouchuyi@xxxxxxxxxxxxx/
On 2026-03-18 12:56:36 [+0800], Chuyi Zhou wrote:
> Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
> stack") converted flush_tlb_info from stack variable to per-CPU variable.
> This brought about a performance improvement of around 3% in extreme test.
> However, it also required that all flush_tlb* operations keep preemption
> disabled entirely to prevent concurrent modifications of flush_tlb_info.
> flush_tlb* needs to send IPIs to remote CPUs and synchronously wait for
> all remote CPUs to complete their local TLB flushes. The process could
> take tens of milliseconds when interrupts are disabled or with a large
> number of remote CPUs.
…
PeterZ wasn't too happy about reverting this.
The snippet below results in the following assembly:
| 0000000000001ab0 <flush_tlb_kernel_range>:
…
| 1ac9: 48 89 e5 mov %rsp,%rbp
| 1acc: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
| 1ad0: 48 83 ec 40 sub $0x40,%rsp
so it would align it properly which should result in the same cache-line
movement. I'm not sure about the virtual-to-physical translation of the
variables as in TLB misses since here we have a virtual mapped stack and
there we have virtual mapped per-CPU memory.
Below is my quick hack. Does this work, or is it still a no? I have
no numbers so…
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5a3cdc439e38d..4a7f40c7f939a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -227,7 +227,7 @@ struct flush_tlb_info {
u8 stride_shift;
u8 freed_tables;
u8 trim_cpumask;
-};
+} __aligned(SMP_CACHE_BYTES);
void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 621e09d049cb9..99b70e94ec281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1394,28 +1394,12 @@ void flush_tlb_multi(const struct cpumask *cpumask,
*/
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- unsigned int stride_shift, bool freed_tables,
- u64 new_tlb_gen)
+static void get_flush_tlb_info(struct flush_tlb_info *info,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned int stride_shift, bool freed_tables,
+ u64 new_tlb_gen)
{
- struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
- /*
- * Ensure that the following code is non-reentrant and flush_tlb_info
- * is not overwritten. This means no TLB flushing is initiated by
- * interrupt handlers and machine-check exception handlers.
- */
- BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
-#endif
-
/*
* If the number of flushes is so large that a full flush
* would be faster, do a full flush.
@@ -1433,8 +1417,6 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
info->trim_cpumask = 0;
-
- return info;
}
static void put_flush_tlb_info(void)
@@ -1450,15 +1432,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
- struct flush_tlb_info *info;
+ struct flush_tlb_info _info;
+ struct flush_tlb_info *info = &_info;
int cpu = get_cpu();
u64 new_tlb_gen;
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
- info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
- new_tlb_gen);
+ get_flush_tlb_info(&_info, mm, start, end, stride_shift, freed_tables,
+ new_tlb_gen);
/*
* flush_tlb_multi() is not optimized for the common case in which only
@@ -1548,17 +1531,15 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- struct flush_tlb_info *info;
+ struct flush_tlb_info info;
- guard(preempt)();
+ get_flush_tlb_info(&info, NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
- info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
- TLB_GENERATION_INVALID);
-
- if (info->end == TLB_FLUSH_ALL)
- kernel_tlb_flush_all(info);
+ if (info.end == TLB_FLUSH_ALL)
+ kernel_tlb_flush_all(&info);
else
- kernel_tlb_flush_range(info);
+ kernel_tlb_flush_range(&info);
put_flush_tlb_info();
}
@@ -1728,12 +1709,11 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
- struct flush_tlb_info *info;
+ struct flush_tlb_info info;
int cpu = get_cpu();
-
- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
- TLB_GENERATION_INVALID);
+ get_flush_tlb_info(&info, NULL, 0, TLB_FLUSH_ALL, 0, false,
+ TLB_GENERATION_INVALID);
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
@@ -1743,11 +1723,11 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
invlpgb_flush_all_nonglobals();
batch->unmapped_pages = false;
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
- flush_tlb_multi(&batch->cpumask, info);
+ flush_tlb_multi(&batch->cpumask, &info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func(info);
+ flush_tlb_func(&info);
local_irq_enable();
}