Re: [PATCH v4 09/12] x86/mm: enable broadcast TLB invalidation for multi-threaded processes

From: Nadav Amit
Date: Mon Jan 13 2025 - 08:10:11 EST

Not sure my review is thorough, but that’s all the time I have right now...

> On 12 Jan 2025, at 17:53, Rik van Riel <riel@xxxxxxxxxxx> wrote:
>
> Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
> and newer CPUs.
>
> In order to not exhaust PCID space, and keep TLB flushes local for single
> threaded processes, we only hand out broadcast ASIDs to processes active on
> 3 or more CPUs, and gradually increase the threshold as broadcast ASID space
> is depleted.
>
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> ---
> arch/x86/include/asm/mmu.h | 6 +
> arch/x86/include/asm/mmu_context.h | 14 ++
> arch/x86/include/asm/tlbflush.h | 64 +++++
> arch/x86/mm/tlb.c | 363 ++++++++++++++++++++++++++++-
> 4 files changed, 435 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 3b496cdcb74b..d71cd599fec4 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -69,6 +69,12 @@ typedef struct {
> u16 pkey_allocation_map;
> s16 execute_only_pkey;
> #endif
> +
> +#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
> + u16 global_asid;
> + bool asid_transition;

As I note later, there are various ordering issues between these two fields. Would it
just be easier to combine them into a single field? I know everybody hates bitfields,
so I am not suggesting that, but there are other ways...
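
For example (a completely untested sketch, and the encoding is my own invention):
make context.global_asid a u32 and fold the transition flag into a spare bit, so the
two are always read and written together:

#define GLOBAL_ASID_TRANSITION		BIT(16)

static inline u16 mm_global_asid(struct mm_struct *mm)
{
	return READ_ONCE(mm->context.global_asid) & 0xffff;
}

static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
	return READ_ONCE(mm->context.global_asid) & GLOBAL_ASID_TRANSITION;
}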

> +#endif
> +
> } mm_context_t;
>
> #define INIT_MM_CONTEXT(mm) \
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 795fdd53bd0a..d670699d32c2 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
> #define enter_lazy_tlb enter_lazy_tlb
> extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
>
> +extern void destroy_context_free_global_asid(struct mm_struct *mm);
> +
> /*
> * Init a new mm. Used on mm copies, like at fork()
> * and on mm's that are brand-new, like at execve().
> @@ -161,6 +163,14 @@ static inline int init_new_context(struct task_struct *tsk,
> mm->context.execute_only_pkey = -1;
> }
> #endif
> +
> +#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
> + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
> + mm->context.global_asid = 0;
> + mm->context.asid_transition = false;
> + }
> +#endif
> +
> mm_reset_untag_mask(mm);
> init_new_context_ldt(mm);
> return 0;
> @@ -170,6 +180,10 @@ static inline int init_new_context(struct task_struct *tsk,
> static inline void destroy_context(struct mm_struct *mm)
> {
> destroy_context_ldt(mm);
> +#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH

I’d prefer to use IS_ENABLED() and to have a stub for
destroy_context_free_global_asid().
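
Something like this (untested), with the stub next to the other
!CONFIG_X86_BROADCAST_TLB_FLUSH stubs, so the #ifdef disappears from
destroy_context():

static inline void destroy_context_free_global_asid(struct mm_struct *mm) { }

static inline void destroy_context(struct mm_struct *mm)
{
	destroy_context_ldt(mm);
	if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) &&
	    cpu_feature_enabled(X86_FEATURE_INVLPGB))
		destroy_context_free_global_asid(mm);
}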

> + if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
> + destroy_context_free_global_asid(mm);
> +#endif
> }
>
> extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index dba5caa4a9f4..cd244cdd49dd 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -239,6 +239,70 @@ void flush_tlb_one_kernel(unsigned long addr);
> void flush_tlb_multi(const struct cpumask *cpumask,
> const struct flush_tlb_info *info);
>
> +#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
> +static inline bool is_dyn_asid(u16 asid)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
> + return true;
> +
> + return asid < TLB_NR_DYN_ASIDS;
> +}
> +
> +static inline bool is_global_asid(u16 asid)
> +{
> + return !is_dyn_asid(asid);
> +}
> +
> +static inline bool in_asid_transition(const struct flush_tlb_info *info)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
> + return false;
> +
> + return info->mm && info->mm->context.asid_transition;

READ_ONCE(context.asid_transition) ?

> +}
> +
> +static inline u16 mm_global_asid(struct mm_struct *mm)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
> + return 0;
> +
> + return mm->context.global_asid;
> +}
> +#else
> +static inline bool is_dyn_asid(u16 asid)
> +{
> + return true;
> +}
> +
> +static inline bool is_global_asid(u16 asid)
> +{
> + return false;
> +}
> +
> +static inline bool in_asid_transition(const struct flush_tlb_info *info)
> +{
> + return false;
> +}
> +
> +static inline u16 mm_global_asid(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> +static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
> +{
> + return false;
> +}
> +
> +static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
> +{

Having a VM_WARN_ON() here might be nice.
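
i.e. something like:

static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
{
	/* Should never be reached without X86_BROADCAST_TLB_FLUSH. */
	VM_WARN_ON_ONCE(1);
}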

> +}
> +
> +static inline void consider_global_asid(struct mm_struct *mm)
> +{
> +}
> +#endif
> +
> #ifdef CONFIG_PARAVIRT
> #include <asm/paravirt.h>
> #endif
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index b47d6c3fe0af..80375ef186d5 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -74,13 +74,15 @@
> * use different names for each of them:
> *
> * ASID - [0, TLB_NR_DYN_ASIDS-1]
> - * the canonical identifier for an mm
> + * the canonical identifier for an mm, dynamically allocated on each CPU
> + * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
> + * the canonical, global identifier for an mm, identical across all CPUs
> *
> - * kPCID - [1, TLB_NR_DYN_ASIDS]
> + * kPCID - [1, MAX_ASID_AVAILABLE]
> * the value we write into the PCID part of CR3; corresponds to the
> * ASID+1, because PCID 0 is special.
> *
> - * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
> + * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
> * for KPTI each mm has two address spaces and thus needs two
> * PCID values, but we can still do with a single ASID denomination
> * for each mm. Corresponds to kPCID + 2048.
> @@ -225,6 +227,19 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
> return;
> }
>
> + /*
> + * TLB consistency for global ASIDs is maintained with broadcast TLB
> + * flushing. The TLB is never outdated, and does not need flushing.
> + */
> + if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) {
> + u16 global_asid = mm_global_asid(next);
> + if (global_asid) {
> + *new_asid = global_asid;
> + *need_flush = false;
> + return;
> + }
> + }
> +
> if (this_cpu_read(cpu_tlbstate.invalidate_other))
> clear_asid_other();
>
> @@ -251,6 +266,292 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
> *need_flush = true;
> }
>
> +#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
> +/*
> + * Logic for broadcast TLB invalidation.
> + */
> +static DEFINE_RAW_SPINLOCK(global_asid_lock);
> +static u16 last_global_asid = MAX_ASID_AVAILABLE;
> +static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 };
> +static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 };
> +static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
> +
> +static void reset_global_asid_space(void)
> +{
> + lockdep_assert_held(&global_asid_lock);
> +
> + /*
> + * A global TLB flush guarantees that any stale entries from
> + * previously freed global ASIDs get flushed from the TLB
> + * everywhere, making these global ASIDs safe to reuse.
> + */
> + invlpgb_flush_all_nonglobals();
> +
> + /*
> + * Clear all the previously freed global ASIDs from the
> + * broadcast_asid_used bitmap, now that the global TLB flush
> + * has made them actually available for re-use.
> + */
> + bitmap_andnot(global_asid_used, global_asid_used,
> + global_asid_freed, MAX_ASID_AVAILABLE);
> + bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
> +
> + /*
> + * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID
> + * assignments, for tasks doing IPI based TLB shootdowns.
> + * Restart the search from the start of the global ASID space.
> + */
> + last_global_asid = TLB_NR_DYN_ASIDS;
> +}
> +
> +static u16 get_global_asid(void)
> +{
> + lockdep_assert_held(&global_asid_lock);
> +
> + do {
> + u16 start = last_global_asid;
> + u16 asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, start);
> +
> + if (asid >= MAX_ASID_AVAILABLE) {
> + reset_global_asid_space();
> + continue;
> + }
> +
> + /* Claim this global ASID. */
> + __set_bit(asid, global_asid_used);
> + last_global_asid = asid;
> + return asid;
> + } while (1);

This loop does not sit well with me at all. I do not understand why the retry
might ever be needed: the caller should have already checked, under the lock,
that a global ASID is available. If that is not obvious from the code, perhaps
some refactoring is needed.
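
One possible shape (untested, just to illustrate what I mean): make the wrap-around
explicit and bound the retry to a single reset, so the reader does not have to
convince themselves that the loop terminates:

static u16 get_global_asid(void)
{
	u16 asid;

	lockdep_assert_held(&global_asid_lock);

	asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE,
				  last_global_asid);
	if (asid >= MAX_ASID_AVAILABLE) {
		/* Reclaim the freed ASIDs and search once more. */
		reset_global_asid_space();
		asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE,
					  last_global_asid);
	}

	/* The caller checked global_asid_available under the lock. */
	VM_WARN_ON_ONCE(asid >= MAX_ASID_AVAILABLE);

	/* Claim this global ASID. */
	__set_bit(asid, global_asid_used);
	last_global_asid = asid;
	return asid;
}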

> +}
> +
> +/*
> + * Returns true if the mm is transitioning from a CPU-local ASID to a global
> + * (INVLPGB) ASID, or the other way around.
> + */
> +static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
> +{
> + u16 global_asid = mm_global_asid(next);
> +
> + if (global_asid && prev_asid != global_asid)
> + return true;
> +
> + if (!global_asid && is_global_asid(prev_asid))
> + return true;
> +
> + return false;
> +}
> +
> +void destroy_context_free_global_asid(struct mm_struct *mm)
> +{
> + if (!mm->context.global_asid)
> + return;
> +
> + guard(raw_spinlock_irqsave)(&global_asid_lock);
> +
> + /* The global ASID can be re-used only after flush at wrap-around. */
> + __set_bit(mm->context.global_asid, global_asid_freed);
> +
> + mm->context.global_asid = 0;
> + global_asid_available++;
> +}
> +
> +/*
> + * Check whether a process is currently active on more than "threshold" CPUs.
> + * This is a cheap estimation on whether or not it may make sense to assign
> + * a global ASID to this process, and use broadcast TLB invalidation.
> + */
> +static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
> +{
> + int count = 0;
> + int cpu;
> +
> + /* This quick check should eliminate most single threaded programs. */
> + if (cpumask_weight(mm_cpumask(mm)) <= threshold)
> + return false;
> +
> + /* Slower check to make sure. */
> + for_each_cpu(cpu, mm_cpumask(mm)) {
> + /* Skip the CPUs that aren't really running this process. */
> + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
> + continue;

Do you really want loaded_mm to be accessed from other cores? Does this
really provide a worthwhile benefit?

Why not just use cpumask_weight() and be done with it? It is a heuristic anyway.
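
i.e. just (untested):

static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
{
	/*
	 * mm_cpumask() may over-count (lazy CPUs, CPUs that have switched
	 * away), but this is only a heuristic for handing out global ASIDs.
	 */
	return cpumask_weight(mm_cpumask(mm)) > threshold;
}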

> +
> + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
> + continue;
> +
> + if (++count > threshold)
> + return true;
> + }
> + return false;
> +}
> +
> +/*
> + * Assign a global ASID to the current process, protecting against
> + * races between multiple threads in the process.
> + */
> +static void use_global_asid(struct mm_struct *mm)
> +{
> + guard(raw_spinlock_irqsave)(&global_asid_lock);
> +
> + /* This process is already using broadcast TLB invalidation. */
> + if (mm->context.global_asid)
> + return;
> +
> + /* The last global ASID was consumed while waiting for the lock. */
> + if (!global_asid_available)

I think "global_asid_available > 0” would make more sense.

> + return;
> +
> + /*
> + * The transition from IPI TLB flushing, with a dynamic ASID,
> + * and broadcast TLB flushing, using a global ASID, uses memory
> + * ordering for synchronization.
> + *
> + * While the process has threads still using a dynamic ASID,
> + * TLB invalidation IPIs continue to get sent.
> + *
> + * This code sets asid_transition first, before assigning the
> + * global ASID.
> + *
> + * The TLB flush code will only verify the ASID transition
> + * after it has seen the new global ASID for the process.
> + */
> + WRITE_ONCE(mm->context.asid_transition, true);

I would prefer an explicit smp_wmb() here, with a comment documenting where the
matching smp_rmb() (or smp_mb()) is.
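
Roughly this (only an illustration; the reader side is wherever the flush path
ends up reading both fields):

	/* Writer, in use_global_asid(), under global_asid_lock: */
	mm->context.asid_transition = true;
	smp_wmb();	/* pairs with the smp_rmb() on the reader side */
	WRITE_ONCE(mm->context.global_asid, get_global_asid());

	/* Reader, in the flush path: */
	u16 asid = mm_global_asid(mm);
	smp_rmb();	/* pairs with the smp_wmb() in use_global_asid() */
	bool transition = READ_ONCE(mm->context.asid_transition);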

> + WRITE_ONCE(mm->context.global_asid, get_global_asid());
> +
> + global_asid_available--;
> +}
> +
> +/*
> + * Figure out whether to assign a global ASID to a process.
> + * We vary the threshold by how empty or full global ASID space is.
> + * 1/4 full: >= 4 active threads
> + * 1/2 full: >= 8 active threads
> + * 3/4 full: >= 16 active threads
> + * 7/8 full: >= 32 active threads
> + * etc
> + *
> + * This way we should never exhaust the global ASID space, even on very
> + * large systems, and the processes with the largest number of active
> + * threads should be able to use broadcast TLB invalidation.
> + */
> +#define HALFFULL_THRESHOLD 8
> +static bool meets_global_asid_threshold(struct mm_struct *mm)
> +{
> + int avail = global_asid_available;
> + int threshold = HALFFULL_THRESHOLD;
> +
> + if (!avail)
> + return false;
> +
> + if (avail > MAX_ASID_AVAILABLE * 3 / 4) {
> + threshold = HALFFULL_THRESHOLD / 4;
> + } else if (avail > MAX_ASID_AVAILABLE / 2) {
> + threshold = HALFFULL_THRESHOLD / 2;
> + } else if (avail < MAX_ASID_AVAILABLE / 3) {
> + do {
> + avail *= 2;
> + threshold *= 2;
> + } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2);
> + }
> +
> + return mm_active_cpus_exceeds(mm, threshold);
> +}
> +
> +static void consider_global_asid(struct mm_struct *mm)
> +{
> + if (!static_cpu_has(X86_FEATURE_INVLPGB))
> + return;
> +
> + /* Check every once in a while. */
> + if ((current->pid & 0x1f) != (jiffies & 0x1f))
> + return;
> +
> + if (meets_global_asid_threshold(mm))
> + use_global_asid(mm);
> +}
> +
> +static void finish_asid_transition(struct flush_tlb_info *info)
> +{
> + struct mm_struct *mm = info->mm;
> + int bc_asid = mm_global_asid(mm);
> + int cpu;
> +
> + if (!READ_ONCE(mm->context.asid_transition))
> + return;
> +
> + for_each_cpu(cpu, mm_cpumask(mm)) {
> + /*
> + * The remote CPU is context switching. Wait for that to
> + * finish, to catch the unlikely case of it switching to
> + * the target mm with an out of date ASID.
> + */
> + while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
> + cpu_relax();

Although this code should rarely run, it seems bad for a couple of reasons:

1. It is a new busy-wait in a very delicate place. Lockdep is blind to this
change.

2. cpu_tlbstate is supposed to be private to each core - that's why
cpu_tlbstate_shared exists. I really think loaded_mm should be kept
private.

Can't we just do one TLB shootdown if
cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids?
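
i.e. something along these lines (untested, and when exactly asid_transition can be
cleared probably needs another look):

static void finish_asid_transition(struct flush_tlb_info *info)
{
	struct mm_struct *mm = info->mm;

	if (!READ_ONCE(mm->context.asid_transition))
		return;

	/*
	 * Another CPU may still be running this mm with a dynamic ASID;
	 * one IPI flush moves any stragglers over to the global ASID.
	 */
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_multi(mm_cpumask(mm), info);

	WRITE_ONCE(mm->context.asid_transition, false);
}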


> +
> + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
> + continue;
> +
> + /*
> + * If at least one CPU is not using the global ASID yet,
> + * send a TLB flush IPI. The IPI should cause stragglers
> + * to transition soon.
> + *
> + * This can race with the CPU switching to another task;
> + * that results in a (harmless) extra IPI.
> + */
> + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
> + flush_tlb_multi(mm_cpumask(info->mm), info);
> + return;
> + }
> + }
> +
> + /* All the CPUs running this process are using the global ASID. */

I guess this is ordered against the flushes above (the flushes must complete before
the transition is marked as done).

> + WRITE_ONCE(mm->context.asid_transition, false);
> +}
> +
> +static void broadcast_tlb_flush(struct flush_tlb_info *info)
> +{
> + bool pmd = info->stride_shift == PMD_SHIFT;
> + unsigned long maxnr = invlpgb_count_max;
> + unsigned long asid = info->mm->context.global_asid;
> + unsigned long addr = info->start;
> + unsigned long nr;
> +
> + /* Flushing multiple pages at once is not supported with 1GB pages. */
> + if (info->stride_shift > PMD_SHIFT)
> + maxnr = 1;
> +
> + /*
> + * TLB flushes with INVLPGB are kicked off asynchronously.
> + * The inc_mm_tlb_gen() guarantees page table updates are done
> + * before these TLB flushes happen.
> + */
> + if (info->end == TLB_FLUSH_ALL) {
> + invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
> + /* Do any CPUs supporting INVLPGB need PTI? */
> + if (static_cpu_has(X86_FEATURE_PTI))
> + invlpgb_flush_single_pcid_nosync(user_pcid(asid));
> + } else do {
> + /*
> + * Calculate how many pages can be flushed at once; if the
> + * remainder of the range is less than one page, flush one.
> + */
> + nr = min(maxnr, (info->end - addr) >> info->stride_shift);
> + nr = max(nr, 1);
> +
> + invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
> + /* Do any CPUs supporting INVLPGB need PTI? */
> + if (static_cpu_has(X86_FEATURE_PTI))
> + invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
> + addr += nr << info->stride_shift;
> + } while (addr < info->end);

I would have preferred a for loop instead of the do/while...
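
e.g. (untested):

	} else {
		unsigned long addr, nr;

		for (addr = info->start; addr < info->end;
		     addr += nr << info->stride_shift) {
			nr = min(maxnr, (info->end - addr) >> info->stride_shift);
			nr = max(nr, 1UL);

			invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
			if (static_cpu_has(X86_FEATURE_PTI))
				invlpgb_flush_user_nr_nosync(user_pcid(asid),
							     addr, nr, pmd);
		}
	}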

> +
> + finish_asid_transition(info);
> +
> + /* Wait for the INVLPGBs kicked off above to finish. */
> + tlbsync();
> +}
> +#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */
> +
> /*
> * Given an ASID, flush the corresponding user ASID. We can delay this
> * until the next time we switch to it.
> @@ -556,8 +857,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> */
> if (prev == next) {
> /* Not actually switching mm's */
> - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> - next->context.ctx_id);
> + VM_WARN_ON(is_dyn_asid(prev_asid) &&
> + this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> + next->context.ctx_id);
>
> /*
> * If this races with another thread that enables lam, 'new_lam'
> @@ -573,6 +875,23 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> !cpumask_test_cpu(cpu, mm_cpumask(next))))
> cpumask_set_cpu(cpu, mm_cpumask(next));
>
> + /*
> + * Check if the current mm is transitioning to a new ASID.
> + */
> + if (needs_global_asid_reload(next, prev_asid)) {
> + next_tlb_gen = atomic64_read(&next->context.tlb_gen);
> +
> + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
> + goto reload_tlb;
> + }
> +
> + /*
> + * Broadcast TLB invalidation keeps this PCID up to date
> + * all the time.
> + */
> + if (is_global_asid(prev_asid))
> + return;
> +
> /*
> * If the CPU is not in lazy TLB mode, we are just switching
> * from one thread in a process to another thread in the same
> @@ -606,6 +925,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> */
> cond_mitigation(tsk);
>
> + /*
> + * Let nmi_uaccess_okay() and finish_asid_transition()
> + * know that we're changing CR3.
> + */
> + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
> + barrier();
> +
> /*
> * Leave this CPU in prev's mm_cpumask. Atomic writes to
> * mm_cpumask can be expensive under contention. The CPU
> @@ -620,14 +946,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> next_tlb_gen = atomic64_read(&next->context.tlb_gen);
>
> choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
> -
> - /* Let nmi_uaccess_okay() know that we're changing CR3. */
> - this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
> - barrier();
> }
>
> +reload_tlb:
> new_lam = mm_lam_cr3_mask(next);
> if (need_flush) {
> + VM_BUG_ON(is_global_asid(new_asid));
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
> load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
> @@ -746,7 +1070,7 @@ static void flush_tlb_func(void *info)
> const struct flush_tlb_info *f = info;
> struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
> u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> - u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
> + u64 local_tlb_gen;
> bool local = smp_processor_id() == f->initiating_cpu;
> unsigned long nr_invalidate = 0;
> u64 mm_tlb_gen;
> @@ -769,6 +1093,16 @@ static void flush_tlb_func(void *info)
> if (unlikely(loaded_mm == &init_mm))
> return;
>
> + /* Reload the ASID if transitioning into or out of a global ASID */
> + if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) {
> + switch_mm_irqs_off(NULL, loaded_mm, NULL);
> + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> + }
> +
> + /* Broadcast ASIDs are always kept up to date with INVLPGB. */
> + if (is_global_asid(loaded_mm_asid))
> + return;
> +
> VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
> loaded_mm->context.ctx_id);
>
> @@ -786,6 +1120,8 @@ static void flush_tlb_func(void *info)
> return;
> }
>
> + local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
> +
> if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
> f->new_tlb_gen <= local_tlb_gen)) {
> /*
> @@ -953,7 +1289,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
> * up on the new contents of what used to be page tables, while
> * doing a speculative memory access.
> */
> - if (info->freed_tables)
> + if (info->freed_tables || in_asid_transition(info))
> on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
> else
> on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
> @@ -1049,9 +1385,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
> * a local TLB flush is needed. Optimize this use-case by calling
> * flush_tlb_func_local() directly in this case.
> */
> - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {

I think an smp_rmb() here would communicate the fact that in_asid_transition() and
mm_global_asid() must be ordered.
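
One way to express it (assuming the smp_wmb() I asked for above):

	if (mm_global_asid(mm)) {
		/*
		 * Pairs with the smp_wmb() in use_global_asid(): the read of
		 * asid_transition in finish_asid_transition() must not be
		 * reordered before the read of the global ASID above.
		 */
		smp_rmb();
		broadcast_tlb_flush(info);
	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
		...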

> + if (mm_global_asid(mm)) {
> + broadcast_tlb_flush(info);
> + } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
> info->trim_cpumask = should_trim_cpumask(mm);
> flush_tlb_multi(mm_cpumask(mm), info);
> + consider_global_asid(mm);
> } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
> lockdep_assert_irqs_enabled();
> local_irq_disable();
> --
> 2.47.1
>