Re: [PATCH 07/10] x86,mm: enable broadcast TLB invalidation for multi-threaded processes

From: Peter Zijlstra
Date: Sun Dec 22 2024 - 06:36:22 EST


On Sat, Dec 21, 2024 at 11:06:39PM -0500, Rik van Riel wrote:

> +#ifdef CONFIG_CPU_SUP_AMD
> +/*
> + * Logic for AMD INVLPGB support.
> + */
> +static DEFINE_SPINLOCK(broadcast_asid_lock);

RAW_SPINLOCK ?
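i.e., given the guard(raw_spinlock_irqsave) suggestions below, presumably:

	static DEFINE_RAW_SPINLOCK(broadcast_asid_lock);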

> +static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS;
> +static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 };
> +static LIST_HEAD(broadcast_asid_list);
> +static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
> +
> +static void reset_broadcast_asid_space(void)
> +{
> + mm_context_t *context;
> +
> + assert_spin_locked(&broadcast_asid_lock);

lockdep_assert_held(&broadcast_asid_lock);

> +
> + /*
> + * Flush once when we wrap around the ASID space, so we won't need
> + * to flush every time we allocate an ASID for broadcast flushing.
> + */
> + invlpgb_flush_all_nonglobals();
> + tlbsync();
> +
> + /*
> + * Leave the currently used broadcast ASIDs set in the bitmap, since
> + * those cannot be reused before the next wraparound and flush.
> + */
> + bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE);
> + list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list)
> + __set_bit(context->broadcast_asid, broadcast_asid_used);
> +
> + last_broadcast_asid = TLB_NR_DYN_ASIDS;
> +}
> +
> +static u16 get_broadcast_asid(void)
> +{
> + assert_spin_locked(&broadcast_asid_lock);

lockdep_assert_held()

> +
> + do {
> + u16 start = last_broadcast_asid;
> + u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start);
> +
> + if (asid >= MAX_ASID_AVAILABLE) {
> + reset_broadcast_asid_space();
> + continue;
> + }
> +
> + /* Try claiming this broadcast ASID. */
> + if (!test_and_set_bit(asid, broadcast_asid_used)) {
> + last_broadcast_asid = asid;
> + return asid;
> + }
> + } while (1);
> +}
> +
> +/*
> + * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast
> + * (INVLPGB) ASID, or the other way around.
> + */
> +static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
> +{
> + u16 broadcast_asid = next->context.broadcast_asid;
> +
> + if (broadcast_asid && prev_asid != broadcast_asid) {
> + return true;
> + }
> +
> + if (!broadcast_asid && is_broadcast_asid(prev_asid)) {
> + return true;
> + }
> +
> + return false;
> +}

Those return statements don't really need {} on.
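i.e.:

	if (broadcast_asid && prev_asid != broadcast_asid)
		return true;

	if (!broadcast_asid && is_broadcast_asid(prev_asid))
		return true;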

> +
> +void destroy_context_free_broadcast_asid(struct mm_struct *mm) {

{ goes on a new line.

> + unsigned long flags;
> +
> + if (!mm->context.broadcast_asid)
> + return;
> +
> + spin_lock_irqsave(&broadcast_asid_lock, flags);

guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

> + mm->context.broadcast_asid = 0;
> + list_del(&mm->context.broadcast_asid_list);
> + broadcast_asid_available++;
> + spin_unlock_irqrestore(&broadcast_asid_lock, flags);
> +}
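Combined with the guard (and the brace fixed), the whole thing can then
read something like (untested):

	void destroy_context_free_broadcast_asid(struct mm_struct *mm)
	{
		if (!mm->context.broadcast_asid)
			return;

		guard(raw_spinlock_irqsave)(&broadcast_asid_lock);
		mm->context.broadcast_asid = 0;
		list_del(&mm->context.broadcast_asid_list);
		broadcast_asid_available++;
	}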
> +
> +static int mm_active_cpus(struct mm_struct *mm)
> +{
> + int count = 0;
> + int cpu;
> +
> + for_each_cpu(cpu, mm_cpumask(mm)) {
> + /* Skip the CPUs that aren't really running this process. */
> + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
> + continue;
> +
> + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
> + continue;
> +
> + count++;
> + }
> + return count;
> +}
> +
> +/*
> + * Assign a broadcast ASID to the current process, protecting against
> + * races between multiple threads in the process.
> + */
> +static void use_broadcast_asid(struct mm_struct *mm)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&broadcast_asid_lock, flags);

guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

> +
> + /* This process is already using broadcast TLB invalidation. */
> + if (mm->context.broadcast_asid)
> + goto out_unlock;

return;

> + mm->context.broadcast_asid = get_broadcast_asid();
> + mm->context.asid_transition = true;
> + list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
> + broadcast_asid_available--;
> +
> +out_unlock:

Notably, we're really wanting to get away from the whole goto-unlock
pattern.

> + spin_unlock_irqrestore(&broadcast_asid_lock, flags);
> +}
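With the guard and an early return, the whole function collapses to
something like (untested):

	static void use_broadcast_asid(struct mm_struct *mm)
	{
		guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

		/* This process is already using broadcast TLB invalidation. */
		if (mm->context.broadcast_asid)
			return;

		mm->context.broadcast_asid = get_broadcast_asid();
		mm->context.asid_transition = true;
		list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
		broadcast_asid_available--;
	}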
> +
> +/*
> + * Figure out whether to assign a broadcast (global) ASID to a process.
> + * We vary the threshold by how empty or full broadcast ASID space is.
> + * 1/4 full: >= 4 active threads
> + * 1/2 full: >= 8 active threads
> + * 3/4 full: >= 16 active threads
> + * 7/8 full: >= 32 active threads
> + * etc
> + *
> + * This way we should never exhaust the broadcast ASID space, even on very
> + * large systems, and the processes with the largest number of active
> + * threads should be able to use broadcast TLB invalidation.

I'm a little confused; at most we need one ASID per CPU. IIRC we have
something like 4k ASIDs (the page-offset bits of the CR3 physical
address), so for anything with fewer than 4k CPUs we're good, but
anything with more CPUs is up a creek irrespective of the above scheme,
no?
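(Back of the envelope, assuming the usual x86 definitions: the PCID
field in CR3 is 12 bits, so 1 << 12 = 4096 ASIDs, minus
TLB_NR_DYN_ASIDS for the per-CPU dynamic ones, and half the space again
when PTI consumes a PCID bit.)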

> + */
> +#define HALFFULL_THRESHOLD 8
> +static bool meets_broadcast_asid_threshold(struct mm_struct *mm)
> +{
> + int avail = broadcast_asid_available;
> + int threshold = HALFFULL_THRESHOLD;
> + int mm_active_threads;
> +
> + if (!avail)
> + return false;
> +
> + mm_active_threads = mm_active_cpus(mm);
> +
> + /* Small processes can just use IPI TLB flushing. */
> + if (mm_active_threads < 3)
> + return false;
> +
> + if (avail > MAX_ASID_AVAILABLE * 3 / 4) {
> + threshold = HALFFULL_THRESHOLD / 4;
> + } else if (avail > MAX_ASID_AVAILABLE / 2) {
> + threshold = HALFFULL_THRESHOLD / 2;
> + } else if (avail < MAX_ASID_AVAILABLE / 3) {
> + do {
> + avail *= 2;
> + threshold *= 2;
> + } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2);
> + }
> +
> + return mm_active_threads > threshold;
> +}
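
FWIW, tracing that scaling with round numbers (assuming
MAX_ASID_AVAILABLE = 4096 for simplicity -- the real value is slightly
smaller -- so the /2 cutoff is 2048 and the /3 cutoff is 1365):

	avail = 3500: threshold = 2
	avail = 2500: threshold = 4
	avail = 1500: threshold = 8
	avail = 1200: threshold = 16	(one doubling)
	avail =  600: threshold = 32	(two doublings)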