[PATCH RFC v4] x86,mm,sched: make lazy TLB mode even lazier
From: Rik van Riel
Date: Tue Aug 30 2016 - 15:53:47 EST
On Sat, 27 Aug 2016 16:02:25 -0700
Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> The only remaining comment is that I'd make that
> lazy_tlb_can_skip_flush() function just use a switch table for the
> tlbstate comparisons rather than the repeated conditionals.
After staring at the code for an hour or so yesterday, I found a race
condition. It took me a few minutes to realize we can fix it with a
cmpxchg at context switch time, and then most of a day to realize that
we only need that cmpxchg in the context switch code if the old TLB
state is TLBSTATE_LAZY.
Context switch times when the TLB state is anything else should be
unaffected.
The 4th version of the patch (below) closes that race condition, and
includes the improvements suggested by Ingo and you.
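To make the handshake easier to review, here is a minimal userspace
model of the two cmpxchg sides (C11 atomics; the helper names below
are made up and none of this is kernel code, it just mirrors the
logic of the patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { TLBSTATE_OK = 1, TLBSTATE_LAZY = 2, TLBSTATE_FLUSH = 3 };

static _Atomic int tlbstate = TLBSTATE_LAZY;

/* Flush side: try to defer the flush instead of sending an IPI. */
static bool skip_flush_ipi(void)
{
	int old = TLBSTATE_LAZY;

	/*
	 * Only defer if the CPU is still lazy. Losing the cmpxchg to a
	 * concurrent context switch back to TLBSTATE_OK means the IPI
	 * must be sent after all; finding TLBSTATE_FLUSH means a flush
	 * is already pending, so the IPI can be skipped.
	 */
	if (atomic_compare_exchange_strong(&tlbstate, &old, TLBSTATE_FLUSH))
		return true;
	return old == TLBSTATE_FLUSH;
}

/* Context switch side: leave lazy mode, honoring a deferred flush. */
static void leave_lazy_mode(void)
{
	int old = TLBSTATE_LAZY;

	/* Catch a concurrent TLBSTATE_LAZY -> TLBSTATE_FLUSH transition. */
	if (!atomic_compare_exchange_strong(&tlbstate, &old, TLBSTATE_OK))
		atomic_store(&tlbstate, TLBSTATE_OK);

	if (old == TLBSTATE_FLUSH)
		printf("performing the deferred TLB flush\n");
}

int main(void)
{
	/* The flush side wins the race: IPI skipped, flush deferred. */
	printf("IPI skipped: %d\n", skip_flush_ipi());
	leave_lazy_mode();
	return 0;
}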
> I'd love to see the results from Benjamin - maybe it helps a lot, and
> maybe it doesn't. But regardless, the patch makes sense to me.
I would love to see test results from Ben, as well. Ben? :)
---8<---
Subject: x86,mm,sched: make lazy TLB mode even lazier
Lazy TLB mode can result in an idle CPU being woken up for a TLB
flush, when all it really needs to do is reload %CR3 (and thereby
flush the TLB) at the next context switch.
This is mostly fine on bare metal, though sub-optimal from a power
saving point of view, and deeper C-states could make TLB flushes
take a little longer than desired.
On virtual machines, the pain can be much worse, especially if a
currently non-running VCPU has to be woken up to handle a TLB
invalidation IPI while its host CPU is busy running another task.
It can take a while before that IPI is handled, leading to
performance problems.
This patch deals with the issue by introducing a third TLB state,
TLBSTATE_FLUSH, which causes the TLB to be flushed (via a %CR3
reload) at the next context switch.
A CPU that transitions from TLBSTATE_LAZY to TLBSTATE_OK during
the attempted transition to TLBSTATE_FLUSH will get a TLB flush
IPI, just like a CPU that was in TLBSTATE_OK to begin with.
Nothing is done for a CPU that is already in TLBSTATE_FLUSH mode.
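For reference, the flush side (lazy_tlb_can_skip_flush below) makes
the following decision based on the remote CPU's TLB state:

  TLBSTATE_OK     send the TLB flush IPI as before
  TLBSTATE_LAZY   cmpxchg to TLBSTATE_FLUSH; if that loses against a
                  concurrent switch back to TLBSTATE_OK, send the IPI
  TLBSTATE_FLUSH  do nothing, a flush is already pending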
Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
Reported-by: Benjamin Serebrin <serebrin@xxxxxxxxxx>
---
arch/x86/include/asm/tlbflush.h | 3 +-
arch/x86/include/asm/uv/uv.h | 6 ++--
arch/x86/mm/tlb.c | 64 ++++++++++++++++++++++++++++++++++++++---
arch/x86/platform/uv/tlb_uv.c | 2 +-
4 files changed, 66 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94e079a..c3dbacbc49be 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -304,12 +304,13 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
#define flush_tlb() flush_tlb_current_task()
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_others(struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
+#define TLBSTATE_FLUSH 3
static inline void reset_lazy_tlbstate(void)
{
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 062921ef34e9..7e83cc633ba1 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -13,7 +13,7 @@ extern int is_uv_system(void);
extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
-extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+extern struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end,
@@ -25,8 +25,8 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
static inline int is_uv_system(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
+static inline struct cpumask *
+uv_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned int cpu)
{ return cpumask; }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5643fd0b1a7d..634248b38db9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,10 +140,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
#ifdef CONFIG_SMP
else {
+ int *tlbstate = this_cpu_ptr(&cpu_tlbstate.state);
+ int oldstate = *tlbstate;
+
+ if (unlikely(oldstate == TLBSTATE_LAZY)) {
+ /*
+ * The TLB flush code (lazy_tlb_can_skip_flush) can
+ * move the TLB state to TLBSTATE_FLUSH concurrently
+ * with a context switch. Using cmpxchg here will catch
+ * that transition, causing a TLB flush below.
+ */
+ oldstate = cmpxchg(tlbstate, oldstate, TLBSTATE_OK);
+ }
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ if (oldstate == TLBSTATE_FLUSH ||
+ !cpumask_test_cpu(cpu, mm_cpumask(next))) {
/*
* On established mms, the mm_cpumask is only changed
* from irq context, from ptep_clear_flush() while in
@@ -242,11 +256,44 @@ static void flush_tlb_func(void *info)
}
-void native_flush_tlb_others(const struct cpumask *cpumask,
+/*
+ * Determine whether a CPU's TLB needs to be flushed now, or whether the
+ * flush can be delayed until the next context switch, by changing the
+ * tlbstate from TLBSTATE_LAZY to TLBSTATE_FLUSH.
+ */
+static bool lazy_tlb_can_skip_flush(int cpu)
+{
+ int *tlbstate = &per_cpu(cpu_tlbstate.state, cpu);
+ int old;
+
+ switch (*tlbstate) {
+ case TLBSTATE_FLUSH:
+ /* The TLB will be flushed on the next context switch. */
+ return true;
+ case TLBSTATE_LAZY:
+ /*
+ * The CPU is in TLBSTATE_LAZY, which could context switch back
+ * to TLBSTATE_OK, re-using the old TLB state without a flush.
+ * If that happened, send a TLB flush IPI.
+ *
+ * Otherwise, the state is now TLBSTATE_FLUSH, and TLB will
+ * be flushed at the next context switch. Skip the IPI.
+ */
+ old = cmpxchg(tlbstate, TLBSTATE_LAZY, TLBSTATE_FLUSH);
+ return old != TLBSTATE_OK;
+ case TLBSTATE_OK:
+ default:
+ /* A task on the CPU is actively using the mm. Flush the TLB. */
+ return false;
+ }
+}
+
+void native_flush_tlb_others(struct cpumask *cpumask,
struct mm_struct *mm, unsigned long start,
unsigned long end)
{
struct flush_tlb_info info;
+ unsigned int cpu;
if (end == 0)
end = start + PAGE_SIZE;
@@ -262,8 +309,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(end - start) >> PAGE_SHIFT);
if (is_uv_system()) {
- unsigned int cpu;
-
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
if (cpumask)
@@ -271,6 +316,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
&info, 1);
return;
}
+
+ /*
+ * Instead of sending IPIs to CPUs in lazy TLB mode, move that
+ * CPU's TLB state to TLBSTATE_FLUSH, causing the TLB to be flushed
+ * at the next context switch.
+ */
+ for_each_cpu(cpu, cpumask) {
+ if (lazy_tlb_can_skip_flush(cpu))
+ cpumask_clear_cpu(cpu, cpumask);
+ }
+
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index fdb4d42b4ce5..7a2221a81e77 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1090,7 +1090,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* Returns pointer to cpumask if some remote flushing remains to be
* done. The returned pointer is valid till preemption is re-enabled.
*/
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end,