[RFC PATCH v2 1/2] x86/perf/amd: Resolve race condition when disabling PMC

From: Lendacky, Thomas
Date: Fri Mar 15 2019 - 16:40:58 EST


On AMD processors, the detection of an overflowed counter in the NMI
handler relies on the current value of the counter. So, for example, to
check for overflow on a 48 bit counter, bit 47 is checked to see if it
is 1 (not overflowed) or 0 (overflowed).

There is currently a race condition present when disabling and then
updating the PMC. Increased NMI latency in newer AMD processors makes this
race condition more pronounced. If the counter value has overflowed, it is
possible to update the PMC value before the NMI handler can run. The
updated PMC value is not an overflowed value, so when the perf NMI handler
does run, it will not find an overflowed counter. This may appear as an
unknown NMI resulting in either a panic or a series of messages, depending
on how the kernel is configured.

To eliminate this race condition, the PMC value must be checked after
disabling the counter. Add an AMD function, amd_pmu_disable_all(), to be
used in place of the common x86_pmu_disable_all() that, after disabling
the counters, will wait for the NMI handler to reset any active and
enabled overflowed counter.

Cc: <stable@xxxxxxxxxxxxxxx> # 4.14.x-
Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
---
arch/x86/events/amd/core.c | 85 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 7d2d7c801dba..732402dcffdc 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -3,6 +3,7 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/delay.h>
#include <asm/apicdef.h>

#include "../perf_event.h"
@@ -429,6 +430,84 @@ static void amd_pmu_cpu_dead(int cpu)
}
}

+/*
+ * When a PMC counter overflows, an NMI is used to process the event and
+ * reset the counter. NMI latency can result in the counter being updated
+ * before the NMI can run, which can result in what appear to be spurious
+ * NMIs. This function is intended to wait for the NMI to run and reset
+ * the counter to avoid possible unhandled NMI messages.
+ */
+#define OVERFLOW_WAIT_COUNT 50
+static void amd_pmu_wait_on_overflow(int idx)
+{
+ unsigned int i;
+ u64 counter;
+
+ /*
+ * Wait for the counter to be reset if it has overflowed. This loop
+ * should exit very, very quickly, but just in case, don't wait
+ * forever...
+ */
+ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+ rdmsrl(x86_pmu_event_addr(idx), counter);
+ if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+ break;
+
+ /* Might be in IRQ context, so can't sleep */
+ udelay(1);
+ }
+}
+
+void amd_pmu_disable_all(void)
+{
+ unsigned long overflow_check[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ bitmap_zero(overflow_check, X86_PMC_IDX_MAX);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ rdmsrl(x86_pmu_config_addr(idx), val);
+ if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+ continue;
+
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu_config_addr(idx), val);
+
+ /*
+ * If the interrupt is enabled, this counter must be checked
+ * for an overflow condition to avoid possibly changing the
+ * counter value before the NMI handler runs.
+ */
+ if (val & ARCH_PERFMON_EVENTSEL_INT)
+ __set_bit(idx, overflow_check);
+ }
+
+ /*
+ * This shouldn't be called from NMI context, but add a safeguard here
+ * to return, since if we're in NMI context we can't wait for an NMI
+ * to reset an overflowed counter value.
+ */
+ if (in_nmi())
+ return;
+
+ /*
+ * Check each counter for overflow and wait for it to be reset by the
+ * NMI if it has overflowed.
+ */
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, overflow_check))
+ continue;
+
+ amd_pmu_wait_on_overflow(idx);
+ }
+}
+
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -622,7 +701,7 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config)
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
+ .disable_all = amd_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
@@ -732,7 +811,7 @@ void amd_pmu_enable_virt(void)
cpuc->perf_ctr_virt_mask = 0;

/* Reload all events */
- x86_pmu_disable_all();
+ amd_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
@@ -750,7 +829,7 @@ void amd_pmu_disable_virt(void)
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;

/* Reload all events */
- x86_pmu_disable_all();
+ amd_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);