[RFC PATCH 2/2] x86/perf/amd: Resolve NMI latency issues when multiple PMCs are active

From: Lendacky, Thomas
Date: Mon Mar 11 2019 - 12:48:56 EST


On AMD processors, the detection of an overflowed PMC counter in the NMI
handler relies on the current value of the PMC. For example, to check for
overflow on a 48-bit counter, bit 47 is checked to see whether it is 1
(not overflowed) or 0 (overflowed).
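
As an illustrative sketch (not part of this patch), the test amounts to
examining the top implemented bit of the counter value; the helper name
and the cntval_bits parameter below are made up for illustration:

  #include <stdbool.h>
  #include <stdint.h>

  /*
   * Illustrative only: report whether a PMC value indicates overflow.
   * cntval_bits is the number of implemented counter bits, so for a
   * 48-bit counter this tests bit 47.
   */
  static inline bool pmc_overflowed(uint64_t val, unsigned int cntval_bits)
  {
          /* Top implemented bit clear => the counter has wrapped */
          return !(val & (1ULL << (cntval_bits - 1)));
  }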

When the perf NMI handler executes, it does not know in advance which PMC
counters have overflowed, so it processes all active PMC counters that
have overflowed. On newer AMD processors, NMI latency can result in
multiple overflowed PMC counters being processed in a single NMI, after
which a subsequent NMI, one that does not appear to be a back-to-back NMI,
finds no overflowed PMC counters to process. That NMI appears to be
unhandled, resulting in either a panic or a series of unknown-NMI
messages, depending on how the kernel was configured.

To mitigate this issue, a new, optional x86_pmu callback is introduced
that is called before returning from x86_pmu_handle_irq(). The AMD perf
support uses this callback to indicate whether the NMI has been handled,
or would have been handled had an earlier NMI not already handled the
overflowed PMC. Whenever a PMC overflow is handled while more than one
PMC is active, a per-CPU variable is set to the smaller of (number of
active PMCs - 1) and 2; this bounds the number of latent NMIs that may
still arrive. The value of 2 accounts for an NMI that does not arrive at
the APIC in time to be collapsed into an already pending NMI. Each time
the callback is invoked without an overflowed counter having been
handled, the per-CPU value is checked: if it is non-zero, it is
decremented and the handler reports the NMI as handled; if it is zero,
the handler reports the NMI as not handled.
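
To make the bookkeeping concrete, the following is a minimal user-space
sketch of the scheme (illustrative only: the nmi() helper, the stand-in
NMI_DONE/NMI_HANDLED values and the plain global in place of the per-CPU
variable are simplifications, not kernel code):

  #include <stdio.h>

  #define NMI_DONE        0       /* stand-in: NMI not handled */
  #define NMI_HANDLED     1       /* stand-in: NMI handled */

  static unsigned int perf_nmi_counter;   /* models the per-CPU variable */

  /* Models one NMI: 'active' PMCs, 'handled' overflows processed */
  static int nmi(unsigned int active, int handled)
  {
          if (active <= 1)
                  return handled;

          if (handled) {
                  /* min(active - 1, 2) latent NMIs may still arrive */
                  perf_nmi_counter = active - 1 < 2 ? active - 1 : 2;
                  return handled;
          }

          if (!perf_nmi_counter)
                  return NMI_DONE;        /* genuinely unknown NMI */

          perf_nmi_counter--;
          return NMI_HANDLED;             /* claim the latent NMI */
  }

  int main(void)
  {
          /* Four active PMCs: one NMI handles two overflows, then
           * three "empty" NMIs arrive */
          printf("%d ", nmi(4, 2));  /* 2: counter set to min(3, 2) = 2 */
          printf("%d ", nmi(4, 0));  /* 1: NMI_HANDLED, counter 2 -> 1 */
          printf("%d ", nmi(4, 0));  /* 1: NMI_HANDLED, counter 1 -> 0 */
          printf("%d\n", nmi(4, 0)); /* 0: NMI_DONE, reported as unknown */
          return 0;
  }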

Cc: <stable@xxxxxxxxxxxxxxx> # 4.14.x-
Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
---
 arch/x86/events/amd/core.c   | 43 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/events/core.c       |  6 ++++++
 arch/x86/events/perf_event.h |  2 ++
 3 files changed, 51 insertions(+)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index d989640fa87d..30457b511e6d 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -5,9 +5,12 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <asm/apicdef.h>
+#include <asm/nmi.h>
 
 #include "../perf_event.h"
 
+static DEFINE_PER_CPU(unsigned int, perf_nmi_counter);
+
 static __initconst const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -467,6 +470,45 @@ static void amd_pmu_wait_on_overflow(int idx, u64 config)
 	}
 }

+/*
+ * Because of NMI latency, if multiple PMC counters are active we need to take
+ * into account that multiple PMC overflows can generate multiple NMIs but be
+ * handled by a single invocation of the NMI handler (think PMC overflow while
+ * in the NMI handler). This could result in subsequent unknown NMI messages
+ * being issued.
+ *
+ * Attempt to mitigate this by using the number of active PMCs to determine
+ * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset
+ * any PMCs. The per-CPU perf_nmi_counter variable is set to the smaller of
+ * (active PMCs - 1) and 2, i.e. min_t(unsigned int, 2, active - 1). The
+ * value of 2 accounts for an NMI that does not arrive at the APIC in time
+ * to be collapsed into an already pending NMI.
+ */
+static int amd_pmu_mitigate_nmi_latency(unsigned int active, int handled)
+{
+	/* If multiple counters are not active return original handled count */
+	if (active <= 1)
+		return handled;
+
+	/*
+	 * If a counter was handled, record the number of possible remaining
+	 * NMIs that can occur.
+	 */
+	if (handled) {
+		this_cpu_write(perf_nmi_counter,
+			       min_t(unsigned int, 2, active - 1));
+
+		return handled;
+	}
+
+	if (!this_cpu_read(perf_nmi_counter))
+		return NMI_DONE;
+
+	this_cpu_dec(perf_nmi_counter);
+
+	return NMI_HANDLED;
+}
+
 static struct event_constraint *
 amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
			  struct perf_event *event)
@@ -689,6 +731,7 @@ static __initconst const struct x86_pmu amd_pmu = {
 
 	.amd_nb_constraints = 1,
 	.wait_on_overflow = amd_pmu_wait_on_overflow,
+	.mitigate_nmi_latency = amd_pmu_mitigate_nmi_latency,
 };
 
 static int __init amd_core_pmu_init(void)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f1d2f70000cd..a59c3fcbae6a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1434,6 +1434,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc;
 	struct perf_event *event;
+	unsigned int active = 0;
 	int idx, handled = 0;
 	u64 val;

@@ -1461,6 +1462,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 			continue;
 		}
 
+		active++;
+
 		event = cpuc->events[idx];
 
 		val = x86_perf_event_update(event);
@@ -1483,6 +1486,9 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 	if (handled)
 		inc_irq_stat(apic_perf_irqs);
 
+	if (x86_pmu.mitigate_nmi_latency)
+		handled = x86_pmu.mitigate_nmi_latency(active, handled);
+
 	return handled;
 }

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a37490a26a09..619214bda92e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -637,6 +637,8 @@ struct x86_pmu {
 	 */
 	unsigned int amd_nb_constraints : 1;
 	void (*wait_on_overflow)(int idx, u64 config);
+	int (*mitigate_nmi_latency)(unsigned int active,
+				    int handled);
 
 	/*
 	 * Extra registers for events