[PATCH 3/4] perf, x86: Use counter freezing with Arch Perfmon v4

From: Andi Kleen
Date: Thu Oct 15 2015 - 19:39:30 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

Now that we have reliable LBR unfreezing we can also use
real counter freezing.

Arch Perfmon v4 has an improved counter freezing implementation.

With counter freezing the PMU automatically "freezes" all counters
on a counter overflow, so that the PMI handler does not need to
explicitly disable the counter.

With arch perfmon 4 the freeze bits are explicit bits in GLOBAL_STATUS
now, which avoids a range of races in the previous implementation,
and also allows avoiding an extra write to GLOBAL_CTRL.

Advantages of counter freezes:
- It avoids a couple of costly extra MSR writes in the PMI handler
- It makes the PMI handler more accurate, as all counters get
frozen atomically as soon as any counter overflows. So there is
much less counting of the PMI handler itself.

With the freezing we don't need to disable or enable counters or PEBS. Only
BTS which does not support auto-freezing still needs to be explicitly
disabled.

The "status counter ack" serves as the reenable action, by clearing
the freeze bits in the status register.

So previously for a PEBS counter the PMI would do (each line a MSR access)

disable global ctrl
disable pebs
read status
ack status
read status again
reenable global ctrl
reenable pebs

(5x WRMSR, 2x RDMSR)

With the new counter freezing support this is simplified to:

read status
ack

(1x WRMSR, 1x RDMSR)

The counter freezing code is only used when the CPU model has
opted into the new-style "status ack after APIC write" sequence.
So it's currently only used on Skylake.

One issue is that the hardware doesn't like changing the period in
freeze mode. To avoid any issues here we only use counter freezing
when all counters are not in frequency mode, but have a fixed period.
This is kept track of with new per CPU state.

In frequency mode the old mode is still used.

Performance:

When profiling a kernel build on Skylake with different perf options,
measuring the length of all NMI handlers using the nmi handler trace point:

(lower is better)

perf options                    avg     max     delta
- 962 37248
-c 100000 445 31217 -53% with counter freezing
-g 1753 47312
-g -c 100000 966 33698 -44% with counter freezing
--call-graph lbr 3770 37164
--call-graph lbr -c 100000 2433 36930 -35% with counter freezing
--c.g. dwarf 2055 33598
--c.g. dwarf -c 100000 1478 30491 -28% with counter freezing

So the average cost of a NMI handler is cut down significantly with
freezing.

At least on this workload this makes -g competitive with the previous
non -g.

The max cost isn't really improved, since that is dominated by
other overhead.

Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/kernel/cpu/perf_event.h | 5 +++
arch/x86/kernel/cpu/perf_event_intel.c | 79 +++++++++++++++++++++++++++++++++-
3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 54390bc..a527e77 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -140,6 +140,7 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)

#define MSR_PEBS_FRONTEND 0x000003f7

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fcf01c7..2cbba8c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -240,6 +240,11 @@ struct cpu_hw_events {
int excl_thread_id; /* 0 or 1 */

/*
+ * Counter freezing state.
+ */
+ int frozen_enabled;
+
+ /*
* AMD specific bits
*/
struct amd_nb *amd_nb;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a466055..4c46cd7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1596,6 +1596,25 @@ static void intel_pmu_nhm_enable_all(int added)
intel_pmu_enable_all(added);
}

+static inline bool event_can_freeze(struct perf_event *event)
+{
+ if (!x86_pmu.status_ack_after_apic)
+ return false;
+ return !event->attr.freq;
+}
+
+static void enable_counter_freeze(void)
+{
+ update_debugctlmsr(get_debugctlmsr() |
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
+static void disable_counter_freeze(void)
+{
+ update_debugctlmsr(get_debugctlmsr() &
+ ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
+}
+
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -1649,6 +1668,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);

+ /*
+ * We could disable freezing here, but doesn't hurt if it's on.
+ * perf remembers the state, and someone else will likely
+ * reinitialize.
+ *
+ * This avoids an extra MSR write in many situations.
+ */
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
@@ -1715,6 +1742,26 @@ static void intel_pmu_enable_event(struct perf_event *event)
if (event->attr.exclude_guest)
cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);

+ if (x86_pmu.version >= 4) {
+ /*
+ * Enable freezing if this event is suitable for freezing,
+ * and no other event is in frequency mode.
+ * Otherwise disable freezing for everyone.
+ */
+ if (event_can_freeze(event) && event->ctx->nr_freq == 0) {
+ if (!cpuc->frozen_enabled) {
+ enable_counter_freeze();
+ cpuc->frozen_enabled = 1;
+ }
+ } else if (cpuc->frozen_enabled) {
+ /* Disable freezing if it's on */
+ intel_pmu_disable_all();
+ cpuc->frozen_enabled = 0;
+ disable_counter_freeze();
+ intel_pmu_enable_all(0);
+ }
+ }
+
if (unlikely(event_is_checkpointed(event)))
cpuc->intel_cp_status |= (1ull << hwc->idx);

@@ -1800,16 +1847,29 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
u64 orig_status;
int handled;
+ bool freeze;

cpuc = this_cpu_ptr(&cpu_hw_events);

/*
+ * With counter freezing the CPU freezes counters on PMI.
+ * This makes measurements more accurate and generally has
+ * lower overhead, as we need to change less registers.
+ *
+ * We only freeze when all events are in fixed period mode.
+ */
+ freeze = cpuc->frozen_enabled > 0;
+
+ /*
* No known reason to not always do late ACK,
* but just in case do it opt-in.
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
- __intel_pmu_disable_all();
+ if (!freeze)
+ __intel_pmu_disable_all();
+ else
+ intel_pmu_maybe_disable_bts();
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
status = intel_pmu_get_status();
@@ -1918,7 +1978,10 @@ done:
*/
if (x86_pmu.status_ack_after_apic) {
intel_pmu_ack_status(orig_status);
- __intel_pmu_enable_all(0, true);
+ if (!freeze)
+ __intel_pmu_enable_all(0, true);
+ else
+ intel_pmu_maybe_enable_bts();
}
return handled;
}
@@ -2908,6 +2971,11 @@ static void intel_pmu_cpu_dying(int cpu)
free_excl_cntrs(cpu);

fini_debug_store_on_cpu(cpu);
+
+ if (cpuc->frozen_enabled) {
+ cpuc->frozen_enabled = 0;
+ disable_counter_freeze();
+ }
}

static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3646,6 +3714,13 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}

+ /*
+ * For arch perfmon 4 use counter freezing to avoid
+ * several MSR accesses in the PMI.
+ */
+ if (x86_pmu.version >= 4 && x86_pmu.status_ack_after_apic)
+ pr_cont("counter freezing, ");
+
return 0;
}

--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/