Re: [PATCH 12/21] perf/x86/intel/lbr: Support Architectural LBR

From: Liang, Kan
Date: Fri Jun 19 2020 - 15:41:01 EST

On 6/19/2020 3:08 PM, Peter Zijlstra wrote:
On Fri, Jun 19, 2020 at 07:04:00AM -0700, kan.liang@xxxxxxxxxxxxxxx wrote:

+static void intel_pmu_arch_lbr_enable(bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_ctl = 0, orig_debugctl;
+
+ if (pmi)
+ return;
+
+ if (cpuc->lbr_ctl)
+ lbr_ctl = cpuc->lbr_ctl->config & x86_pmu.lbr_ctl_mask;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may be missed, which can lead to confusing results.
+ */
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ if (lbr_ctl & ARCH_LBR_CALL_STACK)
+ debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ else
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+
+ wrmsrl(MSR_ARCH_LBR_CTL, lbr_ctl | ARCH_LBR_CTL_LBREN);
+}

This is nearly a duplicate of the old one, surely we can do better?

It's similar, but we have to deal with different MSRs and bits.
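
To spell out the difference, roughly (a simplified sketch only, not the real
code; the helper names below are made up for illustration): the legacy path
programs the branch filter in MSR_LBR_SELECT and turns recording on with the
LBR bit in MSR_IA32_DEBUGCTLMSR, while Arch LBR keeps the filter bits and the
enable bit together in the new MSR_ARCH_LBR_CTL.

/* Legacy LBR: filter in MSR_LBR_SELECT, enable via DEBUGCTL.LBR */
static void legacy_lbr_enable_sketch(u64 lbr_select)
{
        u64 debugctl;

        wrmsrl(MSR_LBR_SELECT, lbr_select);
        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | DEBUGCTLMSR_LBR);
}

/* Arch LBR: filter bits and the enable bit live together in MSR_ARCH_LBR_CTL */
static void arch_lbr_enable_sketch(u64 lbr_ctl)
{
        wrmsrl(MSR_ARCH_LBR_CTL, lbr_ctl | ARCH_LBR_CTL_LBREN);
}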


+static void intel_pmu_arch_lbr_restore(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+ struct x86_perf_arch_lbr_entry *entries = task_ctx->entries;
+ int i;
+
+ /* Fast reset the LBRs before restore if the call stack is not full. */
+ if (!entries[x86_pmu.lbr_nr - 1].lbr_from)
+ intel_pmu_arch_lbr_reset();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!entries[i].lbr_from)
+ break;
+ wrlbr_from(i, entries[i].lbr_from);
+ wrlbr_to(i, entries[i].lbr_to);
+ wrmsrl(MSR_ARCH_LBR_INFO_0 + i, entries[i].lbr_info);
+ }
+}

This too looks very much like the old one.

The difference is the reset part.
On previous platforms, we restore the saved LBRs first, then reset the unsaved LBR MSRs one by one.
For Arch LBR, we have a fast reset method: if we know there are unsaved entries, we reset all LBRs in one shot first, then restore the saved LBRs.
That improves performance for applications with a short call stack.
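
For example, roughly (a simplified sketch, not the actual code; the "saved"
array, its layout, and the nr_saved bookkeeping are only for illustration):

/* Previous platforms: restore the saved entries, then zero the rest one by one. */
static void legacy_restore_sketch(struct lbr_entry *saved, int nr_saved)
{
        int i;

        for (i = 0; i < nr_saved; i++) {
                wrlbr_from(i, saved[i].from);
                wrlbr_to(i, saved[i].to);
        }
        for (; i < x86_pmu.lbr_nr; i++) {       /* per-MSR reset of unsaved entries */
                wrlbr_from(i, 0);
                wrlbr_to(i, 0);
        }
}

/*
 * Arch LBR: intel_pmu_arch_lbr_reset() clears every entry with a single
 * write to MSR_ARCH_LBR_DEPTH, so the whole stack is reset up front and
 * only the saved entries need to be written back.
 */
static void arch_lbr_restore_sketch(struct lbr_entry *saved, int nr_saved)
{
        int i;

        if (nr_saved < x86_pmu.lbr_nr)
                intel_pmu_arch_lbr_reset();     /* fast reset: one MSR write */

        for (i = 0; i < nr_saved; i++) {
                wrlbr_from(i, saved[i].from);
                wrlbr_to(i, saved[i].to);
        }
}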


+static void intel_pmu_arch_lbr_save(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+ struct x86_perf_arch_lbr_entry *entries = task_ctx->entries;
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ entries[i].lbr_from = rdlbr_from(i);
+ /* Only save valid branches. */
+ if (!entries[i].lbr_from)
+ break;
+ entries[i].lbr_to = rdlbr_to(i);
+ rdmsrl(MSR_ARCH_LBR_INFO_0 + i, entries[i].lbr_info);
+ }
+
+ /* LBR call stack is not full. Reset is required in restore. */
+ if (i < x86_pmu.lbr_nr)
+ entries[x86_pmu.lbr_nr - 1].lbr_from = 0;
+}

And again..

+static void __intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc, int index,
+ u64 from, u64 to, u64 info)
+{
+ u64 mis = 0, pred = 0, in_tx = 0, abort = 0, type = 0;
+ u32 br_type, to_plm;
+ u16 cycles = 0;
+
+ if (x86_pmu.arch_lbr_mispred) {
+ mis = !!(info & ARCH_LBR_INFO_MISPRED);
+ pred = !mis;
+ }
+ in_tx = !!(info & ARCH_LBR_INFO_IN_TSX);
+ abort = !!(info & ARCH_LBR_INFO_TSX_ABORT);
+ if (x86_pmu.arch_lbr_timed_lbr &&
+ (info & ARCH_LBR_INFO_CYC_CNT_VALID))
+ cycles = (info & ARCH_LBR_INFO_CYC_CNT);
+
+ /*
+ * Parse the branch type recorded in LBR_x_INFO MSR.
+ * Doesn't support OTHER_BRANCH decoding for now.
+ * The OTHER_BRANCH branch type still relies on software decoding.
+ */
+ if (x86_pmu.arch_lbr_br_type) {
+ br_type = (info & ARCH_LBR_INFO_BR_TYPE) >> ARCH_LBR_INFO_BR_TYPE_OFFSET;
+
+ if (br_type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) {
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ type = arch_lbr_br_type_map[br_type] | to_plm;
+ }
+ }
+
+ cpuc->lbr_entries[index].from = from;
+ cpuc->lbr_entries[index].to = to;
+ cpuc->lbr_entries[index].mispred = mis;
+ cpuc->lbr_entries[index].predicted = pred;
+ cpuc->lbr_entries[index].in_tx = in_tx;
+ cpuc->lbr_entries[index].abort = abort;
+ cpuc->lbr_entries[index].cycles = cycles;
+ cpuc->lbr_entries[index].type = type;
+ cpuc->lbr_entries[index].reserved = 0;
+}
+
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+{
+ u64 from, to, info;
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ from = rdlbr_from(i);
+ to = rdlbr_to(i);
+
+ /*
+ * Read LBR entries until invalid entry (0s) is detected.
+ */
+ if (!from)
+ break;
+
+ rdmsrl(MSR_ARCH_LBR_INFO_0 + i, info);
+
+ __intel_pmu_arch_lbr_read(cpuc, i, from, to, info);
+ }
+
+ cpuc->lbr_stack.nr = i;
+}

+static void intel_pmu_store_pebs_arch_lbrs(struct pebs_lbr *pebs_lbr,
+ struct cpu_hw_events *cpuc)
+{
+ struct pebs_lbr_entry *lbr;
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr = &pebs_lbr->lbr[i];
+
+ /*
+ * Read LBR entries until invalid entry (0s) is detected.
+ */
+ if (!lbr->from)
+ break;
+
+ __intel_pmu_arch_lbr_read(cpuc, i, lbr->from,
+ lbr->to, lbr->info);
+ }
+
+ cpuc->lbr_stack.nr = i;
+ intel_pmu_lbr_filter(cpuc);
+}

Unless I'm reading cross-eyed again, that too is very similar to what we
already had.


I will try to factor out the common code for these functions as much as possible.
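
For example, one direction could be a shared restore loop that takes the
LBR_INFO MSR base as a parameter (rough sketch only; the helper name and the
entry layout below are made up, not the final code):

static void __intel_pmu_lbr_restore_common(struct lbr_entry *entries,
                                           unsigned int info_msr_base)
{
        int i;

        for (i = 0; i < x86_pmu.lbr_nr; i++) {
                if (!entries[i].from)
                        break;
                wrlbr_from(i, entries[i].from);
                wrlbr_to(i, entries[i].to);
                wrmsrl(info_msr_base + i, entries[i].info);
        }
}

/*
 * Callers would pass MSR_LBR_INFO_0 or MSR_ARCH_LBR_INFO_0, and the Arch LBR
 * caller would do the fast reset before calling it.
 */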


Thanks,
Kan