Re: [PATCH v5 bpf-next 1/3] perf: enable branch record for software events

From: Peter Zijlstra
Date: Fri Sep 03 2021 - 04:04:15 EST


On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:
> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> +
> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> +	intel_pmu_enable_all(0);
> +	return cnt;
> +}

Would something like the below help get rid of that memcpy()?

(compile tested only)
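
FWIW, the snapshot helper quoted above could then read straight into a
caller-provided buffer and lose the memcpy() altogether. A rough sketch
(hypothetical, not part of the patch below; it assumes the caller hands
in a perf_branch_stack with room for x86_pmu.lbr_nr entries, instead of
the entries/cnt pair above):

static int
intel_pmu_snapshot_branch_stack(struct perf_branch_stack *bs)
{
	/* intel_pmu_lbr_read() may return early without touching bs */
	bs->nr = 0;

	intel_pmu_disable_all();
	intel_pmu_lbr_read(bs);	/* fills and filters bs->entries[] in place */
	intel_pmu_enable_all(0);

	return bs->nr;
}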

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7011e87be6d0..c508ba6099d2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2938,7 +2938,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 	loops = 0;
 again:
-	intel_pmu_lbr_read();
+	intel_pmu_lbr_read(&cpuc->lbr_stack);
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		static bool warned;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e6d6eaeb4cb..0a5bacba86c2 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -170,7 +170,7 @@ enum {
 
 #define ARCH_LBR_CTL_MASK		0x7f000e
 
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
 {
@@ -783,7 +783,7 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	u64 tos = intel_pmu_lbr_tos();
@@ -801,18 +801,18 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
-		cpuc->lbr_entries[i].from = msr_lastbranch.from;
-		cpuc->lbr_entries[i].to = msr_lastbranch.to;
-		cpuc->lbr_entries[i].mispred = 0;
-		cpuc->lbr_entries[i].predicted = 0;
-		cpuc->lbr_entries[i].in_tx = 0;
-		cpuc->lbr_entries[i].abort = 0;
-		cpuc->lbr_entries[i].cycles = 0;
-		cpuc->lbr_entries[i].type = 0;
-		cpuc->lbr_entries[i].reserved = 0;
+		bs->entries[i].from = msr_lastbranch.from;
+		bs->entries[i].to = msr_lastbranch.to;
+		bs->entries[i].mispred = 0;
+		bs->entries[i].predicted = 0;
+		bs->entries[i].in_tx = 0;
+		bs->entries[i].abort = 0;
+		bs->entries[i].cycles = 0;
+		bs->entries[i].type = 0;
+		bs->entries[i].reserved = 0;
 	}
-	cpuc->lbr_stack.nr = i;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = i;
+	bs->hw_idx = tos;
 }
 
 /*
@@ -820,7 +820,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	bool need_info = false, call_stack = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -896,19 +896,19 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		if (abort && x86_pmu.lbr_double_abort && out > 0)
 			out--;
 
-		cpuc->lbr_entries[out].from = from;
-		cpuc->lbr_entries[out].to = to;
-		cpuc->lbr_entries[out].mispred = mis;
-		cpuc->lbr_entries[out].predicted = pred;
-		cpuc->lbr_entries[out].in_tx = in_tx;
-		cpuc->lbr_entries[out].abort = abort;
-		cpuc->lbr_entries[out].cycles = cycles;
-		cpuc->lbr_entries[out].type = 0;
-		cpuc->lbr_entries[out].reserved = 0;
+		bs->entries[out].from = from;
+		bs->entries[out].to = to;
+		bs->entries[out].mispred = mis;
+		bs->entries[out].predicted = pred;
+		bs->entries[out].in_tx = in_tx;
+		bs->entries[out].abort = abort;
+		bs->entries[out].cycles = cycles;
+		bs->entries[out].type = 0;
+		bs->entries[out].reserved = 0;
 		out++;
 	}
-	cpuc->lbr_stack.nr = out;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = out;
+	bs->hw_idx = tos;
 }
 
 static __always_inline int get_lbr_br_type(u64 info)
@@ -945,7 +945,8 @@ static __always_inline u16 get_lbr_cycles(u64 info)
 }
 
 static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
-				struct lbr_entry *entries)
+				struct lbr_entry *entries,
+				struct perf_branch_stack *bs)
 {
 	struct perf_branch_entry *e;
 	struct lbr_entry *lbr;
@@ -954,7 +955,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		lbr = entries ? &entries[i] : NULL;
-		e = &cpuc->lbr_entries[i];
+		e = &bs->entries[i];
 
 		from = rdlbr_from(i, lbr);
 		/*
@@ -977,28 +978,28 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 		e->reserved = 0;
 	}
 
-	cpuc->lbr_stack.nr = i;
+	bs->nr = i;
 }
 
-static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
-	intel_pmu_store_lbr(cpuc, NULL);
+	intel_pmu_store_lbr(cpuc, NULL, bs);
 }
 
-static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
 
 	if (!xsave) {
-		intel_pmu_store_lbr(cpuc, NULL);
+		intel_pmu_store_lbr(cpuc, NULL, bs);
 		return;
 	}
 	xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
 
-	intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+	intel_pmu_store_lbr(cpuc, xsave->lbr.entries, bs);
 }
 
-void intel_pmu_lbr_read(void)
+void intel_pmu_lbr_read(struct perf_branch_stack *bs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1012,9 +1013,8 @@ void intel_pmu_lbr_read(void)
 	    cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;
 
-	x86_pmu.lbr_read(cpuc);
-
-	intel_pmu_lbr_filter(cpuc);
+	x86_pmu.lbr_read(cpuc, bs);
+	intel_pmu_lbr_filter(cpuc, bs);
 }
 
 /*
@@ -1402,7 +1402,7 @@ static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
  * in PERF_SAMPLE_BRANCH_STACK sample may vary.
  */
 static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	u64 from, to;
 	int br_sel = cpuc->br_sel;
@@ -1414,11 +1414,11 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
-	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+	for (i = 0; i < bs->nr; i++) {
 
-		from = cpuc->lbr_entries[i].from;
-		to = cpuc->lbr_entries[i].to;
-		type = cpuc->lbr_entries[i].type;
+		from = bs->entries[i].from;
+		to = bs->entries[i].to;
+		type = bs->entries[i].type;
 
 		/*
 		 * Parse the branch type recorded in LBR_x_INFO MSR.
@@ -1430,9 +1430,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
 			type = arch_lbr_br_type_map[type] | to_plm;
 		} else
-			type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+			type = branch_type(from, to, bs->entries[i].abort);
 		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-			if (cpuc->lbr_entries[i].in_tx)
+			if (bs->entries[i].in_tx)
 				type |= X86_BR_IN_TX;
 			else
 				type |= X86_BR_NO_TX;
@@ -1440,25 +1440,25 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 
 		/* if type does not correspond, then discard */
 		if (type == X86_BR_NONE || (br_sel & type) != type) {
-			cpuc->lbr_entries[i].from = 0;
+			bs->entries[i].from = 0;
 			compress = true;
 		}
 
 		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
-			cpuc->lbr_entries[i].type = common_branch_type(type);
+			bs->entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)
 		return;
 
 	/* remove all entries with from=0 */
-	for (i = 0; i < cpuc->lbr_stack.nr; ) {
-		if (!cpuc->lbr_entries[i].from) {
+	for (i = 0; i < bs->nr; ) {
+		if (!bs->entries[i].from) {
 			j = i;
-			while (++j < cpuc->lbr_stack.nr)
-				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-			cpuc->lbr_stack.nr--;
-			if (!cpuc->lbr_entries[i].from)
+			while (++j < bs->nr)
+				bs->entries[j-1] = bs->entries[j];
+			bs->nr--;
+			if (!bs->entries[i].from)
 				continue;
 		}
 		i++;
@@ -1476,8 +1476,8 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 	else
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
-	intel_pmu_store_lbr(cpuc, lbr);
-	intel_pmu_lbr_filter(cpuc);
+	intel_pmu_store_lbr(cpuc, lbr, &cpuc->lbr_stack);
+	intel_pmu_lbr_filter(cpuc, &cpuc->lbr_stack);
 }
 
 /*
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e..9700fb1f47c3 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -852,7 +852,7 @@ struct x86_pmu {
 	unsigned int	lbr_br_type:1;
 
 	void		(*lbr_reset)(void);
-	void		(*lbr_read)(struct cpu_hw_events *cpuc);
+	void		(*lbr_read)(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 	void		(*lbr_save)(void *ctx);
 	void		(*lbr_restore)(void *ctx);
 
@@ -1345,11 +1345,11 @@ void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
-void intel_pmu_lbr_read(void);
+void intel_pmu_lbr_read(struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 void intel_pmu_lbr_save(void *ctx);