Re: [PATCH v2 4/7] perf, x86: large PEBS interrupt threshold

From: Peter Zijlstra
Date: Tue Jul 15 2014 - 07:39:03 EST


On Tue, Jul 15, 2014 at 04:58:56PM +0800, Yan, Zheng wrote:
> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/perf_event.h | 1 +
> arch/x86/kernel/cpu/perf_event_intel_ds.c | 98 +++++++++++++++++++++---------
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 --
> 3 files changed, 71 insertions(+), 33 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index d8165f3..cb7cda8 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -450,6 +450,7 @@ struct x86_pmu {
> struct event_constraint *pebs_constraints;
> void (*pebs_aliases)(struct perf_event *event);
> int max_pebs_events;
> + bool multi_pebs;

This needs to die.
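
Whether the large threshold is usable is a property of the individual event, not of the PMU, so a hw_perf_event flag should be enough. Rough sketch (the flag name matches the snippet further down; the value is made up):

	/* arch/x86/kernel/cpu/perf_event.h -- sketch only, flag value made up */
	#define PERF_X86_EVENT_PEBS_RELOAD	0x0040 /* auto-reload, large PEBS threshold ok */

	/* and set it where hwc->autoreload is currently decided */
	if (!event->attr.freq)
		event->hw.flags |= PERF_X86_EVENT_PEBS_RELOAD;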

> /*
> * Intel LBR
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 1db4ce5..e17eb5b 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -11,7 +11,7 @@
> #define BTS_RECORD_SIZE 24
>
> #define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
> -#define PEBS_BUFFER_SIZE PAGE_SIZE
> +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)

See: http://lkml.kernel.org/r/alpine.DEB.2.02.1406301600460.26302@xxxxxxxxxxxxxxxxxxxxxxxxx

Also talk about why 64k; mention NMI duration/processing overhead, etc.
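
Back of the envelope, assuming ~192 byte HSW records: 64k holds roughly 340 records, so a threshold near the top of the buffer means a few hundred samples per PMI instead of one, which is also a few hundred samples worth of processing inside a single NMI. That trade-off is what the changelog should spell out.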

> @@ -708,14 +705,29 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
> return &emptyconstraint;
> }
>
> +/*
> + * Flags PEBS can handle without a PMI.
> + *
> + * TID can only be handled by flushing at context switch.
> + */
> +#define PEBS_FREERUNNING_FLAGS \
> + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
> + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
> + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> + PERF_SAMPLE_TRANSACTION)
> +
> void intel_pmu_pebs_enable(struct perf_event *event)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> struct hw_perf_event *hwc = &event->hw;
> + struct debug_store *ds = cpuc->ds;
> + u64 threshold;
> + bool first_pebs;

flip those two lines (longest declaration first)

>
> hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
> hwc->autoreload = !event->attr.freq;
>
> + first_pebs = !(cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
> cpuc->pebs_enabled |= 1ULL << hwc->idx;
>
> if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
> @@ -723,6 +735,20 @@ void intel_pmu_pebs_enable(struct perf_event *event)
> else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
> cpuc->pebs_enabled |= 1ULL << 63;
>
> + /*
> + * When the event is constrained enough we can use a larger
> + * threshold and run the event with less frequent PMI.
> + */
> + if (x86_pmu.multi_pebs && hwc->autoreload &&
> + !(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) {
> + threshold = ds->pebs_absolute_maximum -
> + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
> + } else {
> + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
> + }

threshold = 1;
if ((hwc->flags & PERF_X86_EVENT_PEBS_RELOAD) &&
    !(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
	threshold = x86_pmu.max_pebs_events;

threshold = ds->pebs_buffer_base + threshold * x86_pmu.pebs_record_size;

> + if (first_pebs || ds->pebs_interrupt_threshold > threshold)
> + ds->pebs_interrupt_threshold = threshold;
> +
> /* Use auto-reload if possible to save a MSR write in the PMI */
> if (hwc->autoreload)
> ds->pebs_event_reset[hwc->idx] =

> @@ -880,7 +907,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
> u64 sample_type;
> int fll, fst;
>
> - if (!intel_pmu_save_and_restart(event))
> + if (first_record && !intel_pmu_save_and_restart(event))
> return;
>
> fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
> @@ -956,8 +983,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
> if (has_branch_stack(event))
> data.br_stack = &cpuc->lbr_stack;
>
> - if (perf_event_overflow(event, &data, &regs))
> - x86_pmu_stop(event, 0);
> + if (first_record) {
> + if (perf_event_overflow(event, &data, &regs))
> + x86_pmu_stop(event, 0);
> + } else {
> + struct perf_output_handle handle;
> + struct perf_event_header header;
> +
> + perf_prepare_sample(&header, &data, event, &regs);
> +
> + if (perf_output_begin(&handle, event, header.size))
> + return;
> +
> + perf_output_sample(&handle, &header, &data, event);
> +
> + perf_output_end(&handle);
> + }

That is disgusting, have a look at drain_bts_buffer() and try again.
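
For comparison, the BTS path sets the output up once and streams every record through it. Rough sketch of that shape here, assuming __intel_pmu_pebs_event() gets handed the whole range of records for the event (per-record data/regs conversion and the final overflow handling elided):

	struct perf_output_handle handle;
	struct perf_event_header header;
	int n = (top - at) / x86_pmu.pebs_record_size;

	perf_prepare_sample(&header, &data, event, &regs);

	/* reserve space for all n records in one go */
	if (perf_output_begin(&handle, event, header.size * n))
		return;

	for (; at < top; at += x86_pmu.pebs_record_size) {
		/* fill @data / @regs from the PEBS record at @at */
		perf_output_sample(&handle, &header, &data, event);
	}

	perf_output_end(&handle);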

> }
>
> static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> @@ -998,17 +1039,18 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
> at += n - 1;
>
> - __intel_pmu_pebs_event(event, iregs, at);
> + __intel_pmu_pebs_event(event, iregs, at, true);
> }
>
> static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> struct debug_store *ds = cpuc->ds;
> - struct perf_event *event = NULL;
> + struct perf_event *event;
> void *at, *top;
> u64 status = 0;
> int bit;
> + bool multi_pebs, first_record;

These should not be needed, but it's also the wrong place if they were.

> if (!x86_pmu.pebs_active)
> return;

> @@ -1042,17 +1086,15 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
>
> if (!event->attr.precise_ip)
> continue;
> -
> - if (__test_and_set_bit(bit, (unsigned long *)&status))
> - continue;
> -
> - break;
> + if (!__test_and_set_bit(bit, (unsigned long *)&status)) {
> + first_record = true;
> + } else {
> + if (!multi_pebs)
> + continue;
> + first_record = false;
> + }
> + __intel_pmu_pebs_event(event, iregs, at, first_record);
> }
> -
> - if (!event || bit >= x86_pmu.max_pebs_events)
> - continue;
> -
> - __intel_pmu_pebs_event(event, iregs, at);

Distinct lack of properly handling the multi overflow case.
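
One way to get there (rough sketch only; the __intel_pmu_pebs_event() prototype below is invented): count the records per event in a first pass, then have the per-event path emit n-1 plain samples and treat only the last record as the real overflow. Records with more than one status bit set still need a policy of their own.

	short counts[MAX_PEBS_EVENTS] = { 0, };
	void *base = (void *)(unsigned long)ds->pebs_buffer_base;
	void *at;
	int bit;

	/* first pass: per-event record counts */
	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
		struct pebs_record_nhm *p = at;
		u64 pebs_status = p->status & cpuc->pebs_enabled &
				  ((1ULL << x86_pmu.max_pebs_events) - 1);

		for_each_set_bit(bit, (unsigned long *)&pebs_status,
				 x86_pmu.max_pebs_events)
			counts[bit]++;
	}

	/* second pass: one call per event, covering all of its records */
	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
		if (!counts[bit])
			continue;

		/* invented prototype: hand over the full range plus count */
		__intel_pmu_pebs_event(cpuc->events[bit], iregs, base, top,
				       bit, counts[bit]);
	}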

> }
> }
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index d6d5fcf..430f1ad 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -184,10 +184,6 @@ void intel_pmu_lbr_reset(void)
> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> -
> - if (!x86_pmu.lbr_nr)
> - return;
> -
> /*
> * It is necessary to flush the stack on context switch. This happens
> * when the branch stack does not tag its entries with the pid of the
> @@ -408,7 +404,6 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
>
> if (br_type & PERF_SAMPLE_BRANCH_COND)
> mask |= X86_BR_JCC;
> -
> /*
> * stash actual user request into reg, it may
> * be used by fixup code for some CPU

WTF? How are these LBR changes related to this patch?
