Re: [RFC PATCH] perf: Add load latency monitoring on Intel Nehalem/Westmere

From: Stephane Eranian
Date: Wed Dec 22 2010 - 05:08:13 EST


Hi,

On Wed, Dec 22, 2010 at 10:00 AM, Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> wrote:
> On Wed, 2010-12-22 at 16:12 +0800, Lin Ming wrote:
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index ed6ff11..2a02529 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>> @@ -197,18 +197,25 @@ struct extra_reg {
>>         unsigned int            extra_shift;
>>         u64                     config_mask;
>>         u64                     valid_mask;
>> +       u64                     flags;
>>  };
>>
>> -#define EVENT_EXTRA_REG(e, ms, m, vm, es) {    \
>> +#define EVENT_EXTRA_REG(e, ms, m, vm, es, f) { \
>>         .event = (e),           \
>>         .msr = (ms),            \
>>         .config_mask = (m),     \
>>         .valid_mask = (vm),     \
>>         .extra_shift = (es),    \
>> +       .flags = (f),   \
>>         }
>>  #define INTEL_EVENT_EXTRA_REG(event, msr, vm, es)      \
>> -       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es)
>> -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0)
>> +       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es, 0)
>> +#define INTEL_EVENT_EXTRA_REG2(event, msr, vm, es, f)  \
>> +       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
>> +                       ARCH_PERFMON_EVENTSEL_UMASK, vm, es, f)
>> +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0, 0)
>
> You'll need to increment MAX_EXTRA_REGS to 3 I think.
>
>> +#define EXTRA_REG_LD_LAT 0x1
>
> I'm not quite sure we actually need the whole flags business.
>
>>  union perf_capabilities {
>>         struct {
>> @@ -384,6 +391,11 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
>>                 if (extra & ~er->valid_mask)
>>                         return -EINVAL;
>>                 event->hw.extra_config = extra;
>> +               event->hw.extra_flags = er->flags;
>> +
>> +               /* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
>> +               if ((er->flags & EXTRA_REG_LD_LAT) && extra < 3)
>> +                       event->hw.extra_config = 3;
>
>         if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
>                 event->hw.extra_config = 3;
>
>>                 break;
>>         }
>>         return 0;
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>> index bc4afb1..7e2b873 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>> @@ -89,6 +89,8 @@ static struct event_constraint intel_nehalem_event_constraints[] =
>>  static struct extra_reg intel_nehalem_extra_regs[] =
>>  {
>>         INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE */
>> +       /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
>> +       INTEL_EVENT_EXTRA_REG2(0x100b, 0x3f6, 0xffff, 32, EXTRA_REG_LD_LAT),
>>         EVENT_EXTRA_END
>>  };
>
> Maybe use the MSR names instead of the numbers.
>
>
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> index b7dcd9f..d008c40 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> @@ -376,6 +376,7 @@ static struct event_constraint intel_core_pebs_events[] = {
>>  };
>>
>>  static struct event_constraint intel_nehalem_pebs_events[] = {
>> +       PEBS_EVENT_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
>>         PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
>>         PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
>>         PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
>> @@ -414,6 +415,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
>>         hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
>>
>>         cpuc->pebs_enabled |= 1ULL << hwc->idx;
>> +       if (hwc->extra_flags & EXTRA_REG_LD_LAT)
>> +               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
>
>         if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
>                 cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
>
>>         WARN_ON_ONCE(cpuc->enabled);
>>
>>         if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
>> @@ -426,6 +429,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
>>         struct hw_perf_event *hwc = &event->hw;
>>
>>         cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
>> +       if (hwc->extra_flags & EXTRA_REG_LD_LAT)
>> +               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
>
>         if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
>                 cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
>
>>         if (cpuc->enabled)
>>                 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index d24d9ab..38bffa4 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -541,6 +541,7 @@ struct hw_perf_event {
>>                         int                     last_cpu;
>>                         unsigned int            extra_reg;
>>                         u64                     extra_config;
>> +                       u64                     extra_flags;
>>                 };
>>                 struct { /* software */
>>                         struct hrtimer  hrtimer;
>>
>
> Which then also obviates the need for this extra field.
>
> You also need some extra goo in intel_pmu_drain_pebs_nhm(), we can
> already use the PERF_SAMPLE_ADDR for the linear data address provided by
> the pebs-ll thing, and we might need to add:
>
>  PERF_SAMPLE_LATENCY -- Stephane said other archs can also use this
>
Extracting only the instruction address is not very useful. You need
both the instruction and data addresses, plus the latency and the data
source. As Peter pointed out, you can use PERF_SAMPLE_ADDR for the data
address.

True. And also we would need a PERF_SAMPLE_DATA_SRC to extract
the data source information. Other archs also have that.

Note that PEBS-Load latency needs the IP+1 correction. It points to the
instruction address after the load/lfetch. But I suspect your patch already
takes care of that.

> Not quite sure what to do for the source bits, POWER also has some extra
> bits, but I'm not sure they qualify as purely source bits. And
> interpreting them is going to be inherently arch specific, which
> sucks :/
>
>
Yes, I think there is more to it than just data source, unfortunately.
If you want to avoid returning an opaque u64 (PERF_SAMPLE_EXTRA), then
you need to break it down: PERF_SAMPLE_DATA_SRC, PERF_SAMPLE_XX
and so on.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/