Re: [PATCH V2 1/1] perf/x86: Add Intel power cstate PMUs support

From: Stephane Eranian
Date: Thu Aug 06 2015 - 14:14:24 EST


On Thu, Aug 6, 2015 at 8:44 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Mon, Jul 27, 2015 at 04:46:16AM -0400, kan.liang@xxxxxxxxx wrote:
>
> As a general comment; this thing is unreadable. Far too much macro foo
> to instantiate the different PMUs.
>
>> +struct perf_power_cstate_event_msr {
>> + int id;
>> + u64 msr;
>> +};
>> +
>> +enum perf_power_cstate_id {
>> + /*
>> + * power_cstate events, generalized by the kernel:
>> + */
>> + PERF_POWER_CORE_C1_RES = 0,
>> + PERF_POWER_CORE_C3_RES,
>> + PERF_POWER_CORE_C6_RES,
>> + PERF_POWER_CORE_C7_RES,
>
> These are two different PMUs, why are they in the same enum space?
>
>> + PERF_POWER_PKG_C2_RES,
>> + PERF_POWER_PKG_C3_RES,
>> + PERF_POWER_PKG_C6_RES,
>> + PERF_POWER_PKG_C7_RES,
>> + PERF_POWER_PKG_C8_RES,
>> + PERF_POWER_PKG_C9_RES,
>> + PERF_POWER_PKG_C10_RES,
>> +
>> + PERF_POWER_CSTATE_EVENT_MAX, /* non-ABI */
>> +};
>> +
>> +struct power_cstate_pmu {
>> + struct intel_power_cstate_type *power_cstate_type;
>> + struct pmu *pmu;
>> +};
>> +
>> +
>> +static struct intel_power_cstate_type *empty_power_cstate[] = { NULL, };
>> +struct intel_power_cstate_type **power_cstate = empty_power_cstate;
>> +
>> +static struct perf_power_cstate_event_msr power_cstate_events[] = {
>> + { PERF_POWER_CORE_C1_RES, MSR_CORE_C1_RES },
>> + { PERF_POWER_CORE_C3_RES, MSR_CORE_C3_RESIDENCY },
>> + { PERF_POWER_CORE_C6_RES, MSR_CORE_C6_RESIDENCY },
>> + { PERF_POWER_CORE_C7_RES, MSR_CORE_C7_RESIDENCY },
>> + { PERF_POWER_PKG_C2_RES, MSR_PKG_C2_RESIDENCY },
>> + { PERF_POWER_PKG_C3_RES, MSR_PKG_C3_RESIDENCY },
>> + { PERF_POWER_PKG_C6_RES, MSR_PKG_C6_RESIDENCY },
>> + { PERF_POWER_PKG_C7_RES, MSR_PKG_C7_RESIDENCY },
>> + { PERF_POWER_PKG_C8_RES, MSR_PKG_C8_RESIDENCY },
>> + { PERF_POWER_PKG_C9_RES, MSR_PKG_C9_RESIDENCY },
>> + { PERF_POWER_PKG_C10_RES, MSR_PKG_C10_RESIDENCY },
>> +};
>
> What's the point of the first entry? You already know which index it is,
> no point in storing that again.
>
>> +
>> +EVENT_ATTR_STR(c1-residency, power_core_c1_res, "event=0x00");

I would not use an event code of 0. Start at 0x1

>> +EVENT_ATTR_STR(c3-residency, power_core_c3_res, "event=0x01");
>> +EVENT_ATTR_STR(c6-residency, power_core_c6_res, "event=0x02");
>> +EVENT_ATTR_STR(c7-residency, power_core_c7_res, "event=0x03");
>
>> +EVENT_ATTR_STR(c2-residency, power_pkg_c2_res, "event=0x04");
>
> I would very much expect this thing to start counting at 0 again.
>
>> +EVENT_ATTR_STR(c3-residency, power_pkg_c3_res, "event=0x05");
>> +EVENT_ATTR_STR(c6-residency, power_pkg_c6_res, "event=0x06");
>> +EVENT_ATTR_STR(c7-residency, power_pkg_c7_res, "event=0x07");
>> +EVENT_ATTR_STR(c8-residency, power_pkg_c8_res, "event=0x08");
>> +EVENT_ATTR_STR(c9-residency, power_pkg_c9_res, "event=0x09");
>> +EVENT_ATTR_STR(c10-residency, power_pkg_c10_res, "event=0x0a");
>> +
>> +static cpumask_t power_cstate_core_cpu_mask;
>
> That one typically does not need a cpumask.
>
You need to pick one CPU out of the multi-core. But it is for client parts thus
there is only one socket. At least this is my understanding.

>> +static cpumask_t power_cstate_pkg_cpu_mask;
>
>> +static int power_cstate_pmu_event_init(struct perf_event *event)
>> +{
>> + u64 cfg = event->attr.config;
>> + int ret = 0;
>> +
>> + if (event->attr.type != event->pmu->type)
>> + return -ENOENT;
>> +
>> + /*
>> + * check event is known (determines counter)
>> + */
>> + if (cfg >= PERF_POWER_CSTATE_EVENT_MAX)
>> + return -EINVAL;
>> +
>> + /* unsupported modes and filters */
>> + if (event->attr.exclude_user ||
>> + event->attr.exclude_kernel ||
>> + event->attr.exclude_hv ||
>> + event->attr.exclude_idle ||
>> + event->attr.exclude_host ||
>> + event->attr.exclude_guest ||
>> + event->attr.sample_period) /* no sampling */
>> + return -EINVAL;
>
> Same as with the MSR thing, but worse. What happens if we hand a config
> value for the 'wrong' PMU type?
>
I think that's taken care of by the first test in the function.

> What happens if we hand a pkg c10 config to one that doesn't have c10?
>
>> + /* must be done before validate_group */
>> + event->hw.event_base = power_cstate_events[cfg].msr;
>> + event->hw.config = cfg;
>> + event->hw.idx = -1;
>> +
>> + return ret;
>> +}
>> +
>> +static inline u64 power_cstate_pmu_read_counter(struct perf_event *event)
>> +{
>> + u64 val;
>> +
>> + rdmsrl_safe(event->hw.event_base, &val);
>
> No, at this point you had better know if that MSR exists or not. If it
> does not the event should not exist.
>
>> + return val;
>> +}
>
>
I understand that these metrics are useful and needed however if I
look at the broader
picture I see many PMUs doing similar things or appearing different
when they are actually
very close. It would be nice to have a more unified approach. You have
RAPL (client, server)
which appears as the power PMU. You have the PCU uncore on servers
which also provides
C-state residency info. Yet, all these appear differently and expose
events with different names.
I think we could benefit from a more unifie approach here such that
you would be able to do

$ perf stat -a -e power/c6-residency/, power/energy-pkg/

on client and server without having to change the pmu name of the event names.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/