Re: [PATCH v7 10/10] perf/x86/rapl: Add core energy counter support for AMD CPUs

From: Dhananjay Ugwekar
Date: Mon Jan 13 2025 - 01:35:02 EST


On 1/12/2025 7:12 PM, Koichiro Den wrote:
> On Fri, Nov 15, 2024 at 06:08:06AM GMT, Dhananjay Ugwekar wrote:
>> Add a new "power_core" PMU and "energy-core" event for monitoring
>> energy consumption by each individual core. The existing energy-cores
>> event aggregates the energy consumption of CPU cores at the package level.
>> This new event aligns with the AMD's per-core energy counters.
>>
>> Tested the package level and core level PMU counters with workloads
>> pinned to different CPUs.
>>
>> Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa
>> machine:
>>
>> $ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s
>> stress-ng: info: [21250] setting to a 5 second run per stressor
>> stress-ng: info: [21250] dispatching hogs: 1 matrix
>> stress-ng: info: [21250] successful run completed in 5.00s
>>
>> Performance counter stats for 'system wide':
>>
>> S0-D0-C0 1 0.00 Joules power_core/energy-core/
>> S0-D0-C1 1 0.00 Joules power_core/energy-core/
>> S0-D0-C2 1 0.00 Joules power_core/energy-core/
>> S0-D0-C3 1 0.00 Joules power_core/energy-core/
>> S0-D0-C4 1 8.43 Joules power_core/energy-core/
>> S0-D0-C5 1 0.00 Joules power_core/energy-core/
>> S0-D0-C6 1 0.00 Joules power_core/energy-core/
>> S0-D0-C7 1 0.00 Joules power_core/energy-core/
>> S0-D1-C8 1 0.00 Joules power_core/energy-core/
>> S0-D1-C9 1 0.00 Joules power_core/energy-core/
>> S0-D1-C10 1 0.00 Joules power_core/energy-core/
>>
>> Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@xxxxxxx>
>> Reviewed-by: Gautham R. Shenoy <gautham.shenoy@xxxxxxx>
>> ---
>> arch/x86/events/rapl.c | 185 +++++++++++++++++++++++++++++++++--------
>> 1 file changed, 152 insertions(+), 33 deletions(-)
>>
>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>> index 6e51386ff91f..e9be1f31163d 100644
>> --- a/arch/x86/events/rapl.c
>> +++ b/arch/x86/events/rapl.c
>> @@ -39,6 +39,10 @@
>> * event: rapl_energy_psys
>> * perf code: 0x5
>> *
>> + * core counter: consumption of a single physical core
>> + * event: rapl_energy_core (power_core PMU)
>> + * perf code: 0x1
>> + *
>> * We manage those counters as free running (read-only). They may be
>> * use simultaneously by other tools, such as turbostat.
>> *
>> @@ -81,6 +85,10 @@ enum perf_rapl_pkg_events {
>> NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
>> };
>>
>> +#define PERF_RAPL_CORE 0 /* single core */
>> +#define PERF_RAPL_CORE_EVENTS_MAX 1
>> +#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
>> +
>> static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
>> "pp0-core",
>> "package",
>> @@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst
>> "psys",
>> };
>>
>> +static const char *const rapl_core_domain_name __initconst = "core";
>> +
>> /*
>> * event code: LSB 8 bits, passed in attr->config
>> * any other bit is reserved
>> @@ -141,14 +151,18 @@ enum rapl_unit_quirk {
>>
>> struct rapl_model {
>> struct perf_msr *rapl_pkg_msrs;
>> + struct perf_msr *rapl_core_msrs;
>> unsigned long pkg_events;
>> + unsigned long core_events;
>> unsigned int msr_power_unit;
>> enum rapl_unit_quirk unit_quirk;
>> };
>>
>> /* 1/2^hw_unit Joule */
>> static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
>> +static int rapl_core_hw_unit __read_mostly;
>> static struct rapl_pmus *rapl_pmus_pkg;
>> +static struct rapl_pmus *rapl_pmus_core;
>> static u64 rapl_timer_ms;
>> static struct rapl_model *rapl_model;
>>
>> @@ -156,14 +170,23 @@ static struct rapl_model *rapl_model;
>> * Helper function to get the correct topology id according to the
>> * RAPL PMU scope.
>> */
>> -static inline unsigned int get_rapl_pmu_idx(int cpu)
>> -{ /*
>> +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
>> +{
>> + /*
>> * Returns unsigned int, which converts the '-1' return value
>> * (for non-existent mappings in topology map) to UINT_MAX, so
>> * the error check in the caller is simplified.
>> */
>> - return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
>> - topology_logical_die_id(cpu);
>> + switch (scope) {
>> + case PERF_PMU_SCOPE_PKG:
>> + return topology_logical_package_id(cpu);
>> + case PERF_PMU_SCOPE_DIE:
>> + return topology_logical_die_id(cpu);
>> + case PERF_PMU_SCOPE_CORE:
>> + return topology_logical_core_id(cpu);
>> + default:
>> + return -EINVAL;
>> + }
>> }
>>
>> static inline u64 rapl_read_counter(struct perf_event *event)
>> @@ -173,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
>> return raw;
>> }
>>
>> -static inline u64 rapl_scale(u64 v, int cfg)
>> +static inline u64 rapl_scale(u64 v, struct perf_event *event)
>> {
>> - if (cfg > NR_RAPL_PKG_DOMAINS) {
>> - pr_warn("Invalid domain %d, failed to scale data\n", cfg);
>> - return v;
>> - }
>> + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
>> +
>> + if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
>> + hw_unit = rapl_core_hw_unit;
>> +
>> /*
>> * scale delta to smallest unit (1/2^32)
>> * users must then scale back: count * 1/(1e9*2^32) to get Joules
>> * or use ldexp(count, -32).
>> * Watts = Joules/Time delta
>> */
>> - return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
>> + return v << (32 - hw_unit);
>> }
>>
>> static u64 rapl_event_update(struct perf_event *event)
>> @@ -212,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
>> delta = (new_raw_count << shift) - (prev_raw_count << shift);
>> delta >>= shift;
>>
>> - sdelta = rapl_scale(delta, event->hw.config);
>> + sdelta = rapl_scale(delta, event);
>>
>> local64_add(sdelta, &event->count);
>>
>> @@ -341,13 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
>> static int rapl_pmu_event_init(struct perf_event *event)
>> {
>> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
>> - int bit, ret = 0;
>> + int bit, rapl_pmus_scope, ret = 0;
>> struct rapl_pmu *rapl_pmu;
>> unsigned int rapl_pmu_idx;
>> + struct rapl_pmus *rapl_pmus;
>>
>> - /* only look at RAPL events */
>> - if (event->attr.type != rapl_pmus_pkg->pmu.type)
>> - return -ENOENT;
>> + /* unsupported modes and filters */
>> + if (event->attr.sample_period) /* no sampling */
>> + return -EINVAL;
>
> Hi Dhananjay,
>
> On linux-next, since this commit, it seems that simple sampling with 'perf
> record -- <command>' (i.e. the default event), 'perf top' etc. can
> unexpectedly fail because rapl_pmu_event_init() now returns -EINVAL instead
> of -ENOENT even in such cases of a type mismatch. I observed that this
> prevents evsel__fallback() from falling back to cpu-clock or task-clock.
>
> Should we reorder the checks in rapl_pmu_event_init() to allow an early
> return with -ENOENT in such cases, as shown below? I'm not very familiar
> with this area and I might be missing something. I'd appreciate it if you
> could share your thoughts.
>
> --- a/arch/x86/events/rapl.c
> +++ b/arch/x86/events/rapl.c
> @@ -370,17 +370,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
> unsigned int rapl_pmu_idx;
> struct rapl_pmus *rapl_pmus;
>
> - /* unsupported modes and filters */
> - if (event->attr.sample_period) /* no sampling */
> - return -EINVAL;
> -
> - /* check only supported bits are set */
> - if (event->attr.config & ~RAPL_EVENT_MASK)
> - return -EINVAL;
> -
> - if (event->cpu < 0)
> - return -EINVAL;
> -
> rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
> if (!rapl_pmus)
> return -EINVAL;
> @@ -411,6 +400,17 @@ static int rapl_pmu_event_init(struct perf_event *event)
> } else
> return -EINVAL;
>
> + /* unsupported modes and filters */
> + if (event->attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + /* check only supported bits are set */
> + if (event->attr.config & ~RAPL_EVENT_MASK)
> + return -EINVAL;
> +
> + if (event->cpu < 0)
> + return -EINVAL;
> +
> /* check event supported */
> if (!(rapl_pmus->cntr_mask & (1 << bit)))
> return -EINVAL;
>
> Thanks.
>
> -Koichiro

Hello Koichiro,

I tried reproducing the issue you mentioned using "sudo perf record -- sleep 2" and
"sudo perf top" commands on an AMD EPYC system, the commands worked successfully.
Can you please mention which system and which exact commands you're
running that reproduced the issue?

My analysis is, if we are running "perf record/top" with the default event, we would
not enter the rapl_pmu_event_init() function, which renders the reordering of the type
checks irrelevant. Regardless, please let me know how I can reproduce the issue.

Thanks,
Dhananjay