Re: [PATCH v7 10/10] perf/x86/rapl: Add core energy counter support for AMD CPUs

From: Dhananjay Ugwekar
Date: Mon Jan 20 2025 - 06:43:17 EST


On 1/15/2025 7:53 PM, Koichiro Den wrote:
> On Mon, Jan 13, 2025 at 12:04:43PM GMT, Dhananjay Ugwekar wrote:
>> On 1/12/2025 7:12 PM, Koichiro Den wrote:
>>> On Fri, Nov 15, 2024 at 06:08:06AM GMT, Dhananjay Ugwekar wrote:
>>>> Add a new "power_core" PMU and "energy-core" event for monitoring
>>>> energy consumption by each individual core. The existing energy-cores
>>>> event aggregates the energy consumption of CPU cores at the package level.
>>>> This new event aligns with AMD's per-core energy counters.
>>>>
>>>> Tested the package level and core level PMU counters with workloads
>>>> pinned to different CPUs.
>>>>
>>>> Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa
>>>> machine:
>>>>
>>>> $ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s
>>>> stress-ng: info: [21250] setting to a 5 second run per stressor
>>>> stress-ng: info: [21250] dispatching hogs: 1 matrix
>>>> stress-ng: info: [21250] successful run completed in 5.00s
>>>>
>>>> Performance counter stats for 'system wide':
>>>>
>>>> S0-D0-C0 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C1 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C2 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C3 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C4 1 8.43 Joules power_core/energy-core/
>>>> S0-D0-C5 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C6 1 0.00 Joules power_core/energy-core/
>>>> S0-D0-C7 1 0.00 Joules power_core/energy-core/
>>>> S0-D1-C8 1 0.00 Joules power_core/energy-core/
>>>> S0-D1-C9 1 0.00 Joules power_core/energy-core/
>>>> S0-D1-C10 1 0.00 Joules power_core/energy-core/
>>>>
>>>> Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@xxxxxxx>
>>>> Reviewed-by: Gautham R. Shenoy <gautham.shenoy@xxxxxxx>
>>>> ---
>>>> arch/x86/events/rapl.c | 185 +++++++++++++++++++++++++++++++++--------
>>>> 1 file changed, 152 insertions(+), 33 deletions(-)
>>>>
>>>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>>>> index 6e51386ff91f..e9be1f31163d 100644
>>>> --- a/arch/x86/events/rapl.c
>>>> +++ b/arch/x86/events/rapl.c
>>>> @@ -39,6 +39,10 @@
>>>> * event: rapl_energy_psys
>>>> * perf code: 0x5
>>>> *
>>>> + * core counter: consumption of a single physical core
>>>> + * event: rapl_energy_core (power_core PMU)
>>>> + * perf code: 0x1
>>>> + *
>>>> * We manage those counters as free running (read-only). They may be
>>>> * use simultaneously by other tools, such as turbostat.
>>>> *
>>>> @@ -81,6 +85,10 @@ enum perf_rapl_pkg_events {
>>>> NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
>>>> };
>>>>
>>>> +#define PERF_RAPL_CORE 0 /* single core */
>>>> +#define PERF_RAPL_CORE_EVENTS_MAX 1
>>>> +#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
>>>> +
>>>> static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
>>>> "pp0-core",
>>>> "package",
>>>> @@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst
>>>> "psys",
>>>> };
>>>>
>>>> +static const char *const rapl_core_domain_name __initconst = "core";
>>>> +
>>>> /*
>>>> * event code: LSB 8 bits, passed in attr->config
>>>> * any other bit is reserved
>>>> @@ -141,14 +151,18 @@ enum rapl_unit_quirk {
>>>>
>>>> struct rapl_model {
>>>> struct perf_msr *rapl_pkg_msrs;
>>>> + struct perf_msr *rapl_core_msrs;
>>>> unsigned long pkg_events;
>>>> + unsigned long core_events;
>>>> unsigned int msr_power_unit;
>>>> enum rapl_unit_quirk unit_quirk;
>>>> };
>>>>
>>>> /* 1/2^hw_unit Joule */
>>>> static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
>>>> +static int rapl_core_hw_unit __read_mostly;
>>>> static struct rapl_pmus *rapl_pmus_pkg;
>>>> +static struct rapl_pmus *rapl_pmus_core;
>>>> static u64 rapl_timer_ms;
>>>> static struct rapl_model *rapl_model;
>>>>
>>>> @@ -156,14 +170,23 @@ static struct rapl_model *rapl_model;
>>>> * Helper function to get the correct topology id according to the
>>>> * RAPL PMU scope.
>>>> */
>>>> -static inline unsigned int get_rapl_pmu_idx(int cpu)
>>>> -{ /*
>>>> +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
>>>> +{
>>>> + /*
>>>> * Returns unsigned int, which converts the '-1' return value
>>>> * (for non-existent mappings in topology map) to UINT_MAX, so
>>>> * the error check in the caller is simplified.
>>>> */
>>>> - return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
>>>> - topology_logical_die_id(cpu);
>>>> + switch (scope) {
>>>> + case PERF_PMU_SCOPE_PKG:
>>>> + return topology_logical_package_id(cpu);
>>>> + case PERF_PMU_SCOPE_DIE:
>>>> + return topology_logical_die_id(cpu);
>>>> + case PERF_PMU_SCOPE_CORE:
>>>> + return topology_logical_core_id(cpu);
>>>> + default:
>>>> + return -EINVAL;
>>>> + }
>>>> }
>>>>
>>>> static inline u64 rapl_read_counter(struct perf_event *event)
>>>> @@ -173,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
>>>> return raw;
>>>> }
>>>>
>>>> -static inline u64 rapl_scale(u64 v, int cfg)
>>>> +static inline u64 rapl_scale(u64 v, struct perf_event *event)
>>>> {
>>>> - if (cfg > NR_RAPL_PKG_DOMAINS) {
>>>> - pr_warn("Invalid domain %d, failed to scale data\n", cfg);
>>>> - return v;
>>>> - }
>>>> + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
>>>> +
>>>> + if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
>>>> + hw_unit = rapl_core_hw_unit;
>>>> +
>>>> /*
>>>> * scale delta to smallest unit (1/2^32)
>>>> * users must then scale back: count * 1/(1e9*2^32) to get Joules
>>>> * or use ldexp(count, -32).
>>>> * Watts = Joules/Time delta
>>>> */
>>>> - return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
>>>> + return v << (32 - hw_unit);
>>>> }
>>>>
>>>> static u64 rapl_event_update(struct perf_event *event)
>>>> @@ -212,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
>>>> delta = (new_raw_count << shift) - (prev_raw_count << shift);
>>>> delta >>= shift;
>>>>
>>>> - sdelta = rapl_scale(delta, event->hw.config);
>>>> + sdelta = rapl_scale(delta, event);
>>>>
>>>> local64_add(sdelta, &event->count);
>>>>
>>>> @@ -341,13 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
>>>> static int rapl_pmu_event_init(struct perf_event *event)
>>>> {
>>>> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
>>>> - int bit, ret = 0;
>>>> + int bit, rapl_pmus_scope, ret = 0;
>>>> struct rapl_pmu *rapl_pmu;
>>>> unsigned int rapl_pmu_idx;
>>>> + struct rapl_pmus *rapl_pmus;
>>>>
>>>> - /* only look at RAPL events */
>>>> - if (event->attr.type != rapl_pmus_pkg->pmu.type)
>>>> - return -ENOENT;
>>>> + /* unsupported modes and filters */
>>>> + if (event->attr.sample_period) /* no sampling */
>>>> + return -EINVAL;
>>>
>>> Hi Dhananjay,
>>>
>>> On linux-next, since this commit, it seems that simple sampling with 'perf
>>> record -- <command>' (i.e. the default event), 'perf top', etc. can
>>> unexpectedly fail because rapl_pmu_event_init() now returns -EINVAL instead
>>> of -ENOENT even in the case of a type mismatch. I observed that this
>>> prevents evsel__fallback() from falling back to cpu-clock or task-clock.
>>>
>>> Should we reorder the checks in rapl_pmu_event_init() to allow an early
>>> return with -ENOENT in such cases, as shown below? I'm not very familiar
>>> with this area and I might be missing something. I'd appreciate it if you
>>> could share your thoughts.
>>>
>>> --- a/arch/x86/events/rapl.c
>>> +++ b/arch/x86/events/rapl.c
>>> @@ -370,17 +370,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>  	unsigned int rapl_pmu_idx;
>>>  	struct rapl_pmus *rapl_pmus;
>>>
>>> -	/* unsupported modes and filters */
>>> -	if (event->attr.sample_period) /* no sampling */
>>> -		return -EINVAL;
>>> -
>>> -	/* check only supported bits are set */
>>> -	if (event->attr.config & ~RAPL_EVENT_MASK)
>>> -		return -EINVAL;
>>> -
>>> -	if (event->cpu < 0)
>>> -		return -EINVAL;
>>> -
>>>  	rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
>>>  	if (!rapl_pmus)
>>>  		return -EINVAL;
>>> @@ -411,6 +400,17 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>  	} else
>>>  		return -EINVAL;
>>>
>>> +	/* unsupported modes and filters */
>>> +	if (event->attr.sample_period) /* no sampling */
>>> +		return -EINVAL;
>>> +
>>> +	/* check only supported bits are set */
>>> +	if (event->attr.config & ~RAPL_EVENT_MASK)
>>> +		return -EINVAL;
>>> +
>>> +	if (event->cpu < 0)
>>> +		return -EINVAL;
>>> +
>>>  	/* check event supported */
>>>  	if (!(rapl_pmus->cntr_mask & (1 << bit)))
>>>  		return -EINVAL;
>>>
>>> Thanks.
>>>
>>> -Koichiro
>>
>> Hello Koichiro,
>>
>> I tried reproducing the issue you mentioned by running "sudo perf record -- sleep 2" and
>> "sudo perf top" on an AMD EPYC system, and both commands worked successfully.
>> Can you please share which system you are using and the exact commands that
>> reproduce the issue?
>>
>> My analysis is that if we run "perf record/top" with the default event, we would
>> never enter rapl_pmu_event_init() at all, which would render the reordering of the
>> type checks irrelevant. Regardless, please let me know how I can reproduce the issue.
>>
>> Thanks,
>> Dhananjay
>
> Hi,
>
> Apologies for the delayed response, and thank you for your comment. I
> confirmed that just running "perf top" on a qemu instance reproduces it.
> The host CPU model is Intel Core i9-13900K, which is passed through to
> the guest.
>
> In my case, no PMU for PERF_TYPE_RAW is registered, but the rapl PMU is
> present. perf_init_event() then reaches the line marked "---->" below, and
> rapl_pmu_event_init() runs, returning -EINVAL before the type check.
>
> static struct pmu *perf_init_event(struct perf_event *event)
> {
> 	[...]
> 	if (pmu) {
> 		[...]
> 		goto unlock;
> 	}
>
> 	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
> ---->		ret = perf_try_init_event(pmu, event);
> 		if (!ret)
> 			goto unlock;
>
> 		if (ret != -ENOENT) {
> 			pmu = ERR_PTR(ret);
> 			goto unlock;
> 		}
> 	}
>
> I'll look into this a bit more on my side later and get back to you if
> something becomes clear.

Sorry for the delayed response. Can you please try the diff below and let me know
whether it fixes the issue?

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index d3bb3865c1b1..4952faf03e82 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event)
 	unsigned int rapl_pmu_idx;
 	struct rapl_pmus *rapl_pmus;
 
+	/* only look at RAPL events */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
 	/* unsupported modes and filters */
 	if (event->attr.sample_period) /* no sampling */
 		return -EINVAL;
@@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
 	rapl_pmus_scope = rapl_pmus->pmu.scope;
 
 	if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
-		/* only look at RAPL package events */
-		if (event->attr.type != rapl_pmus_pkg->pmu.type)
-			return -ENOENT;
-
 		cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
 		if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
 			return -EINVAL;
@@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
 		bit = cfg - 1;
 		event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
 	} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
-		/* only look at RAPL core events */
-		if (event->attr.type != rapl_pmus_core->pmu.type)
-			return -ENOENT;
-
 		cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
 		if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
 			return -EINVAL;

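For reference, below is a minimal sketch (a hypothetical example_pmu_event_init(),
not the actual rapl code) of the convention the diff above follows: an event_init
callback first rejects events that belong to a different PMU with -ENOENT, so that
the perf_try_init_event() loop you quoted keeps probing the remaining PMUs, and it
only returns -EINVAL for events that do target this PMU but are otherwise invalid.

/* Sketch only: assumes <linux/perf_event.h>; "example_pmu" is made up. */
static int example_pmu_event_init(struct perf_event *event)
{
	/* not our event type: let the core try the next PMU */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* our event type, but this PMU does not support sampling */
	if (event->attr.sample_period)
		return -EINVAL;

	return 0;
}

With the type check hoisted to the top as in the diff, an unrelated event type
should get -ENOENT from both rapl PMUs instead of -EINVAL, so perf_init_event()
can continue iterating rather than failing the open outright.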
>
> Thanks,
>
> -Koichiro