Re: [PATCH V2 2/3] powercap: intel_rapl: Introduce APIs for PMU support

From: Zhang, Rui
Date: Wed Apr 17 2024 - 01:26:39 EST


Hi, Rafael,

Thanks for reviewing.
Will refresh the patch based on your feedback, just a few coments
below.

> > +
> > +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
> > +{
> > +       if (!rp->has_pmu)
> > +               return false;
> > +
> > +       if (rp->lead_cpu >= 0)
> > +               return cpu == rp->lead_cpu;
>
> So if the given CPU is not the lead CPU, but it is located in the
> same
> package as the lead CPU, the function will return 'false'.  TBH, this
> is somewhat confusing.
>
The above code actually applies to MSR RAPL because TPMI RAPL has
lead_cpu < 0.

Instead, I can use something like below to avoid the confusion.
if (rp->priv->type != RAPL_IF_TPMI)
return false;
and do future improvements when adding support for MSR RAPL.

> > +static void __rapl_pmu_event_start(struct perf_event *event)
> > +{
> > +       struct rapl_package_pmu_data *data =
> > event_to_pmu_data(event);
> > +
> > +       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> > +               return;
> > +
> > +       event->hw.state = 0;
> > +
> > +       list_add_tail(&event->active_entry, &data->active_list);
> > +
> > +       local64_set(&event->hw.prev_count,
> > event_read_counter(event));
> > +       if (++data->n_active == 1)
> > +               hrtimer_start(&data->hrtimer, data->timer_interval,
> > +                             HRTIMER_MODE_REL_PINNED);
> > +}
> > +
> > +static void rapl_pmu_event_start(struct perf_event *event, int
> > mode)
> > +{
> > +       struct rapl_package_pmu_data *data =
> > event_to_pmu_data(event);
> > +       unsigned long flags;
> > +
> > +       raw_spin_lock_irqsave(&data->lock, flags);
> > +       __rapl_pmu_event_start(event);
> > +       raw_spin_unlock_irqrestore(&data->lock, flags);
>
> Why does it need to be raw_spin_lock_?
>
> What exactly is protected by data->lock?
>
This is copied from MSR RAPL PMU, which exists from day 1 of the code.

Let me double check.

>
> > +
> > +static ssize_t cpumask_show(struct device *dev,
> > +                           struct device_attribute *attr, char
> > *buf)
> > +{
> > +       struct rapl_package *rp;
> > +       int cpu;
> > +
> > +       cpus_read_lock();
>
> Is rapl_packages protected by this?

yes.
>
> > +       cpumask_clear(&rapl_pmu.cpu_mask);
>
> It doesn't look like rapl_pmu.cpu_mask is used outside this function,
> so why is it global?

Good catch, will fix it.
>
> > +static int rapl_pmu_update(struct rapl_package *rp)
> > +{
> > +       int ret;
> > +
> > +       /* Return if PMU already covers all events supported by
> > current RAPL Package */
> > +       if (rapl_pmu.registered && !(rp->domain_map &
> > (~rapl_pmu.domain_map)))
> > +               return 0;
> > +
> > +       /* Unregister previous registered PMU */
> > +       if (rapl_pmu.registered) {
> > +               perf_pmu_unregister(&rapl_pmu.pmu);
> > +               memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> > +       }
> > +
> > +       rapl_pmu.domain_map |= rp->domain_map;
> > +
> > +       memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> > +       rapl_pmu.pmu.attr_groups = pmu_attr_groups;
> > +       rapl_pmu.pmu.attr_update = pmu_attr_update;
> > +       rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
> > +       rapl_pmu.pmu.event_init = rapl_pmu_event_init;
> > +       rapl_pmu.pmu.add = rapl_pmu_event_add;
> > +       rapl_pmu.pmu.del = rapl_pmu_event_del;
> > +       rapl_pmu.pmu.start = rapl_pmu_event_start;
> > +       rapl_pmu.pmu.stop = rapl_pmu_event_stop;
> > +       rapl_pmu.pmu.read = rapl_pmu_event_read;
> > +       rapl_pmu.pmu.module = THIS_MODULE;
> > +       rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE |
> > PERF_PMU_CAP_NO_INTERRUPT;
> > +       ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> > +       if (ret)
> > +               pr_warn("Failed to register PMU\n");
> > +
> > +       rapl_pmu.registered = !ret;
>
> Why don't you set rp->has_pmu here?
>
> > +
> > +       return ret;
>
> It looks like this could be rearranged overall for more clarity:
>
> ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> if (ret) {
>         pr_warn("Failed to register PMU\n");
>         return ret;
> }
>
> rapl_pmu.registered = true;
> rp->has_pmu = true;
>
> return 0;
>
Sure.

In my previous design,
rapl_pmu_update() updates generic RAPL PMU.
rapl_package_add_pmu() updates a given RAPL package.
that is why I put
rp->has_pmu = true;
in rapl_package_add_pmu().

> Also, the "Failed to register PMU\n" message is not particularly
> useful AFAICS.  It would be good to add some more context to it and
> maybe make it pr_info()?
>
sure.

> > +}
> > +
> > +int rapl_package_add_pmu(struct rapl_package *rp)
> > +{
> > +       struct rapl_package_pmu_data *data = &rp->pmu_data;
> > +       int idx;
> > +       int ret;
> > +
> > +       if (rp->has_pmu)
> > +               return -EEXIST;
> > +
> > +       guard(cpus_read_lock)();
>
> Why does this lock need to be held around the entire code below?
>

This guaranteed that the RAPL Package is always valid and rapl_pmu
global variable is protected when updating the PMU.

> > +
> > +       for (idx = 0; idx < rp->nr_domains; idx++) {
> > +               struct rapl_domain *rd = &rp->domains[idx];
> > +               int domain = rd->id;
> > +               u64 val;
> > +
> > +               if (!test_bit(domain, &rp->domain_map))
> > +                       continue;
> > +
> > +               /*
> > +                * The RAPL PMU granularity is 2^-32 Joules
> > +                * data->scale[]: times of 2^-32 Joules for each
> > ENERGY COUNTER increase
> > +                */
> > +               val = rd->energy_unit * (1ULL << 32);
> > +               do_div(val, ENERGY_UNIT_SCALE * 1000000);
> > +               data->scale[domain] = val;
> > +
> > +               if (!rapl_pmu.timer_ms) {
> > +                       struct rapl_primitive_info *rpi =
> > get_rpi(rp, ENERGY_COUNTER);
> > +
> > +                       /*
> > +                        * Calculate the timer rate:
> > +                        * Use reference of 200W for scaling the
> > timeout to avoid counter
> > +                        * overflows.
> > +                        *
> > +                        * max_count = rpi->mask >> rpi->shift + 1
> > +                        * max_energy_pj = max_count * rd-
> > >energy_unit
> > +                        * max_time_sec = (max_energy_pj /
> > 1000000000) / 200w
> > +                        *
> > +                        * rapl_pmu.timer_ms = max_time_sec * 1000
> > / 2
> > +                        */
> > +                       val = (rpi->mask >> rpi->shift) + 1;
> > +                       val *= rd->energy_unit;
> > +                       do_div(val, 1000000 * 200 * 2);
> > +                       rapl_pmu.timer_ms = val;
> > +
> > +                       pr_info("%llu ms ovfl timer\n",
> > rapl_pmu.timer_ms);
>
> s/ovfl/overflow/
>
> And use pr_debug()?
>
> > +               }
> > +
> > +               pr_info("Domain %s: hw unit %lld * 2^-32 Joules\n",
> > rd->name, data->scale[domain]);
>
> pr_debug() here too?

These all follow the MSR RAPL PMU code, so that we see the same output
no matter using MSR RAPL or TPMI RAPL. I can change them to pr_debug().

Thanks,
rui