Re: [PATCH v3 3/4] perf: xgene: Add APM X-Gene SoC Performance Monitoring Unit driver

From: Mark Rutland
Date: Thu Jun 16 2016 - 11:40:18 EST


Hi,

On Thu, Jun 09, 2016 at 06:24:51PM -0700, Tai Nguyen wrote:
> +#include <linux/acpi.h>
> +#include <linux/efi.h>
> +#include <linux/module.h>
> +#include <linux/io.h>
> +#include <linux/clk.h>
> +#include <linux/of_platform.h>
> +#include <linux/of_address.h>
> +#include <linux/of_fdt.h>
> +#include <linux/of_irq.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/perf_event.h>
> +#include <linux/module.h>
> +#include <linux/cpumask.h>
> +#include <linux/slab.h>
> +#include <linux/mfd/syscon.h>
> +#include <linux/regmap.h>

Minor nit: please sort these alphabetically (it makes scanning them
by-eye easier and can help when there are conflicts).

[...]

> +static struct dev_ext_attribute l3c_pmu_format_attrs[] = {
> + PMU_FORMAT_EXT_ATTR(l3c_eventid, "config:0-7"),
> + PMU_FORMAT_EXT_ATTR(l3c_agentid, "config1:0-9"),
> +};
> +
> +static struct dev_ext_attribute iob_pmu_format_attrs[] = {
> + PMU_FORMAT_EXT_ATTR(iob_eventid, "config:0-7"),
> + PMU_FORMAT_EXT_ATTR(iob_agentid, "config1:0-63"),
> +};
> +
> +static struct dev_ext_attribute mcb_pmu_format_attrs[] = {
> + PMU_FORMAT_EXT_ATTR(mcb_eventid, "config:0-5"),
> + PMU_FORMAT_EXT_ATTR(mcb_agentid, "config1:0-9"),
> +};
> +
> +static struct dev_ext_attribute mc_pmu_format_attrs[] = {
> + PMU_FORMAT_EXT_ATTR(mc_eventid, "config:0-28"),
> +};

Please make these structures const.

> +static struct attribute_group pmu_format_attr_group = {
> + .name = "format",
> + .attrs = NULL, /* Filled in xgene_pmu_alloc_attrs */
> +};

I think you should just have a (static const) format_attr_group for each
class, which you reuse and share across instances, rather than bothering
with dynamic allocation (which I think is broken -- more on that below).

To save some pain, I think you can use a compound literal to initialise
each of those in one go:

static const struct attribute_group l3c_pmu_format_attr_group = {
.name = "format",
.attrs = (const struct dev_ext_attribute[]) {
PMU_FORMAT_EXT_ATTR(l3c_eventid, "config:0-7"),
PMU_FORMAT_EXT_ATTR(l3c_agentid, "config1:0-9"),
},
};

You'll need a full pmu attr group for each class also, but you don't
lose much by doing so.

> +
> +/*
> + * sysfs event attributes
> + */
> +static ssize_t _xgene_pmu_event_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct dev_ext_attribute *eattr;
> +
> + eattr = container_of(attr, struct dev_ext_attribute, attr);
> + return sprintf(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> +}
> +
> +#define PMU_EVENT_EXT_ATTR(_name, _config) \
> + PMU_EXT_ATTR(_name, _xgene_pmu_event_show, (unsigned long)_config)
> +
> +static struct dev_ext_attribute l3c_pmu_events_attrs[] = {
> + PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> + PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> + PMU_EVENT_EXT_ATTR(read-hit, 0x02),
> + PMU_EVENT_EXT_ATTR(read-miss, 0x03),
> + PMU_EVENT_EXT_ATTR(write-need-replacement, 0x06),
> + PMU_EVENT_EXT_ATTR(write-not-need-replacement, 0x07),
> + PMU_EVENT_EXT_ATTR(tq-full, 0x08),
> + PMU_EVENT_EXT_ATTR(ackq-full, 0x09),
> + PMU_EVENT_EXT_ATTR(wdb-full, 0x0a),
> + PMU_EVENT_EXT_ATTR(bank-fifo-full, 0x0b),
> + PMU_EVENT_EXT_ATTR(odb-full, 0x0c),
> + PMU_EVENT_EXT_ATTR(wbq-full, 0x0d),
> + PMU_EVENT_EXT_ATTR(bank-conflict-fifo-issue, 0x0e),
> + PMU_EVENT_EXT_ATTR(bank-fifo-issue, 0x0f),
> +};
> +
> +static struct dev_ext_attribute iob_pmu_events_attrs[] = {
> + PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> + PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> + PMU_EVENT_EXT_ATTR(axi0-read, 0x02),
> + PMU_EVENT_EXT_ATTR(axi0-read-partial, 0x03),
> + PMU_EVENT_EXT_ATTR(axi1-read, 0x04),
> + PMU_EVENT_EXT_ATTR(axi1-read-partial, 0x05),
> + PMU_EVENT_EXT_ATTR(csw-read-block, 0x06),
> + PMU_EVENT_EXT_ATTR(csw-read-partial, 0x07),
> + PMU_EVENT_EXT_ATTR(axi0-write, 0x10),
> + PMU_EVENT_EXT_ATTR(axi0-write-partial, 0x11),
> + PMU_EVENT_EXT_ATTR(axi1-write, 0x13),
> + PMU_EVENT_EXT_ATTR(axi1-write-partial, 0x14),
> + PMU_EVENT_EXT_ATTR(csw-inbound-dirty, 0x16),
> +};
> +
> +static struct dev_ext_attribute mcb_pmu_events_attrs[] = {
> + PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> + PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> + PMU_EVENT_EXT_ATTR(csw-read, 0x02),
> + PMU_EVENT_EXT_ATTR(csw-write-request, 0x03),
> + PMU_EVENT_EXT_ATTR(mcb-csw-stall, 0x04),
> + PMU_EVENT_EXT_ATTR(cancel-read-gack, 0x05),
> +};
> +
> +static struct dev_ext_attribute mc_pmu_events_attrs[] = {
> + PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> + PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> + PMU_EVENT_EXT_ATTR(act-cmd-sent, 0x02),
> + PMU_EVENT_EXT_ATTR(pre-cmd-sent, 0x03),
> + PMU_EVENT_EXT_ATTR(rd-cmd-sent, 0x04),
> + PMU_EVENT_EXT_ATTR(rda-cmd-sent, 0x05),
> + PMU_EVENT_EXT_ATTR(wr-cmd-sent, 0x06),
> + PMU_EVENT_EXT_ATTR(wra-cmd-sent, 0x07),
> + PMU_EVENT_EXT_ATTR(pde-cmd-sent, 0x08),
> + PMU_EVENT_EXT_ATTR(sre-cmd-sent, 0x09),
> + PMU_EVENT_EXT_ATTR(prea-cmd-sent, 0x0a),
> + PMU_EVENT_EXT_ATTR(ref-cmd-sent, 0x0b),
> + PMU_EVENT_EXT_ATTR(rd-rda-cmd-sent, 0x0c),
> + PMU_EVENT_EXT_ATTR(wr-wra-cmd-sent, 0x0d),
> + PMU_EVENT_EXT_ATTR(in-rd-collision, 0x0e),
> + PMU_EVENT_EXT_ATTR(in-wr-collision, 0x0f),
> + PMU_EVENT_EXT_ATTR(collision-queue-not-empty, 0x10),
> + PMU_EVENT_EXT_ATTR(collision-queue-full, 0x11),
> + PMU_EVENT_EXT_ATTR(mcu-request, 0x12),
> + PMU_EVENT_EXT_ATTR(mcu-rd-request, 0x13),
> + PMU_EVENT_EXT_ATTR(mcu-hp-rd-request, 0x14),
> + PMU_EVENT_EXT_ATTR(mcu-wr-request, 0x15),
> + PMU_EVENT_EXT_ATTR(mcu-rd-proceed-all, 0x16),
> + PMU_EVENT_EXT_ATTR(mcu-rd-proceed-cancel, 0x17),
> + PMU_EVENT_EXT_ATTR(mcu-rd-response, 0x18),
> + PMU_EVENT_EXT_ATTR(mcu-rd-proceed-speculative-all, 0x19),
> + PMU_EVENT_EXT_ATTR(mcu-rd-proceed-speculative-cancel, 0x1a),
> + PMU_EVENT_EXT_ATTR(mcu-wr-proceed-all, 0x1b),
> + PMU_EVENT_EXT_ATTR(mcu-wr-proceed-cancel, 0x1c),
> +};
> +
> +static struct attribute_group pmu_events_attr_group = {
> + .name = "events",
> + .attrs = NULL, /* Filled in xgene_pmu_alloc_attrs */
> +};

My comments for the format groups apply here too.

> +
> +/*
> + * sysfs cpumask attributes
> + */
> +static ssize_t xgene_pmu_cpumask_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct xgene_pmu_dev *pmu_dev = to_pmu_dev(dev_get_drvdata(dev));
> +
> + return cpumap_print_to_pagebuf(true, buf, &pmu_dev->parent->cpu);
> +}
> +static DEVICE_ATTR(cpumask, S_IRUGO, xgene_pmu_cpumask_show, NULL);
> +
> +static struct attribute *xgene_pmu_cpumask_attrs[] = {
> + &dev_attr_cpumask.attr,
> + NULL,
> +};
> +
> +static struct attribute_group pmu_cpumask_attr_group = {
> + .attrs = xgene_pmu_cpumask_attrs,
> +};

Please make these const.

> +
> +static const struct attribute_group *pmu_attr_groups[] = {
> + &pmu_format_attr_group,
> + &pmu_cpumask_attr_group,
> + &pmu_events_attr_group,
> + NULL
> +};

As mentioned earlier, you'll need one of these per class.

[...]

> +static int xgene_perf_event_init(struct perf_event *event)
> +{
> + struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u64 config, config1;
> +
> + /* Test the event attr type check for PMU enumeration */
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + /*
> + * SOC PMU counters are shared across all cores.
> + * Therefore, it does not support per-process mode.
> + * Also, it does not support event sampling mode.
> + */
> + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
> + return -EINVAL;
> +
> + /* SOC counters do not have usr/os/guest/host bits */
> + if (event->attr.exclude_user || event->attr.exclude_kernel ||
> + event->attr.exclude_host || event->attr.exclude_guest)
> + return -EINVAL;
> +
> + if (event->cpu < 0)
> + return -EINVAL;
> + /*
> + * Many perf core operations (eg. events rotation) operate on a
> + * single CPU context. This is obvious for CPU PMUs, where one
> + * expects the same sets of events being observed on all CPUs,
> + * but can lead to issues for off-core PMUs, where each
> + * event could be theoretically assigned to a different CPU. To
> + * mitigate this, we enforce CPU assignment to one, selected
> + * processor (the one described in the "cpumask" attribute).
> + */
> + event->cpu = cpumask_first(&pmu_dev->parent->cpu);

I didn't spot a perf_pmu_migrate_context call anywhere.

Do you not plan to handle hotplug?

> +
> + config = event->attr.config;
> + config1 = event->attr.config1;
> +
> + hwc->config = config;
> + /*
> + * Each bit of the config1 field represents an agent from which the
> + * request of the event come. The event is counted only if it's caused
> + * by a request of an agent has the bit set.
> + * By default, the event is counted for all agents.
> + */
> + if (config1)
> + hwc->extra_reg.config = config1;
> + else
> + hwc->extra_reg.config = 0xFFFFFFFFFFFFFFFFULL;
> +

If you treated the bits as having the opposite meaning (i.e. a bit set
means ignore an agent), then the zero default would not have to be
special-cased here.

Currently 0 and 0xFFFFFFFFFFFFFFFFULL mean the same thing, which is a
little odd.

You also need to verify the event grouping, e.g. avoiding cross-pmu
groups. See what we do in the arm-ccn driver.

[...]

> + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, count)
> + != prev_raw_count)
> + return;

Please leave the '!=' dangling on the first line. It makes it clearer
that there is more to the expression.

[...]

> +static struct attribute **alloc_attrs(struct device *dev, u32 n,
> + struct dev_ext_attribute *dev_attr)
> +{
> + struct attribute **attrs;
> + int i;
> +
> + /* Alloc n + 1 (for terminating NULL) */
> + attrs = devm_kcalloc(dev, n + 1, sizeof(struct attribute *),
> + GFP_KERNEL);
> + if (!attrs)
> + return attrs;
> +
> + for (i = 0; i < n; i++)
> + attrs[i] = &dev_attr[i].attr.attr;
> + return attrs;
> +}
> +
> +static int
> +xgene_pmu_alloc_attrs(struct device *dev, struct xgene_pmu_dev *pmu_dev)
> +{
> + struct attribute **attrs;
> +
> + if (pmu_dev->nformat_attrs) {
> + attrs = alloc_attrs(dev, pmu_dev->nformat_attrs,
> + pmu_dev->format_attr);
> + if (!attrs)
> + return -ENOMEM;
> + pmu_format_attr_group.attrs = attrs;
> + }
> +
> + if (pmu_dev->nevents_attrs) {
> + attrs = alloc_attrs(dev, pmu_dev->nevents_attrs,
> + pmu_dev->events_attr);
> + if (!attrs)
> + return -ENOMEM;
> + pmu_events_attr_group.attrs = attrs;
> + }
> +
> + pmu_dev->attr_groups = pmu_attr_groups;
> +
> + return 0;
> +}

I don't see why we need to dynamically allocate anything, given we make
a full copy of each of the existing arrays without modification. We
should just be able to use them as-is.

As I mentioned above, please just statically allocate the structures you
need and share them.

[...]

> +static int
> +xgene_pmu_dev_add(struct xgene_pmu *xgene_pmu, struct xgene_pmu_dev_ctx *ctx)
> +{
> + struct device *dev = xgene_pmu->dev;
> + struct xgene_pmu_dev *pmu;
> + int rc;
> +
> + pmu = devm_kzalloc(dev, sizeof(*pmu), GFP_KERNEL);
> + if (!pmu)
> + return -ENOMEM;
> + pmu->parent = xgene_pmu;
> + pmu->inf = &ctx->inf;
> + ctx->pmu_dev = pmu;
> +
> + switch (pmu->inf->type) {
> + case PMU_TYPE_L3C:
> + pmu->format_attr = l3c_pmu_format_attrs;
> + pmu->nformat_attrs = ARRAY_SIZE(l3c_pmu_format_attrs);
> + pmu->events_attr = l3c_pmu_events_attrs;
> + pmu->nevents_attrs = ARRAY_SIZE(l3c_pmu_events_attrs);
> + break;
> + case PMU_TYPE_IOB:
> + pmu->format_attr = iob_pmu_format_attrs;
> + pmu->nformat_attrs = ARRAY_SIZE(iob_pmu_format_attrs);
> + pmu->events_attr = iob_pmu_events_attrs;
> + pmu->nevents_attrs = ARRAY_SIZE(iob_pmu_events_attrs);
> + break;
> + case PMU_TYPE_MCB:
> + if (!(xgene_pmu->mcb_active_mask & pmu->inf->enable_mask))
> + goto dev_err;
> + pmu->format_attr = mcb_pmu_format_attrs;
> + pmu->nformat_attrs = ARRAY_SIZE(mcb_pmu_format_attrs);
> + pmu->events_attr = mcb_pmu_events_attrs;
> + pmu->nevents_attrs = ARRAY_SIZE(mcb_pmu_events_attrs);
> + break;
> + case PMU_TYPE_MC:
> + if (!(xgene_pmu->mc_active_mask & pmu->inf->enable_mask))
> + goto dev_err;
> + pmu->format_attr = mc_pmu_format_attrs;
> + pmu->nformat_attrs = ARRAY_SIZE(mc_pmu_format_attrs);
> + pmu->events_attr = mc_pmu_events_attrs;
> + pmu->nevents_attrs = ARRAY_SIZE(mc_pmu_events_attrs);
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + rc = xgene_pmu_alloc_attrs(dev, pmu);
> + if (rc) {
> + dev_err(dev, "%s PMU: Failed to alloc attributes\n", ctx->name);
> + goto dev_err;
> + }
> +
> + rc = xgene_init_perf(pmu, ctx->name);
> + if (rc) {
> + dev_err(dev, "%s PMU: Failed to init perf driver\n", ctx->name);
> + goto dev_err;
> + }
> +
> + dev_info(dev, "%s PMU registered\n", ctx->name);
> +
> + /* All attribute allocations can be free'd after perf_register_pmu */
> + deallocate_attrs(dev, pmu);

As far as I can see, that is not true. The core perf code doesn't make a
copy of the attrs, it just continues to use them as-is. So this looks to
be broken and very unsafe.

Please get rid of the dynamic allocation and copying.

[...]

> +static irqreturn_t _xgene_pmu_isr(int irq, struct xgene_pmu_dev *pmu_dev)
> +{
> + struct perf_event *event = NULL;
> + struct perf_sample_data data;

You never sample, so don't need perf_sample_data.

> + struct xgene_pmu *xgene_pmu;
> + struct hw_perf_event *hwc;
> + int idx;
> + u32 val;
> +
> + /* Get interrupt counter source */
> + val = readl(pmu_dev->inf->csr + PMU_PMOVSR);
> + idx = ffs(val) - 1;

bit 0 is never set?

What if multiple counters overflow? You should iterate over all the
counters which have overflown (as we do in the arm-ccn driver).

> + if (!(val & PMU_OVERFLOW_MASK))
> + goto out;
> + event = pmu_dev->pmu_counter_event[idx];
> +
> + /* Ignore if we don't have an event. */
> + if (!event)
> + goto out;
> +
> + hwc = &event->hw;
> +
> + xgene_perf_event_update(event, hwc, idx);
> + perf_sample_data_init(&data, 0, hwc->last_period);

You aren't sampling, so don't need this.

> + if (!xgene_perf_event_set_period(event, hwc, idx))
> + goto out;
> +
> +out:
> + /* Clear interrupt flag */
> + xgene_pmu = pmu_dev->parent;
> + if (xgene_pmu->version == 1)


It would be better to consistently use the PCP_PMU_V{1,2} defines.

[...]

> + version = (dev_id == PCP_PMU_V1) ? 1 : 2;

As above, why not set version to the dev_id, and use the PCP_PMU_V*
defines consistently when comparing xgene_pmu::version?

> + irq = platform_get_irq(pdev, 0);
> + if (irq < 0) {
> + dev_err(&pdev->dev, "No IRQ resource\n");
> + rc = -EINVAL;
> + goto err;
> + }
> + rc = devm_request_irq(&pdev->dev, irq, xgene_pmu_isr, IRQF_SHARED,
> + dev_name(&pdev->dev), xgene_pmu);
> + if (rc) {
> + dev_err(&pdev->dev, "Could not request IRQ %d\n", irq);
> + goto err;
> + }

> + /* Pick one core to use for cpumask attributes */
> + cpumask_set_cpu(smp_processor_id(), &xgene_pmu->cpu);

You also need to ensure that the IRQ occurs on this CPU.

Thanks,
Mark.