Re: [PATCH v7 4/5] perf: CXL Performance Monitoring Unit driver

From: Namhyung Kim
Date: Sat May 27 2023 - 02:26:44 EST


On Fri, May 26, 2023 at 3:00 AM Jonathan Cameron
<Jonathan.Cameron@xxxxxxxxxx> wrote:
>
> CXL rev 3.0 introduces a standard performance monitoring hardware
> block to CXL. Instances are discovered using CXL Register Locator DVSEC
> entries. Each CXL component may have multiple PMUs.
>
> This initial driver supports a subset of types of counter.
> It supports counters that are either fixed or configurable, but requires
> that they support the ability to freeze and write value whilst frozen.
>
> Development done with QEMU model which will be posted shortly.
>
> Example:
>
> $ perf stat -a -e cxl_pmu_mem0.0/h2d_req_snpcur/ -e cxl_pmu_mem0.0/h2d_req_snpdata/ -e cxl_pmu_mem0.0/clock_ticks/ sleep 1
>
> Performance counter stats for 'system wide':
>
> 96,757,023,244,321 cxl_pmu_mem0.0/h2d_req_snpcur/
> 96,757,023,244,365 cxl_pmu_mem0.0/h2d_req_snpdata/
> 193,514,046,488,653 cxl_pmu_mem0.0/clock_ticks/
>
> 1.090539600 seconds time elapsed
>
> Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx>
> Reviewed-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
> ---
> v7: Thanks to Namhyung and Stephane for reviews
> - Set the cpu for an event only after validation checks.
> - Added -a to make perf default of all CPUs explicit in command line.
> - Fix wrong event names in example.
> ---

[SNIP]
> +static void cxl_pmu_event_start(struct perf_event *event, int flags)
> +{
> + struct cxl_pmu_info *info = pmu_to_cxl_pmu_info(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + void __iomem *base = info->base;
> + u64 cfg;
> +
> + /*
> + * All paths to here should either set these flags directly or
> + * call cxl_pmu_event_stop() which will ensure the correct state.
> + */
> + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
> + return;
> +
> + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
> + hwc->state = 0;
> +
> + /*
> + * Currently only hdm filter control is implemnted, this code will

Typo: implemented

> + * want generalizing when more filters are added.
> + */
> + if (info->filter_hdm) {
> + if (cxl_pmu_config1_hdm_filter_en(event))
> + cfg = cxl_pmu_config2_get_hdm_decoder(event);
> + else
> + cfg = GENMASK(15, 0); /* No filtering if 0xFFFF_FFFF */
> + writeq(cfg, base + CXL_PMU_FILTER_CFG_REG(hwc->idx, 0));
> + }
> +
> + cfg = readq(base + CXL_PMU_COUNTER_CFG_REG(hwc->idx));
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_INT_ON_OVRFLW, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_FREEZE_ON_OVRFLW, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_ENABLE, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EDGE,
> + cxl_pmu_config1_get_edge(event) ? 1 : 0);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_INVERT,
> + cxl_pmu_config1_get_invert(event) ? 1 : 0);
> +
> + /* Fixed purpose counters have next two fields RO */
> + if (test_bit(hwc->idx, info->conf_counter_bm)) {
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EVENT_GRP_ID_IDX_MSK,
> + hwc->event_base);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EVENTS_MSK,
> + cxl_pmu_config_get_mask(event));
> + }
> + cfg &= ~CXL_PMU_COUNTER_CFG_THRESHOLD_MSK;
> + /*
> + * For events that generate only 1 count per clock the CXL 3.0 spec
> + * states the threshold shall be set to 1 but if set to 0 it will
> + * count the raw value anwyay?

Typo: anyway?

> + * There is no definition of what events will count multiple per cycle
> + * and hence to which non 1 values of threshold can apply.
> + * (CXL 3.0 8.2.7.2.1 Counter Configuration - threshold field definition)
> + */
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_THRESHOLD_MSK,
> + cxl_pmu_config1_get_threshold(event));
> + writeq(cfg, base + CXL_PMU_COUNTER_CFG_REG(hwc->idx));
> +
> + local64_set(&hwc->prev_count, 0);
> + writeq(0, base + CXL_PMU_COUNTER_REG(hwc->idx));
> +
> + perf_event_update_userpage(event);
> +}
> +

[SNIP]
> +
> +static int cxl_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> +
> + if (info->on_cpu != -1)
> + return 0;
> +
> + info->on_cpu = cpu;
> + /*
> + * CPU HP lock is held so we should be guaranteed that the CPU hasn't yet
> + * gone away again.
> + */
> + WARN_ON(irq_set_affinity(info->irq, cpumask_of(cpu)));
> +
> + return 0;
> +}
> +
> +static int cxl_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> + unsigned int target;
> +
> + if (info->on_cpu != cpu)
> + return 0;
> +
> + info->on_cpu = -1;
> + target = cpumask_any_but(cpu_online_mask, cpu);
> + if (target >= nr_cpu_ids) {
> + dev_err(info->pmu.dev, "Unable to find a suitable CPU\n");
> + return 0;
> + }
> +
> + perf_pmu_migrate_context(&info->pmu, cpu, target);
> + info->on_cpu = target;
> + /*
> + * CPU HP lock is held so we should be guaranteed that this CPU hasn't yet
> + * gone away.
> + */
> + WARN_ON(irq_set_affinity(info->irq, cpumask_of(target)));
> +
> + return 0;
> +}

IIUC, a CXL PMU device (say cxl_pmu_mem0.0) is shared across
all CPUs and would return the same value when read from any CPU,
right?

Thanks,
Namhyung


> +
> +static __init int cxl_pmu_init(void)
> +{
> + int rc;
> +
> + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> + "AP_PERF_CXL_PMU_ONLINE",
> + cxl_pmu_online_cpu, cxl_pmu_offline_cpu);
> + if (rc < 0)
> + return rc;
> + cxl_pmu_cpuhp_state_num = rc;
> +
> + rc = cxl_driver_register(&cxl_pmu_driver);
> + if (rc)
> + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> +
> + return rc;
> +}
> +
> +static __exit void cxl_pmu_exit(void)
> +{
> + cxl_driver_unregister(&cxl_pmu_driver);
> + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> +}
> +
> +MODULE_LICENSE("GPL");
> +MODULE_IMPORT_NS(CXL);
> +module_init(cxl_pmu_init);
> +module_exit(cxl_pmu_exit);
> +MODULE_ALIAS_CXL(CXL_DEVICE_PMU);
> --
> 2.39.2
>