Re: [PATCH 1/1] perf/x86: Add Intel power cstate PMUs support

From: Stephane Eranian
Date: Fri Jul 24 2015 - 16:45:47 EST


On Fri, Jul 24, 2015 at 5:39 AM, <kan.liang@xxxxxxxxx> wrote:
> From: Kan Liang <kan.liang@xxxxxxxxx>
>
> This patch adds new PMUs to support power cstate related free running
> (read-only) counters. These counters may be used simultaneously by other
> tools, such as turbostat. However, it still make sense to implement them
> in perf. Because we can conveniently collect them together with other
> events, and allow to use them from tools without special MSR access
> code.
>
> These counters include CORE_C*_RESIDENCY and PKG_C*_RESIDENCY.
> According to counters' scope and category, two PMUs are registered with
> the perf_event core subsystem.
> - 'cstate_core': The counter is available for each processor core. The
> counters include CORE_C*_RESIDENCY.
> - 'cstate_pkg': The counter is available for each physical package. The
> counters include PKG_C*_RESIDENCY.
>
> The events are exposed in sysfs for use by perf stat and other tools.
> The files are:
> /sys/devices/cstate_core/events/c*-residency
> /sys/devices/cstate_pkg/events/c*-residency
>
> These events only support system-wide mode counting.
> The /sys/devices/cstate_*/cpumask file can be used by tools to figure
> out which CPUs to monitor by default.
>
> The PMU type (attr->type) is dynamically allocated and is available from
> /sys/devices/core_misc/type and /sys/device/cstate_*/type.
>
> Sampling is not supported.
>
> Here is an example.
>
> - To caculate the fraction of time when the core is running in C6 state
> CORE_C6_time% = CORE_C6_RESIDENCY / TSC
>
> $ perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0
> -- taskset -c 0 sleep 5
> 11453299029,,cstate_core/c6-residency/,4998723500,100.00
> 11476922553,,msr/tsc/,4998738042,100.00
> For sleep, 99.8% of time run in C6 state.
>
> $ perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0
> -- taskset -c 0 busyloop
> 1253316,,cstate_core/c6-residency/,4360969154,100.00
> 10012635248,,msr/tsc/,4360972366,100.00
> For busyloop, 0.01% of time run in C6 state.
>
> Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/Makefile | 1 +
> arch/x86/kernel/cpu/perf_event_intel_cstate.c | 753 ++++++++++++++++++++++++++
> 2 files changed, 754 insertions(+)
> create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cstate.c
>
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 9bff687..4cdb270 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -41,6 +41,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
> obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
> obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
> obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
> +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o
>
> obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
> perf_event_intel_uncore_snb.o \
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
> new file mode 100644
> index 0000000..8b397a6
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
> @@ -0,0 +1,753 @@
> +/*
> + * perf_event_intel_power_cstate.c: support power cstate counters
> + *
> + * Copyright (C) 2015, Intel Corp.
> + * Author: Kan Liang (kan.liang@xxxxxxxxx)
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Library General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Library General Public License for more details.
> + *
> + */
> +
> +/*
> + * This file export cstate related free running (read-only) counters
> + * for perf. These counters may be use simultaneously by other tools,
> + * such as turbostat. However, it still make sense to implement them
> + * in perf. Because we can conveniently collect them together with
> + * other events, and allow to use them from tools without special MSR
> + * access code.
> + *
> + * The events only support system-wide mode counting. There is no
> + * sampling support because it is not supported by the hardware.
> + *
> + * All of these counters are specified in the Intel 64 and IA-32
> + * Architectures Software Developer.s Manual Vol3b.
> + *
> + * Model specific counters:
> + * MSR_CORE_C1_RES: CORE C1 Residency Counter
> + * perf code: 0x00
> + * Available model: SLM,AMT
> + * Scope: Core (each processor core has a MSR)
> + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
> + * perf code: 0x01
> + * Available model: NHM,WSM,SNB,IVB,HSW,BDW
> + * Scope: Core
> + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
> + * perf code: 0x02
> + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
> + * Scope: Core
> + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
> + * perf code: 0x03
> + * Available model: SNB,IVB,HSW,BDW
> + * Scope: Core
> + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
> + * perf code: 0x04
> + * Available model: SNB,IVB,HSW,BDW
> + * Scope: Package (physical package)
> + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
> + * perf code: 0x05
> + * Available model: NHM,WSM,SNB,IVB,HSW,BDW
> + * Scope: Package (physical package)
> + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
> + * perf code: 0x06
> + * Available model: NHM,WSM,SNB,IVB,HSW,BDW
> + * Scope: Package (physical package)
> + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
> + * perf code: 0x07
> + * Available model: NHM,WSM,SNB,IVB,HSW,BDW
> + * Scope: Package (physical package)
> + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
> + * perf code: 0x08
> + * Available model: HSW ULT only
> + * Scope: Package (physical package)
> + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
> + * perf code: 0x09
> + * Available model: HSW ULT only
> + * Scope: Package (physical package)
> + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
> + * perf code: 0x0a
> + * Available model: HSW ULT only
> + * Scope: Package (physical package)
> + * MSR_SLM_PKG_C6_RESIDENCY: Package C6 Residency Counter for SLM.
> + * perf code: 0x0b
> + * Available model: SLM,AMT
> + * Scope: Package (physical package)
> + *
> + */
> +
Why would the user (and the tools) have to change the event code to
measure, let's
say C6 residency, between SLM (0xb) and and BDW (0x6)? Are they not measuring
the same thing?


> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/perf_event.h>
> +#include <asm/cpu_device_id.h>
> +#include "perf_event.h"
> +
> +struct intel_power_cstate_type {
> + struct pmu pmu;
> + const char *name;
> + int type;
> + const struct attribute_group **pmu_group;
> +};
> +
> +enum perf_intel_power_cstate_type {
> + perf_intel_power_cstate_core = 0,
> + perf_intel_power_cstate_pkg,
> +};
> +
> +struct perf_power_cstate_event_msr {
> + int id;
> + u64 msr;
> +};
> +
> +enum perf_power_cstate_id {
> + /*
> + * power_cstate events, generalized by the kernel:
> + */
> + PERF_POWER_CORE_C1_RES = 0,
> + PERF_POWER_CORE_C3_RES,
> + PERF_POWER_CORE_C6_RES,
> + PERF_POWER_CORE_C7_RES,
> + PERF_POWER_PKG_C2_RES,
> + PERF_POWER_PKG_C3_RES,
> + PERF_POWER_PKG_C6_RES,
> + PERF_POWER_PKG_C7_RES,
> + PERF_POWER_PKG_C8_RES,
> + PERF_POWER_PKG_C9_RES,
> + PERF_POWER_PKG_C10_RES,
> + PERF_POWER_SLM_PKG_C6_RES,
> +
> + PERF_POWER_CSTATE_EVENT_MAX, /* non-ABI */
> +};
> +
> +struct power_cstate_pmu {
> + struct intel_power_cstate_type *power_cstate_type;
> + struct pmu *pmu;
> +};
> +
> +
> +static struct intel_power_cstate_type *empty_power_cstate[] = { NULL, };
> +struct intel_power_cstate_type **power_cstate = empty_power_cstate;
> +
> +static struct perf_power_cstate_event_msr power_cstate_events[] = {
> + { PERF_POWER_CORE_C1_RES, MSR_CORE_C1_RES },
> + { PERF_POWER_CORE_C3_RES, MSR_CORE_C3_RESIDENCY },
> + { PERF_POWER_CORE_C6_RES, MSR_CORE_C6_RESIDENCY },
> + { PERF_POWER_CORE_C7_RES, MSR_CORE_C7_RESIDENCY },
> + { PERF_POWER_PKG_C2_RES, MSR_PKG_C2_RESIDENCY },
> + { PERF_POWER_PKG_C3_RES, MSR_PKG_C3_RESIDENCY },
> + { PERF_POWER_PKG_C6_RES, MSR_PKG_C6_RESIDENCY },
> + { PERF_POWER_PKG_C7_RES, MSR_PKG_C7_RESIDENCY },
> + { PERF_POWER_PKG_C8_RES, MSR_PKG_C8_RESIDENCY },
> + { PERF_POWER_PKG_C9_RES, MSR_PKG_C9_RESIDENCY },
> + { PERF_POWER_PKG_C10_RES, MSR_PKG_C10_RESIDENCY },
> + { PERF_POWER_SLM_PKG_C6_RES, MSR_PKG_C7_RESIDENCY },
> +};
> +
> +EVENT_ATTR_STR(c1-residency, power_core_c1_res, "event=0x00");
> +EVENT_ATTR_STR(c3-residency, power_core_c3_res, "event=0x01");
> +EVENT_ATTR_STR(c6-residency, power_core_c6_res, "event=0x02");
> +EVENT_ATTR_STR(c7-residency, power_core_c7_res, "event=0x03");
> +EVENT_ATTR_STR(c2-residency, power_pkg_c2_res, "event=0x04");
> +EVENT_ATTR_STR(c3-residency, power_pkg_c3_res, "event=0x05");
> +EVENT_ATTR_STR(c6-residency, power_pkg_c6_res, "event=0x06");
> +EVENT_ATTR_STR(c7-residency, power_pkg_c7_res, "event=0x07");
> +EVENT_ATTR_STR(c8-residency, power_pkg_c8_res, "event=0x08");
> +EVENT_ATTR_STR(c9-residency, power_pkg_c9_res, "event=0x09");
> +EVENT_ATTR_STR(c10-residency, power_pkg_c10_res, "event=0x0a");
> +EVENT_ATTR_STR(c6-residency, power_slm_pkg_c6_res, "event=0x0b");
> +
> +static cpumask_t power_cstate_core_cpu_mask;
> +static cpumask_t power_cstate_pkg_cpu_mask;
> +
> +static DEFINE_PER_CPU(struct power_cstate_pmu *, power_cstate_core_pmu);
> +static DEFINE_PER_CPU(struct power_cstate_pmu *, power_cstate_core_pmu_to_free);
> +static DEFINE_PER_CPU(struct power_cstate_pmu *, power_cstate_pkg_pmu);
> +static DEFINE_PER_CPU(struct power_cstate_pmu *, power_cstate_pkg_pmu_to_free);
> +
> +static int power_cstate_pmu_event_init(struct perf_event *event)
> +{
> + u64 cfg = event->attr.config;
> + int ret = 0;
> +
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + /*
> + * check event is known (determines counter)
> + */
> + if (cfg >= PERF_POWER_CSTATE_EVENT_MAX)
> + return -EINVAL;
> +
> + /* unsupported modes and filters */
> + if (event->attr.exclude_user ||
> + event->attr.exclude_kernel ||
> + event->attr.exclude_hv ||
> + event->attr.exclude_idle ||
> + event->attr.exclude_host ||
> + event->attr.exclude_guest ||
> + event->attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + /* must be done before validate_group */
> + event->hw.event_base = power_cstate_events[cfg].msr;
> + event->hw.config = cfg;
> + event->hw.idx = -1;
> +
> + return ret;
> +}
> +
> +static inline u64 power_cstate_pmu_read_counter(struct perf_event *event)
> +{
> + u64 val;
> +
> + rdmsrl_safe(event->hw.event_base, &val);
> + return val;
> +}
> +
> +static void power_cstate_pmu_event_update(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + u64 prev_raw_count, new_raw_count;
> +
> +again:
> + prev_raw_count = local64_read(&hwc->prev_count);
> + new_raw_count = power_cstate_pmu_read_counter(event);
> +
> + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> + new_raw_count) != prev_raw_count)
> + goto again;
> +
> + local64_add(new_raw_count - prev_raw_count, &event->count);
> +}
> +
> +static void power_cstate_pmu_event_start(struct perf_event *event, int mode)
> +{
> + local64_set(&event->hw.prev_count, power_cstate_pmu_read_counter(event));
> +}
> +
> +static void power_cstate_pmu_event_stop(struct perf_event *event, int mode)
> +{
> + power_cstate_pmu_event_update(event);
> +}
> +
> +static void power_cstate_pmu_event_del(struct perf_event *event, int mode)
> +{
> + power_cstate_pmu_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static int power_cstate_pmu_event_add(struct perf_event *event, int mode)
> +{
> + if (mode & PERF_EF_START)
> + power_cstate_pmu_event_start(event, mode);
> +
> + return 0;
> +}
> +
> +static ssize_t power_cstate_get_attr_cpumask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct pmu *pmu = dev_get_drvdata(dev);
> + struct intel_power_cstate_type *type;
> + int i;
> +
> + for (i = 0; power_cstate[i]; i++) {
> + type = power_cstate[i];
> + if (type->pmu.type == pmu->type) {
> + switch (type->type) {
> + case perf_intel_power_cstate_core:
> + return cpumap_print_to_pagebuf(true, buf, &power_cstate_core_cpu_mask);
> + case perf_intel_power_cstate_pkg:
> + return cpumap_print_to_pagebuf(true, buf, &power_cstate_pkg_cpu_mask);
> + default:
> + return 0;
> + }
> + }
> + }
> +
> + return 0;
> +}
> +
> +static DEVICE_ATTR(cpumask, S_IRUGO, power_cstate_get_attr_cpumask, NULL);
> +
> +static struct attribute *power_cstate_pmu_attrs[] = {
> + &dev_attr_cpumask.attr,
> + NULL,
> +};
> +
> +static struct attribute_group power_cstate_pmu_attr_group = {
> + .attrs = power_cstate_pmu_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-63");
> +static struct attribute *power_cstate_formats_attr[] = {
> + &format_attr_event.attr,
> + NULL,
> +};
> +
> +static struct attribute_group power_cstate_pmu_format_group = {
> + .name = "format",
> + .attrs = power_cstate_formats_attr,
> +};
> +
> +static struct attribute *nhm_power_core_events_attr[] = {
> + EVENT_PTR(power_core_c3_res),
> + EVENT_PTR(power_core_c6_res),
> + NULL,
> +};
> +
> +static struct attribute_group nhm_power_core_pmu_events_group = {
> + .name = "events",
> + .attrs = nhm_power_core_events_attr,
> +};
> +
> +const struct attribute_group *nhm_power_core_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &nhm_power_core_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type nhm_power_core = {
> + .name = "cstate_core",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = nhm_power_core_attr_groups,
> +};
> +
> +static struct attribute *nhm_power_pkg_events_attr[] = {
> + EVENT_PTR(power_pkg_c3_res),
> + EVENT_PTR(power_pkg_c6_res),
> + EVENT_PTR(power_pkg_c7_res),
> + NULL,
> +};
> +
> +static struct attribute_group nhm_power_pkg_pmu_events_group = {
> + .name = "events",
> + .attrs = nhm_power_pkg_events_attr,
> +};
> +
> +const struct attribute_group *nhm_power_pkg_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &nhm_power_pkg_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type nhm_power_pkg = {
> + .name = "cstate_pkg",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = nhm_power_pkg_attr_groups,
> +};
> +
> +static struct intel_power_cstate_type *nhm_power_cstate_types[] = {
> + &nhm_power_core,
> + &nhm_power_pkg,
> +};
> +
> +static struct attribute *slm_power_core_events_attr[] = {
> + EVENT_PTR(power_core_c1_res),
> + EVENT_PTR(power_core_c6_res),
> + NULL,
> +};
> +
> +static struct attribute_group slm_power_core_pmu_events_group = {
> + .name = "events",
> + .attrs = slm_power_core_events_attr,
> +};
> +
> +const struct attribute_group *slm_power_core_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &slm_power_core_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type slm_power_core = {
> + .name = "cstate_core",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = slm_power_core_attr_groups,
> +};
> +
> +static struct attribute *slm_power_pkg_events_attr[] = {
> + EVENT_PTR(power_slm_pkg_c6_res),
> + NULL,
> +};
> +
> +static struct attribute_group slm_power_pkg_pmu_events_group = {
> + .name = "events",
> + .attrs = slm_power_pkg_events_attr,
> +};
> +
> +const struct attribute_group *slm_power_pkg_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &slm_power_pkg_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type slm_power_pkg = {
> + .name = "cstate_pkg",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = slm_power_pkg_attr_groups,
> +};
> +
> +static struct intel_power_cstate_type *slm_power_cstate_types[] = {
> + &slm_power_core,
> + &slm_power_pkg,
> +};
> +
> +static struct attribute *snb_power_core_events_attr[] = {
> + EVENT_PTR(power_core_c3_res),
> + EVENT_PTR(power_core_c6_res),
> + EVENT_PTR(power_core_c7_res),
> + NULL,
> +};
> +
> +static struct attribute_group snb_power_core_pmu_events_group = {
> + .name = "events",
> + .attrs = snb_power_core_events_attr,
> +};
> +
> +const struct attribute_group *snb_power_core_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &snb_power_core_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type snb_power_core = {
> + .name = "cstate_core",
> + .type = perf_intel_power_cstate_core,
> + .pmu_group = snb_power_core_attr_groups,
> +};
> +
> +static struct attribute *snb_power_pkg_events_attr[] = {
> + EVENT_PTR(power_pkg_c2_res),
> + EVENT_PTR(power_pkg_c3_res),
> + EVENT_PTR(power_pkg_c6_res),
> + EVENT_PTR(power_pkg_c7_res),
> + NULL,
> +};
> +
> +static struct attribute_group snb_power_pkg_pmu_events_group = {
> + .name = "events",
> + .attrs = snb_power_pkg_events_attr,
> +};
> +
> +const struct attribute_group *snb_power_pkg_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &snb_power_pkg_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type snb_power_pkg = {
> + .name = "cstate_pkg",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = snb_power_pkg_attr_groups,
> +};
> +
> +static struct intel_power_cstate_type *snb_power_cstate_types[] = {
> + &snb_power_core,
> + &snb_power_pkg,
> + NULL,
> +};
> +
> +static struct attribute *hsw_ult_power_pkg_events_attr[] = {
> + EVENT_PTR(power_pkg_c2_res),
> + EVENT_PTR(power_pkg_c3_res),
> + EVENT_PTR(power_pkg_c6_res),
> + EVENT_PTR(power_pkg_c7_res),
> + EVENT_PTR(power_pkg_c8_res),
> + EVENT_PTR(power_pkg_c9_res),
> + EVENT_PTR(power_pkg_c10_res),
> + NULL,
> +};
> +
> +static struct attribute_group hsw_ult_power_pkg_pmu_events_group = {
> + .name = "events",
> + .attrs = hsw_ult_power_pkg_events_attr,
> +};
> +
> +const struct attribute_group *hsw_ult_power_pkg_attr_groups[] = {
> + &power_cstate_pmu_attr_group,
> + &power_cstate_pmu_format_group,
> + &hsw_ult_power_pkg_pmu_events_group,
> + NULL,
> +};
> +
> +static struct intel_power_cstate_type hsw_ult_power_pkg = {
> + .name = "cstate_pkg",
> + .type = perf_intel_power_cstate_pkg,
> + .pmu_group = hsw_ult_power_pkg_attr_groups,
> +};
> +
> +static struct intel_power_cstate_type *hsw_ult_power_cstate_types[] = {
> + &snb_power_core,
> + &hsw_ult_power_pkg,
> +};
> +
> +#define __POWER_CSTATE_CPU_EXIT(_type, _cpu_mask, fn) \
> +{ \
> + pmu = per_cpu(power_cstate_ ## _type, cpu); \
> + if (pmu) { \
> + id = fn(cpu); \
> + target = -1; \
> + for_each_online_cpu(i) { \
> + if (i == cpu) \
> + continue; \
> + if (id == fn(i)) { \
> + target = i; \
> + break; \
> + } \
> + } \
> + if (cpumask_test_and_clear_cpu(cpu, &power_cstate_ ## _cpu_mask) && target >= 0) \
> + cpumask_set_cpu(target, &power_cstate_ ## _cpu_mask); \
> + WARN_ON(cpumask_empty(&power_cstate_ ## _cpu_mask)); \
> + if (target >= 0) \
> + perf_pmu_migrate_context(pmu->pmu, cpu, target); \
> + } \
> +}
> +
> +static void power_cstate_cpu_exit(int cpu)
> +{
> + struct power_cstate_pmu *pmu;
> + int i, id, target;
> +
> + __POWER_CSTATE_CPU_EXIT(core_pmu, core_cpu_mask, topology_core_id);
> + __POWER_CSTATE_CPU_EXIT(pkg_pmu, pkg_cpu_mask, topology_physical_package_id);
> +}
> +
> +#define __POWER_CSTATE_CPU_INIT(_type, _cpu_mask, fn) \
> +{ \
> + pmu = per_cpu(power_cstate_ ## _type, cpu); \
> + if (pmu) { \
> + id = fn(cpu); \
> + for_each_cpu(i, &power_cstate_ ## _cpu_mask) { \
> + if (id == fn(i)) \
> + break; \
> + } \
> + if (i >= nr_cpu_ids) \
> + cpumask_set_cpu(cpu, &power_cstate_ ## _cpu_mask); \
> + } \
> +}
> +
> +static void power_cstate_cpu_init(int cpu)
> +{
> + int i, id;
> + struct power_cstate_pmu *pmu;
> +
> + __POWER_CSTATE_CPU_INIT(core_pmu, core_cpu_mask, topology_core_id);
> + __POWER_CSTATE_CPU_INIT(pkg_pmu, pkg_cpu_mask, topology_physical_package_id);
> +}
> +
> +#define __POWER_CSTATE_CPU_PREPARE(power_cstate_pmu, type) \
> +{ \
> + pmu = per_cpu(power_cstate_pmu, cpu); \
> + if (pmu) \
> + break; \
> + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); \
> + pmu->pmu = &type->pmu; \
> + per_cpu(power_cstate_pmu, cpu) = pmu; \
> +}
> +
> +static int power_cstate_cpu_prepare(int cpu)
> +{
> + struct power_cstate_pmu *pmu;
> + struct intel_power_cstate_type *type;
> + int i;
> +
> + for (i = 0; power_cstate[i]; i++) {
> + type = power_cstate[i];
> +
> + switch (type->type) {
> + case perf_intel_power_cstate_core:
> + __POWER_CSTATE_CPU_PREPARE(power_cstate_core_pmu, type);
> + break;
> + case perf_intel_power_cstate_pkg:
> + __POWER_CSTATE_CPU_PREPARE(power_cstate_pkg_pmu, type);
> + break;
> + }
> + }
> +
> + return 0;
> +}
> +
> +#define __POWER_CSTATE_CPU_KFREE(pmu_to_free) \
> +{ \
> + if (per_cpu(pmu_to_free, cpu)) { \
> + kfree(per_cpu(pmu_to_free, cpu)); \
> + per_cpu(pmu_to_free, cpu) = NULL; \
> + } \
> +}
> +
> +static void power_cstate_cpu_kfree(int cpu)
> +{
> + __POWER_CSTATE_CPU_KFREE(power_cstate_core_pmu_to_free);
> + __POWER_CSTATE_CPU_KFREE(power_cstate_pkg_pmu_to_free);
> +}
> +
> +#define __POWER_CSTATE_CPU_DYING(pmu, pmu_to_free) \
> +{ \
> + if (per_cpu(pmu, cpu)) { \
> + per_cpu(pmu_to_free, cpu) = per_cpu(pmu, cpu); \
> + per_cpu(pmu, cpu) = NULL; \
> + } \
> +}
> +
> +static int power_cstate_cpu_dying(int cpu)
> +{
> + __POWER_CSTATE_CPU_DYING(power_cstate_core_pmu, power_cstate_core_pmu_to_free);
> + __POWER_CSTATE_CPU_DYING(power_cstate_pkg_pmu, power_cstate_pkg_pmu_to_free);
> +
> + return 0;
> +}
> +static int power_cstate_cpu_notifier(struct notifier_block *self,
> + unsigned long action, void *hcpu)
> +{
> + unsigned int cpu = (long)hcpu;
> +
> + switch (action & ~CPU_TASKS_FROZEN) {
> + case CPU_UP_PREPARE:
> + power_cstate_cpu_prepare(cpu);
> + break;
> + case CPU_STARTING:
> + power_cstate_cpu_init(cpu);
> + break;
> + case CPU_UP_CANCELED:
> + case CPU_DYING:
> + power_cstate_cpu_dying(cpu);
> + break;
> + case CPU_ONLINE:
> + case CPU_DEAD:
> + power_cstate_cpu_kfree(cpu);
> + break;
> + case CPU_DOWN_PREPARE:
> + power_cstate_cpu_exit(cpu);
> + break;
> + default:
> + break;
> + }
> +
> + return NOTIFY_OK;
> +}
> +
> +#define POWER_CSTATE_CPU(_model, _ops) { \
> + .vendor = X86_VENDOR_INTEL, \
> + .family = 6, \
> + .model = _model, \
> + .driver_data = (kernel_ulong_t)&_ops, \
> + }
> +
> +static const struct x86_cpu_id power_cstate_ids[] __initconst = {
> + POWER_CSTATE_CPU(0x37, slm_power_cstate_types),/* Silvermont */
> + POWER_CSTATE_CPU(0x4d, slm_power_cstate_types),/* Silvermont Avoton/Rangely */
> + POWER_CSTATE_CPU(0x4c, slm_power_cstate_types),/* Airmont */
> + POWER_CSTATE_CPU(0x1e, nhm_power_cstate_types),/* Nehalem */
> + POWER_CSTATE_CPU(0x1a, nhm_power_cstate_types),/* Nehalem-EP */
> + POWER_CSTATE_CPU(0x2e, nhm_power_cstate_types),/* Nehalem-EX */
> + POWER_CSTATE_CPU(0x25, nhm_power_cstate_types),/* Westmere */
> + POWER_CSTATE_CPU(0x2c, nhm_power_cstate_types),/* Westmere-EP */
> + POWER_CSTATE_CPU(0x2f, nhm_power_cstate_types),/* Westmere-EX */
> + POWER_CSTATE_CPU(0x2a, snb_power_cstate_types),/* SandyBridge */
> + POWER_CSTATE_CPU(0x2d, snb_power_cstate_types),/* SandyBridge-E/EN/EP */
> + POWER_CSTATE_CPU(0x3a, snb_power_cstate_types),/* IvyBridge */
> + POWER_CSTATE_CPU(0x3e, snb_power_cstate_types),/* IvyBridge-EP/EX */
> + POWER_CSTATE_CPU(0x3c, snb_power_cstate_types),/* Haswell Core */
> + POWER_CSTATE_CPU(0x3f, snb_power_cstate_types),/* Haswell Server */
> + POWER_CSTATE_CPU(0x46, snb_power_cstate_types),/* Haswell + GT3e */
> + POWER_CSTATE_CPU(0x45, hsw_ult_power_cstate_types),/* Haswell ULT */
> + POWER_CSTATE_CPU(0x3d, snb_power_cstate_types),/* Broadwell Core-M */
> + POWER_CSTATE_CPU(0x56, snb_power_cstate_types),/* Broadwell Xeon D */
> + POWER_CSTATE_CPU(0x47, snb_power_cstate_types),/* Broadwell + GT3e */
> + POWER_CSTATE_CPU(0x4f, snb_power_cstate_types),/* Broadwell Server */
> + POWER_CSTATE_CPU(0x4e, snb_power_cstate_types),/* Skylake Mobile */
> + POWER_CSTATE_CPU(0x5e, snb_power_cstate_types),/* Skylake Desktop */
> + {}
> +};
> +
> +static int __init power_cstate_init(void)
> +{
> + const struct x86_cpu_id *id;
> +
> + id = x86_match_cpu(power_cstate_ids);
> + if (!id)
> + return -ENODEV;
> +
> + power_cstate = (struct intel_power_cstate_type **)id->driver_data;
> +
> + return 0;
> +}
> +
> +static void __init power_cstate_cpumask_init(void)
> +{
> + int cpu, err;
> +
> + cpu_notifier_register_begin();
> +
> + for_each_online_cpu(cpu) {
> + err = power_cstate_cpu_prepare(cpu);
> + if (err) {
> + pr_info(" CPU prepare failed\n");
> + cpu_notifier_register_done();
> + return;
> + }
> + power_cstate_cpu_init(cpu);
> + }
> +
> + __perf_cpu_notifier(power_cstate_cpu_notifier);
> +
> + cpu_notifier_register_done();
> +}
> +
> +static void __init power_cstate_pmus_register(void)
> +{
> + struct intel_power_cstate_type *type;
> + int i, err;
> +
> + for (i = 0; power_cstate[i]; i++) {
> + type = power_cstate[i];
> +
> + type->pmu = (struct pmu) {
> + .attr_groups = type->pmu_group,
> + .task_ctx_nr = perf_invalid_context,
> + .event_init = power_cstate_pmu_event_init,
> + .add = power_cstate_pmu_event_add, /* must have */
> + .del = power_cstate_pmu_event_del, /* must have */
> + .start = power_cstate_pmu_event_start,
> + .stop = power_cstate_pmu_event_stop,
> + .read = power_cstate_pmu_event_update,
> + .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
> + };
> +
> + err = perf_pmu_register(&type->pmu, type->name, -1);
> + if (WARN_ON(err))
> + pr_info("Failed to register PMU %s error %d\n",
> + type->pmu.name, err);
> + }
> +}
> +
> +static int __init power_cstate_pmu_init(void)
> +{
> + int err;
> +
> + if (cpu_has_hypervisor)
> + return -ENODEV;
> +
> + err = power_cstate_init();
> + if (err)
> + return err;
> +
> + power_cstate_cpumask_init();
> +
> + power_cstate_pmus_register();
> +
> + return 0;
> +}
> +device_initcall(power_cstate_pmu_init);
> --
> 1.8.3.1
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/