[PATCH 23/32] perf/core: introduce PERF_INACTIVE_*_READ_* flags

From: David Carrillo-Cisneros
Date: Fri Apr 29 2016 - 00:49:08 EST


Some offcore and uncore events, such as the new intel_cqm/llc_occupancy,
can be read even when the event is not active on its CPU (or on any CPU).
In those cases, a freshly read value is more recent (and therefore
preferable) than the last value stored at event sched-out.

This patch covers two cases that allow Intel's CQM (and potentially
other per-package events) to obtain updated values regardless of whether
the event is scheduled on a particular CPU. Each case is covered by a
new event::pmu_event_flags flag:
1) PERF_INACTIVE_CPU_READ_PKG: an event attached to a CPU that can
be read from any CPU in event::cpu's package, even if inactive (see
the sketch below).
2) PERF_INACTIVE_EV_READ_ANY_CPU: an event that can be read from any
CPU in any package in the system, even if inactive.
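
As an illustration, a PMU would opt in by setting the flag from its
event_init() callback. This is only a hypothetical sketch; the
example_pmu names are made up and not part of this patch:

	static int example_pmu_event_init(struct perf_event *event)
	{
		if (event->attr.type != example_pmu.type)
			return -ENOENT;
		/*
		 * Hypothetical per-package counter: any CPU in
		 * event::cpu's package can read it, even while
		 * the event is inactive.
		 */
		if (event->cpu != -1)
			event->pmu_event_flags |= PERF_INACTIVE_CPU_READ_PKG;
		return 0;
	}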

A consequence of reading a fresh value from hardware on each call to
perf_event_read() is that reading and saving the event value at sched
out can be avoided, since the saved value would never be used.
Therefore, a PMU that sets any of the PERF_INACTIVE_*_READ_* flags can
choose not to read in the context switch path, at the cost of
inherit_stat not working properly.
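
For example, the sched-out path of such a PMU could look like the
following hypothetical sketch (again, the example_pmu names are made
up):

	static void example_pmu_del(struct perf_event *event, int flags)
	{
		/*
		 * Stop the counter without a final pmu->read():
		 * perf_event_read() obtains a fresh value from
		 * hardware even while the event is inactive, so a
		 * count saved at sched out would never be used.
		 * Skipping the read is what breaks inherit_stat,
		 * which relies on counts updated at sched out.
		 */
		example_pmu_stop(event, 0);
	}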

Reviewed-by: Stephane Eranian <eranian@xxxxxxxxxx>
Signed-off-by: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
---
include/linux/perf_event.h | 15 ++++++++++++
kernel/events/core.c | 59 +++++++++++++++++++++++++++++++++++-----------
2 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e4c58b0..054d7f4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -607,6 +607,21 @@ struct perf_event {
/* Do not enable cgroup events in descendant cgroups. */
#define PERF_CGROUP_NO_RECURSION (1 << 0)

+/* A CPU event can be read from any CPU in event::cpu's package, even if
+ * it is not in PERF_EVENT_STATE_ACTIVE; event::cpu must be a valid CPU.
+ */
+#define PERF_INACTIVE_CPU_READ_PKG (1 << 1)
+
+/* Event can read from any package even if not in PERF_EVENT_STATE_ACTIVE. */
+#define PERF_INACTIVE_EV_READ_ANY_CPU (1 << 2)
+
+static inline bool __perf_can_read_inactive(struct perf_event *event)
+{
+ return (event->pmu_event_flags & PERF_INACTIVE_EV_READ_ANY_CPU) ||
+ ((event->pmu_event_flags & PERF_INACTIVE_CPU_READ_PKG) &&
+ (event->cpu != -1));
+}
+
/**
* struct perf_event_context - event context structure
*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 33961ec..28d1b51 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3266,15 +3266,28 @@ static void __perf_event_read(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct pmu *pmu = event->pmu;
+ bool read_inactive = __perf_can_read_inactive(event);
+
+ WARN_ON_ONCE(event->cpu == -1 &&
+ (event->pmu_event_flags & PERF_INACTIVE_CPU_READ_PKG));
+
+ /* If inactive, we must be reading from the correct package. */
+ WARN_ON_ONCE(
+ event->state != PERF_EVENT_STATE_ACTIVE &&
+ (event->pmu_event_flags & PERF_INACTIVE_CPU_READ_PKG) &&
+ (topology_physical_package_id(event->cpu) !=
+ topology_physical_package_id(smp_processor_id())));

/*
* If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
+ * the current task context of this cpu or if the event
+ * can be read while inactive. If the event cannot be read
+ * while inactive and this is not the current cpu, then it has been
* scheduled out before the smp call arrived. In that case
* event->count would have been updated to a recent sample
* when the event was scheduled out.
*/
- if (ctx->task && cpuctx->task_ctx != ctx)
+ if (ctx->task && cpuctx->task_ctx != ctx && !read_inactive)
return;

raw_spin_lock(&ctx->lock);
@@ -3284,9 +3297,11 @@ static void __perf_event_read(void *info)
}

update_event_times(event);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE && !read_inactive)
goto unlock;

+
if (!data->group) {
pmu->read(event);
data->ret = 0;
@@ -3299,7 +3314,8 @@ static void __perf_event_read(void *info)

list_for_each_entry(sub, &event->sibling_list, group_entry) {
update_event_times(sub);
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE ||
+ __perf_can_read_inactive(sub)) {
/*
* Use sibling's PMU rather than @event's since
* sibling could be on different (eg: software) PMU.
@@ -3368,19 +3384,34 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
int ret = 0;
+ bool active = event->state == PERF_EVENT_STATE_ACTIVE;

/*
- * If event is enabled and currently active on a CPU, update the
- * value in the event structure:
+ * Read inactive event if PMU allows it. Otherwise, if event is
+ * enabled and currently active on a CPU, update the value in the
+ * event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+
+ if (active || __perf_can_read_inactive(event)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
+ int cpu_to_read = event->oncpu;
+
+ if (!active) {
+ cpu_to_read =
+ /* If __perf_can_read_inactive is true, the
+ * event either is a CPU/cgroup event or can
+ * be read from any CPU.
+ */
+ (event->pmu_event_flags &
+ PERF_INACTIVE_EV_READ_ANY_CPU) ?
+ smp_processor_id() : event->cpu;
+ }
+ smp_call_function_single(
+ cpu_to_read, __perf_event_read, &data, 1);
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -8199,11 +8230,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->mmap_mutex);

atomic_long_set(&event->refcount, 1);
- event->cpu = cpu;
- event->attr = *attr;
- event->group_leader = group_leader;
- event->pmu = NULL;
- event->oncpu = -1;
+ event->cpu = cpu;
+ event->attr = *attr;
+ event->group_leader = group_leader;
+ event->pmu = NULL;
+ event->oncpu = -1;

event->parent = parent_event;

--
2.8.0.rc3.226.g39d4020