[PATCH 12/14] perf/core,x86/cqm: Add read for cgroup events, per pkg reads.

From: Vikas Shivappa
Date: Fri Dec 16 2016 - 18:15:15 EST


For cqm cgroup events, the events can be read even if the event was not
active on the cpu on which the event is being read. This is because the
RMIDs are per package and hence if we read the llc_occupancy value on a
cpu x, we are really reading the occupancy for the package where cpu x
belongs.

This patch adds a PERF_EV_CAP_INACTIVE_CPU_READ_PKG capability to indicate
this behaviour of cqm and also changes perf/core to still call the reads
even when the event is inactive on the cpu for cgroup events. Task events
have event->cpu set to -1, and hence this does not apply to task events.

Tests: before this patch, perf stat -C <cpux> would not return a count to
perf/core. After this patch the count of the package is returned to
perf/core. We still don't see the count in the perf user mode - that
is fixed in the next patches.

Patch is based on David Carrillo-Cisneros <davidcc@xxxxxxxxxx> patches
in cqm2 series.

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/events/intel/cqm.c | 1 +
include/linux/perf_event.h | 19 ++++++++++++++++---
kernel/events/core.c | 16 ++++++++++++----
3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index e0d4017..04723cc 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -1130,6 +1130,7 @@ static int intel_cqm_event_init(struct perf_event *event)
* cgroup hierarchies.
*/
event->event_caps |= PERF_EV_CAP_CGROUP_NO_RECURSION;
+ event->event_caps |= PERF_EV_CAP_INACTIVE_CPU_READ_PKG;

mutex_lock(&cache_mutex);

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index abeacb5..e55d709 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -525,10 +525,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
* PERF_EV_CAP_CGROUP_NO_RECURSION: A cgroup event that handles its own
* cgroup scoping. It does not need to be enabled for all of its descendants
* cgroups.
+ * PERF_EV_CAP_INACTIVE_CPU_READ_PKG: A cgroup event whose package count can
+ * be read on any cpu of the package, even if the event is inactive there.
*/
-#define PERF_EV_CAP_SOFTWARE BIT(0)
-#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
-#define PERF_EV_CAP_CGROUP_NO_RECURSION BIT(2)
+#define PERF_EV_CAP_SOFTWARE BIT(0)
+#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
+#define PERF_EV_CAP_CGROUP_NO_RECURSION BIT(2)
+#define PERF_EV_CAP_INACTIVE_CPU_READ_PKG BIT(3)

#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
@@ -722,6 +725,16 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};

+#ifdef CONFIG_PERF_EVENTS
+static inline bool __perf_can_read_inactive(struct perf_event *event)
+{
+ if ((event->group_caps & PERF_EV_CAP_INACTIVE_CPU_READ_PKG))
+ return true;
+
+ return false;
+}
+#endif
+
/**
* struct perf_event_context - event context structure
*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a290c53..9c070b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3435,9 +3435,13 @@ struct perf_read_data {

static int find_cpu_to_read(struct perf_event *event, int local_cpu)
{
+ bool active = event->state == PERF_EVENT_STATE_ACTIVE;
int event_cpu = event->oncpu;
u16 local_pkg, event_pkg;

+ if (__perf_can_read_inactive(event) && !active)
+ event_cpu = event->cpu;
+
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
@@ -3459,6 +3463,7 @@ static void __perf_event_read(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct pmu *pmu = event->pmu;
+ bool read_inactive = __perf_can_read_inactive(event);

/*
* If this is a task context, we need to check whether it is
@@ -3467,7 +3472,7 @@ static void __perf_event_read(void *info)
* event->count would have been updated to a recent sample
* when the event was scheduled out.
*/
- if (ctx->task && cpuctx->task_ctx != ctx)
+ if (ctx->task && cpuctx->task_ctx != ctx && !read_inactive)
return;

raw_spin_lock(&ctx->lock);
@@ -3477,7 +3482,7 @@ static void __perf_event_read(void *info)
}

update_event_times(event);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+ if (ctx->task && cpuctx->task_ctx != ctx && !read_inactive)
goto unlock;

if (!data->group) {
@@ -3492,7 +3497,8 @@ static void __perf_event_read(void *info)

list_for_each_entry(sub, &event->sibling_list, group_entry) {
update_event_times(sub);
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE ||
+ __perf_can_read_inactive(sub)) {
/*
* Use sibling's PMU rather than @event's since
* sibling could be on different (eg: software) PMU.
@@ -3570,13 +3576,15 @@ u64 perf_event_read_local(struct perf_event *event)

static int perf_event_read(struct perf_event *event, bool group)
{
+ bool active = event->state == PERF_EVENT_STATE_ACTIVE;
int ret = 0, cpu_to_read, local_cpu;

/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (active || __perf_can_read_inactive(event)) {
+
struct perf_read_data data = {
.event = event,
.group = group,
--
1.9.1