[PATCH v7] perf: Sharing PMU counters across compatible events

From: Song Liu
Date: Thu Jun 07 2018 - 02:24:10 EST


This patch tries to enable PMU sharing. When multiple perf_events are
counting the same metric, they can share the hardware PMU counter. We
call these events "compatible events".

PMU sharing is limited to events within the same perf_event_context
(ctx). When an event is installed or enabled, the ctx is searched for
compatible events. This is implemented in perf_event_setup_dup(). One of
these compatible events is picked as the master (stored in
event->dup_master). Similarly, when the event is removed or disabled,
perf_event_remove_dup() is used to clean up sharing.

A new state PERF_EVENT_STATE_ENABLED is introduced for the master event.
This state is used when the slave event is ACTIVE, but the master event
is not.

On the critical paths (add, del, read), sharing PMU counters doesn't
increase the complexity. Helper functions event_pmu_[add|del|read]() are
introduced to cover these cases. All these functions have O(1) time
complexity.

Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Signed-off-by: Song Liu <songliubraving@xxxxxx>

---
Changes in v7:
Major rewrite to avoid allocating extra master event.
---
include/linux/perf_event.h | 14 +-
kernel/events/core.c | 319 ++++++++++++++++++++++++++++++++++---
2 files changed, 309 insertions(+), 24 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 68ccc5b1913b..bb05b178841d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -522,7 +522,9 @@ enum perf_event_state {
PERF_EVENT_STATE_ERROR =3D -2,
PERF_EVENT_STATE_OFF =3D -1,
PERF_EVENT_STATE_INACTIVE =3D 0,
- PERF_EVENT_STATE_ACTIVE =3D 1,
+ /* the hw PMC is enabled, but this event is not counting */
+ PERF_EVENT_STATE_ENABLED =3D 1,
+ PERF_EVENT_STATE_ACTIVE =3D 2,
};
=20
struct file;
@@ -722,6 +724,16 @@ struct perf_event {
#endif
=20
struct list_head sb_list;
+
+ /* for PMU sharing */
+ struct perf_event *dup_master;
+ /* check event_sync_dup_count() for the use of dup_base_* */
+ u64 dup_base_count;
+ u64 dup_base_child_count;
+ /* when this event is master, read from master_count and master_child_count */
+ local64_t master_count;
+ atomic64_t master_child_count;
+ int dup_active_count;
#endif /* CONFIG_PERF_EVENTS */
};
=20
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aec8dba2bea4..00b1e19e70fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1657,6 +1657,139 @@ perf_event_groups_next(struct perf_event *event)
event =3D rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
=20
+static inline bool perf_event_can_share(struct perf_event *event)
+{
+ /* only share hardware counting events */
+ return !is_software_event(event) && !is_sampling_event(event);
+}
+
+/*
+ * Returns whether the two events can share a PMU counter.
+ *
+ * Note: This function does NOT check perf_event_can_share() for
+ * the two events, they should be checked before this function
+ */
+static inline bool perf_event_compatible(struct perf_event *event_a,
+ struct perf_event *event_b)
+{
+ return event_a->attr.type =3D=3D event_b->attr.type &&
+ event_a->attr.config =3D=3D event_b->attr.config &&
+ event_a->attr.config1 =3D=3D event_b->attr.config1 &&
+ event_a->attr.config2 =3D=3D event_b->attr.config2;
+}
+
+/* prepare the dup_master, this event is its own dup_master */
+static void perf_event_init_dup_master(struct perf_event *event)
+{
+ event->dup_master =3D event;
+ /*
+ * dup_master->count is used by the hw PMC, and shared with other
+ * events, so we have to read from dup_master->master_count. Copy
+ * event->count to event->master_count.
+ *
+ * Same logic for child_count and master_child_count.
+ */
+ local64_set(&event->master_count, local64_read(&event->count));
+ atomic64_set(&event->master_child_count,
+ atomic64_read(&event->child_count));
+
+ event->dup_active_count =3D 0;
+}
+
+/* tear down dup_master, no more sharing for this event */
+static void perf_event_exit_dup_master(struct perf_event *event)
+{
+ WARN_ON_ONCE(event->dup_active_count);
+
+ event->dup_master =3D NULL;
+ /* restore event->count and event->child_count */
+ local64_set(&event->count, local64_read(&event->master_count));
+ atomic64_set(&event->child_count,
+ atomic64_read(&event->master_child_count));
+}
+
+/* After adding an event to the ctx, try to find compatible event(s). */
+static void perf_event_setup_dup(struct perf_event *event,
+ struct perf_event_context *ctx)
+
+{
+ struct perf_event *tmp;
+
+ if (event->dup_master ||
+ event->state !=3D PERF_EVENT_STATE_INACTIVE ||
+ !perf_event_can_share(event))
+ return;
+
+ /* look for dup with other events */
+ list_for_each_entry(tmp, &ctx->event_list, event_entry) {
+ WARN_ON_ONCE(tmp->state > PERF_EVENT_STATE_INACTIVE);
+
+ if (tmp =3D=3D event ||
+ tmp->state !=3D PERF_EVENT_STATE_INACTIVE ||
+ !perf_event_can_share(tmp) ||
+ !perf_event_compatible(event, tmp))
+ continue;
+
+ /* first dup, pick tmp as the master */
+ if (!tmp->dup_master)
+ perf_event_init_dup_master(tmp);
+
+ event->dup_master =3D tmp->dup_master;
+ break;
+ }
+}
+
+/* Remove dup_master for the event */
+static void perf_event_remove_dup(struct perf_event *event,
+ struct perf_event_context *ctx)
+
+{
+ struct perf_event *tmp, *new_master;
+ int count;
+
+ /* no sharing */
+ if (!event->dup_master)
+ return;
+
+ WARN_ON_ONCE(event->state !=3D PERF_EVENT_STATE_INACTIVE &&
+ event->state !=3D PERF_EVENT_STATE_OFF);
+
+ /* this event is not the master */
+ if (event->dup_master !=3D event) {
+ event->dup_master =3D NULL;
+ return;
+ }
+
+ /* this event is the master */
+ perf_event_exit_dup_master(event);
+ count =3D 0;
+ new_master =3D NULL;
+ list_for_each_entry(tmp, &ctx->event_list, event_entry) {
+ WARN_ON_ONCE(tmp->state > PERF_EVENT_STATE_INACTIVE);
+ if (tmp->dup_master =3D=3D event) {
+ count++;
+ if (!new_master)
+ new_master =3D tmp;
+ }
+ }
+
+ if (!count)
+ return;
+
+ if (count =3D=3D 1) {
+ /* no more sharing */
+ new_master->dup_master =3D NULL;
+ return;
+ }
+
+ perf_event_init_dup_master(new_master);
+
+ /* switch to new_master */
+ list_for_each_entry(tmp, &ctx->event_list, event_entry)
+ if (tmp->dup_master =3D=3D event)
+ tmp->dup_master =3D new_master;
+}
+
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -1689,6 +1822,7 @@ list_add_event(struct perf_event *event, struct perf_=
event_context *ctx)
ctx->nr_stat++;
=20
ctx->generation++;
+ perf_event_setup_dup(event, ctx);
}
=20
/*
@@ -1861,6 +1995,7 @@ list_del_event(struct perf_event *event, struct perf_=
event_context *ctx)
if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
=20
+ perf_event_remove_dup(event, ctx);
event->attach_state &=3D ~PERF_ATTACH_CONTEXT;
=20
list_update_cgroup_event(event, ctx, false);
@@ -2069,6 +2204,98 @@ event_filter_match(struct perf_event *event)
perf_cgroup_match(event) && pmu_filter_match(event);
}
=20
+/* PMU sharing aware version of event->pmu->add() */
+static int event_pmu_add(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *master;
+ int ret;
+
+ /* no sharing, just do event->pmu->add() */
+ if (!event->dup_master)
+ return event->pmu->add(event, PERF_EF_START);
+
+ master =3D event->dup_master;
+
+ if (!master->dup_active_count) {
+ ret =3D event->pmu->add(master, PERF_EF_START);
+ if (ret)
+ return ret;
+
+ if (master !=3D event)
+ perf_event_set_state(master, PERF_EVENT_STATE_ENABLED);
+ }
+
+ master->dup_active_count++;
+ master->pmu->read(master);
+ event->dup_base_count =3D local64_read(&master->count);
+ event->dup_base_child_count =3D atomic64_read(&master->child_count);
+ return 0;
+}
+
+/*
+ * sync the count from event->dup_master to event, called on event_pmu_read()
+ * and event_pmu_del()
+ */
+static void event_sync_dup_count(struct perf_event *event,
+ struct perf_event *master)
+{
+ u64 new_count;
+ u64 new_child_count;
+
+ WARN_ON_ONCE(event->state !=3D PERF_EVENT_STATE_ACTIVE);
+
+ event->pmu->read(master);
+ new_count =3D local64_read(&master->count);
+ new_child_count =3D atomic64_read(&master->child_count);
+
+ if (event =3D=3D master) {
+ local64_add(new_count - event->dup_base_count,
+ &event->master_count);
+ atomic64_add(new_child_count - event->dup_base_child_count,
+ &event->master_child_count);
+ } else {
+ local64_add(new_count - event->dup_base_count, &event->count);
+ atomic64_add(new_child_count - event->dup_base_child_count,
+ &event->child_count);
+ }
+
+ /* save dup_base_* for next sync */
+ event->dup_base_count =3D new_count;
+ event->dup_base_child_count =3D new_child_count;
+}
+
+/* PMU sharing aware version of event->pmu->del() */
+static void event_pmu_del(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *master;
+
+ if (event->dup_master =3D=3D NULL) {
+ event->pmu->del(event, 0);
+ return;
+ }
+
+ master =3D event->dup_master;
+ event_sync_dup_count(event, master);
+ if (--master->dup_active_count =3D=3D 0) {
+ event->pmu->del(master, 0);
+ perf_event_set_state(master, PERF_EVENT_STATE_INACTIVE);
+ } else if (master =3D=3D event) {
+ perf_event_set_state(master, PERF_EVENT_STATE_ENABLED);
+ }
+}
+
+/* PMU sharing aware version of event->pmu->read() */
+static void event_pmu_read(struct perf_event *event)
+{
+ if (event->dup_master =3D=3D NULL) {
+ event->pmu->read(event);
+ return;
+ }
+ event_sync_dup_count(event, event->dup_master);
+}
+
static void
event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
@@ -2091,7 +2318,7 @@ event_sched_out(struct perf_event *event,
=20
perf_pmu_disable(event->pmu);
=20
- event->pmu->del(event, 0);
+ event_pmu_del(event, ctx);
event->oncpu =3D -1;
=20
if (READ_ONCE(event->pending_disable) >=3D 0) {
@@ -2140,6 +2367,14 @@ group_sched_out(struct perf_event *group_event,
=20
#define DETACH_GROUP 0x01UL
=20
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *task_ctx,
+ enum event_type_t event_type);
+
/*
* Cross CPU call to remove a performance event
*
@@ -2153,13 +2388,17 @@ __perf_remove_from_context(struct perf_event *event=
,
void *info)
{
unsigned long flags =3D (unsigned long)info;
+ bool resched =3D (event->dup_master =3D=3D event);
=20
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
}
=20
- event_sched_out(event, cpuctx, ctx);
+ if (resched)
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ else
+ event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
@@ -2171,6 +2410,9 @@ __perf_remove_from_context(struct perf_event *event,
cpuctx->task_ctx =3D NULL;
}
}
+ if (resched)
+ ctx_resched(cpuctx, cpuctx->task_ctx,
+ EVENT_ALL | (ctx->task ? 0 : EVENT_CPU));
}
=20
/*
@@ -2226,6 +2468,16 @@ static void __perf_event_disable(struct perf_event *=
event,
update_cgrp_time_from_event(event);
}
=20
+ if (event->dup_master =3D=3D event) {
+ /* disabling master, resched all */
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ perf_event_remove_dup(event, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ ctx_resched(cpuctx, cpuctx->task_ctx,
+ EVENT_ALL | (ctx->task ? 0 : EVENT_CPU));
+ return;
+ }
+
if (event =3D=3D event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
@@ -2364,7 +2616,7 @@ event_sched_in(struct perf_event *event,
=20
perf_log_itrace_start(event);
=20
- if (event->pmu->add(event, PERF_EF_START)) {
+ if (event_pmu_add(event, ctx)) {
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu =3D -1;
ret =3D -EAGAIN;
@@ -2478,9 +2730,6 @@ static void add_event_to_ctx(struct perf_event *event=
,
perf_group_attach(event);
}
=20
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
@@ -2625,9 +2874,13 @@ static int __perf_install_in_context(void *info)
#endif
=20
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ int event_type =3D perf_event_can_share(event) ? EVENT_ALL : 0;
+
+ /* if perf_event_can_share() resched EVENT_ALL */
+ ctx_sched_out(ctx, cpuctx, event_type);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx,
+ event_type | (ctx->task ? 0 : EVENT_CPU));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2745,21 +2998,26 @@ static void __perf_event_enable(struct perf_event *=
event,
{
struct perf_event *leader =3D event->group_leader;
struct perf_event_context *task_ctx;
+ int was_active;
+ int event_type;
=20
if (event->state >=3D PERF_EVENT_STATE_INACTIVE ||
event->state <=3D PERF_EVENT_STATE_ERROR)
return;
=20
+ event_type =3D perf_event_can_share(event) ? EVENT_ALL : EVENT_TIME;
+ was_active =3D ctx->is_active;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, event_type);
=20
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_event_setup_dup(event, ctx);
=20
- if (!ctx->is_active)
+ if (!was_active)
return;
=20
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, event_type, current);
return;
}
=20
@@ -2767,8 +3025,8 @@ static void __perf_event_enable(struct perf_event *ev=
ent,
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader !=3D event && leader->state !=3D PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ if (leader !=3D event && leader->state <=3D PERF_EVENT_STATE_INACTIVE) {
+ ctx_sched_in(ctx, cpuctx, event_type, current);
return;
}
=20
@@ -2776,7 +3034,8 @@ static void __perf_event_enable(struct perf_event *ev=
ent,
if (ctx->task)
WARN_ON_ONCE(task_ctx !=3D ctx);
=20
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ /* if perf_event_can_share() resched EVENT_ALL */
+ ctx_resched(cpuctx, task_ctx, get_event_type(event) | event_type);
}
=20
/*
@@ -3115,7 +3374,7 @@ static void __perf_event_sync_stat(struct perf_event =
*event,
* don't need to use it.
*/
if (event->state =3D=3D PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ event_pmu_read(event);
=20
perf_event_update_time(event);
=20
@@ -3979,14 +4238,14 @@ static void __perf_event_read(void *info)
goto unlock;
=20
if (!data->group) {
- pmu->read(event);
+ event_pmu_read(event);
data->ret =3D 0;
goto unlock;
}
=20
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
=20
- pmu->read(event);
+ event_pmu_read(event);
=20
for_each_sibling_event(sub, event) {
if (sub->state =3D=3D PERF_EVENT_STATE_ACTIVE) {
@@ -3994,7 +4253,7 @@ static void __perf_event_read(void *info)
* Use sibling's PMU rather than @event's since
* sibling could be on different (eg: software) PMU.
*/
- sub->pmu->read(sub);
+ event_pmu_read(sub);
}
}
=20
@@ -4006,6 +4265,9 @@ static void __perf_event_read(void *info)
=20
static inline u64 perf_event_count(struct perf_event *event)
{
+ if (event->dup_master =3D=3D event)
+ return local64_read(&event->master_count) +
+ atomic64_read(&event->master_child_count);
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
=20
@@ -4064,9 +4326,12 @@ int perf_event_read_local(struct perf_event *event, =
u64 *value,
* oncpu =3D=3D -1).
*/
if (event->oncpu =3D=3D smp_processor_id())
- event->pmu->read(event);
+ event_pmu_read(event);
=20
- *value =3D local64_read(&event->count);
+ if (event->dup_master =3D=3D event)
+ *value =3D local64_read(&event->master_count);
+ else
+ *value =3D local64_read(&event->count);
if (enabled || running) {
u64 now =3D event->shadow_ctx_time + perf_clock();
u64 __enabled, __running;
@@ -6288,7 +6553,7 @@ static void perf_output_read_group(struct perf_output=
_handle *handle,
=20
if ((leader !=3D event) &&
(leader->state =3D=3D PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ event_pmu_read(leader);
=20
values[n++] =3D perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
@@ -6301,7 +6566,7 @@ static void perf_output_read_group(struct perf_output=
_handle *handle,
=20
if ((sub !=3D event) &&
(sub->state =3D=3D PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ event_pmu_read(sub);
=20
values[n++] =3D perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -9566,7 +9831,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(stru=
ct hrtimer *hrtimer)
if (event->state !=3D PERF_EVENT_STATE_ACTIVE)
return HRTIMER_NORESTART;
=20
- event->pmu->read(event);
+ event_pmu_read(event);
=20
perf_sample_data_init(&data, 0, event->hw.last_period);
regs =3D get_irq_regs();
@@ -11202,9 +11467,17 @@ SYSCALL_DEFINE5(perf_event_open,
perf_remove_from_context(group_leader, 0);
put_ctx(gctx);
=20
+ /*
+ * move_group only happens to sw events, from sw ctx to hw
+ * ctx. The sw events should not have valid dup_master. So
+ * it is not necessary to handle dup_events.
+ */
+ WARN_ON_ONCE(group_leader->dup_master);
+
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
put_ctx(gctx);
+ WARN_ON_ONCE(sibling->dup_master);
}
=20
/*
--=20
2.17.1