[RFC 2/6] perf/core: add a rb-tree index to inactive_groups

From: David Carrillo-Cisneros
Date: Tue Jan 10 2017 - 05:26:16 EST


Add an rb-tree that indexes inactive events by {CPU/cgroup,flexible,stamp}.

The original idea by Peter Z. was to sort task events in an rb-tree using
{pmu,cpu,timestamp} as key.

Having the PMU as part of the key gets complicated for contexts that
share pmus (i.e. software context) because all events in a context should
rotate together irrespective of their pmu. It's also unclear to me that
there is any case where a search by pmu is useful.

Another complication is that using ctx->time (or timestamp) implies that
groups added during the same context switch may not have unique keys.
This increases the complexity of the code that finds all events in the
rb-tree that are within a time interval.

Lastly, it is useful to query pinned and flexible events separately since
they are scheduled in at different times.

For the reasons above, I created an rb-tree per context with key
{CPU,flexible,stamp} for task contexts and {cgroup,flexible,stamp} for
CPU contexts.

The "flexible" boolean allows to query pinned or flexible events
separately.
The stamp is given by a non-decreasing counter: ctx->nr_inactive_added.
It increases every time a new inactive event is inserted. That choice of
stamp guarantees unique keys for all events and that events of the same
type (same {CPU/cgroup,flexible}) keep their insertion order in the rb-tree.

When events are scheduled in or rotated, all events in the context must be
iterated or rotated together, irrespective of the CPU/cgroup. To do that,
we add ctx->inactive_groups, a list that "threads" the rb-tree in total
ctx->nr_inactive_added order. Note that this order is the same as timestamp
order and ctx->inactive_groups is used for both scheduling and iteration.
The rb-tree can be seen as an index over ctx->inactive_groups.

Signed-off-by: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
---
include/linux/perf_event.h | 5 +++
kernel/events/core.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 87 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3fa18f05c9b0..fd32ecc37d33 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -564,6 +564,9 @@ struct perf_event {
struct list_head group_entry;
struct list_head sibling_list;

+ u64 rbtree_key;
+ struct rb_node rbtree_node;
+
/*
* We need storage to track the entries in perf_pmu_migrate_context; we
* cannot use the event_entry because of RCU and we want to keep the
@@ -736,6 +739,8 @@ struct perf_event_context {
struct list_head pinned_groups;
struct list_head flexible_groups;

+ struct rb_root rbtree_root;
+ u32 nr_inactive_added;
struct list_head active_pinned_groups;
struct list_head active_flexible_groups;
struct list_head inactive_groups;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b744b5a8dbd0..623d81c0ca93 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1462,19 +1462,98 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
return &ctx->flexible_groups;
}

+/*
+ * The bits of perf_event::rbtree_key represent:
+ * - 63:33 a unique identifier for the CPU (if a task context) or the cgroup
+ * (if a CPU context).
+ * - 32 a boolean to indicate if the event is flexible (vs pinned).
+ * - 31:0 a unique "stamp" taken from ctx->nr_inactive_added when the
+ * event is added to the rb-tree.
+ * The 64-bit value groups events of the same type (CPU/cgroup + flexible)
+ * together in the rb-tree.
+ */
+#define RBTREE_KEY_STAMP_WIDTH 32
+#define RBTREE_KEY_STAMP_MASK GENMASK_ULL(RBTREE_KEY_STAMP_WIDTH - 1, 0)
+#define RBTREE_KEY_FLEXIBLE_MASK BIT_ULL(RBTREE_KEY_STAMP_WIDTH)
+
+static u64 taskctx_rbtree_key(int cpu, bool flexible)
+{
+ /*
+ * Key on the CPU only. The PMU is never used in schedule in/out and,
+ * since some contexts share a PMU, iterating by PMU would complicate
+ * things. I could not find a case where an ordered iteration over all
+ * events of one PMU in a context is useful.
+ */
+ return ((u64)cpu << (RBTREE_KEY_STAMP_WIDTH + 1)) |
+ (flexible ? RBTREE_KEY_FLEXIBLE_MASK : 0);
+}
+
+static u64 cpuctx_rbtree_key(struct perf_cgroup *cgrp, bool flexible)
+{
+ u64 k;
+
+ if (cgrp)
+ /* css.id is a cheap way to obtain an identifier for a cgroup. Suggestions appreciated. */
+ k = (u64)cgrp->css.id << (RBTREE_KEY_STAMP_WIDTH + 1);
+ else /* no cgroup: set all identifier bits */
+ k = GENMASK_ULL(63, RBTREE_KEY_STAMP_WIDTH + 1);
+ return k | (flexible ? RBTREE_KEY_FLEXIBLE_MASK : 0);
+}
+
+static void
+rbtree_add_inactive(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct rb_node **pos = &(ctx->rbtree_root.rb_node), *parent = NULL;
+ struct perf_event *pos_event;
+
+ event->rbtree_key &= ~RBTREE_KEY_STAMP_MASK; /* drop the old stamp, keep the type bits */
+ /*
+ * A unique key simplifies finding intervals of events. ctx time could
+ * serve as timestamp, but it may not be unique. So use nr_inactive_added,
+ * a counter that is unique (until the u32 wraps) and that has the same
+ * order as ctx->inactive_groups.
+ */
+ event->rbtree_key |= ctx->nr_inactive_added;
+ while (*pos) {
+ pos_event = rb_entry(*pos, struct perf_event, rbtree_node);
+ parent = *pos;
+ if (event->rbtree_key < pos_event->rbtree_key)
+ pos = &((*pos)->rb_left);
+ else /* There cannot be repeated keys. */
+ pos = &((*pos)->rb_right);
+ }
+ /* Add new node and rebalance tree. */
+ rb_link_node(&event->rbtree_node, parent, pos);
+ rb_insert_color(&event->rbtree_node, &ctx->rbtree_root);
+}
+
static void
ctx_sched_groups_to_inactive(struct perf_event *event,
struct perf_event_context *ctx)
{
WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
list_move_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+ rbtree_add_inactive(event, ctx); /* index the event under the current stamp */
+ ctx->nr_inactive_added++; /* advance the stamp for the next insertion */
};

static void
ctx_sched_groups_add(struct perf_event *event, struct perf_event_context *ctx)
{
+ u64 k;
+
+ WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
+ if (event->attach_state & PERF_ATTACH_TASK) /* task-attached event: key by CPU */
+ k = taskctx_rbtree_key(event->cpu, !event->attr.pinned);
+ else /* NOTE(review): confirm event->cgrp exists without CONFIG_CGROUP_PERF */
+ k = cpuctx_rbtree_key(event->cgrp, !event->attr.pinned);
+ event->rbtree_key = k; /* stamp bits are filled in by rbtree_add_inactive() */
+
WARN_ON(!list_empty(&event->ctx_active_entry));
list_add_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+ rbtree_add_inactive(event, ctx); /* index the event under the current stamp */
+ ctx->nr_inactive_added++; /* advance the stamp for the next insertion */
}

/*
@@ -1668,6 +1747,7 @@ static void ctx_sched_groups_del(struct perf_event *group,
struct perf_event_context *ctx)
{
WARN_ON(group->state != PERF_EVENT_STATE_INACTIVE);
+ rb_erase(&group->rbtree_node, &ctx->rbtree_root);
list_del_init(&group->ctx_active_entry);
}

@@ -2055,6 +2135,7 @@ ctx_sched_groups_to_active(struct perf_event *event, struct perf_event_context *
WARN_ON(!event);
WARN_ON(list_empty(&event->ctx_active_entry));
WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
+ rb_erase(&event->rbtree_node, &ctx->rbtree_root);
list_move_tail(&event->ctx_active_entry, h);
}

@@ -3690,6 +3771,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
+ ctx->rbtree_root = RB_ROOT;
INIT_LIST_HEAD(&ctx->active_pinned_groups);
INIT_LIST_HEAD(&ctx->active_flexible_groups);
INIT_LIST_HEAD(&ctx->inactive_groups);
--
2.11.0.390.gc69c2f50cf-goog