[RFC 3/6] perf/core: use rb-tree to sched in event groups

From: David Carrillo-Cisneros
Date: Tue Jan 10 2017 - 05:25:52 EST


During sched in, only events that match the current CPU and cgroup can
be scheduled in. These events should be added to the PMU in increasing
timestamp order to guarantee fairness in the event rotation.

In task contexts, events with no CPU affinity or with affinity to the
current CPU are eligible.
In CPU contexts, events with no cgroup restriction (CPU events) or with
a cgroup that is currently active (either the current task's cgroup or
one of its ancestors) are eligible.
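
Roughly, the eligibility check amounts to the following (a hypothetical
helper for illustration only; the real code relies on
event_filter_match(), and cgroup_is_eligible() below does not exist):

  static bool example_event_eligible(struct perf_event *event)
  {
          /* Task context: no CPU affinity, or affinity to this CPU. */
          if (event->ctx->task)
                  return event->cpu == -1 ||
                         event->cpu == smp_processor_id();
          /* CPU context: no cgroup, or a cgroup of the current task. */
          return cgroup_is_eligible(event); /* hypothetical */
  }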

For both context types, we need to iterate over events with different
rb-tree key groups (different CPUs or different cgroups) in timestamp
order.

Finding the events in timestamp order is at odds with indexing them by
CPU/cgroup. The rb-tree allows us to find the events with minimum and
maximum timestamp for a given CPU/cgroup + flexible type, while the list
ctx->inactive_groups is sorted by timestamp.
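
For illustration, this is roughly how such a key can be packed (a
minimal sketch, assuming the RBTREE_KEY_* definitions introduced earlier
in this series; example_rbtree_key() is not a real function, the actual
helpers are taskctx_rbtree_key()/cpuctx_rbtree_key()):

  /*
   * Sketch: the low RBTREE_KEY_STAMP_WIDTH bits hold the timestamp, the
   * next bit the flexible/pinned type, and the remaining high bits
   * identify the group (CPU for task contexts, cgroup for CPU contexts).
   */
  static u64 example_rbtree_key(u64 group_id, bool flexible, u64 stamp)
  {
          u64 k = group_id << (RBTREE_KEY_STAMP_WIDTH + 1);

          if (flexible)
                  k |= RBTREE_KEY_FLEXIBLE_MASK;
          return k | (stamp & RBTREE_KEY_STAMP_MASK);
  }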

We could find the list position of the first event of each eligible
CPU/cgroup and then merge-iterate over them, always picking the head
with the smallest timestamp, but that is too complicated.

A simpler alternative is to find the smallest subinterval of
ctx->inactive_groups that contains all eligible events. Let's call this
minimum subinterval S.

S is the union of smaller, not necessarily disjoint, subintervals. Each
of them contains all the events that are eligible for a given CPU or
cgroup. We find S by searching for the start/end of each of these
CPU/cgroup subintervals and combining them. The drawback is that S may
also contain events that are not eligible (since ctx->inactive_groups is
in stamp order).
Yet, this is more efficient than it may look, since the stamp of newly
inserted events is higher than that of events that are not eligible.
Eventually, events of ineligible CPUs/cgroups are no longer part of S.
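
In outline, the sched in path then looks as follows (a condensed sketch
of ctx_pinned_sched_in()/ctx_flexible_sched_in() from the diff below;
sched_in_interval_sketch() is not a real function):

  static void sched_in_interval_sketch(struct perf_event_context *ctx,
                                       struct perf_cpu_context *cpuctx,
                                       enum event_type_t type)
  {
          struct perf_event *event = NULL, *tmp;
          u32 stamp_max;

          /* Compute S: event points at its first member, stamp_max at its end. */
          ctx_inactive_groups_interval(ctx, cpuctx, type, &event, &stamp_max);
          if (!event)
                  return;

          list_for_each_entry_safe_from(event, tmp, &ctx->inactive_groups,
                                        ctx_active_entry) {
                  if (rbtree_key_stamp(event->rbtree_key) > stamp_max)
                          break;          /* past the end of S */
                  if (!event_filter_match(event))
                          continue;       /* inside S but not eligible */
                  /* ... schedule in the event group ... */
          }
  }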

More formally, if:
- n is the number of events in the context.
- s is the number of eligible events.
- q is the number of events scheduled each context switch.

Then, in the worst case, |S| = n in the first context switch, and q events
are scheduled in and out. Events scheduled out are added at the end of
ctx->inactive_groups. The next sched in continues from the last
scheduled event, hence skipping all events that were found ineligible
in the previous context switch. Every context switch therefore shrinks S,
and it takes at most ceiling(s/q) context switches for all ineligible
events to be visited once and skipped. All subsequent context switches
see |S| = s.

Therefore, the worst case is linear, and a non-optimal case only occurs
when the eligibility criteria change (i.e. on a CPU migration or a
cgroup change).
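
As a purely illustrative example (numbers made up): with n = 1000 events,
s = 100 eligible ones and q = 25 groups scheduled per context switch, the
first sched in after a CPU migration may walk up to 1000 events, but after
ceiling(100/25) = 4 context switches every ineligible event has been
visited once and skipped, and |S| stays at 100 from then on.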

Signed-off-by: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
---
kernel/events/core.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 190 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 623d81c0ca93..302f13ca75dc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1476,6 +1476,11 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
#define RBTREE_KEY_STAMP_MASK GENMASK_ULL(RBTREE_KEY_STAMP_WIDTH - 1, 0)
#define RBTREE_KEY_FLEXIBLE_MASK BIT_ULL(RBTREE_KEY_STAMP_WIDTH)

+static u32 rbtree_key_stamp(u64 k)
+{
+ return k & RBTREE_KEY_STAMP_MASK;
+}
+
static u64 taskctx_rbtree_key(int cpu, bool flexible)
{
/*
@@ -3087,16 +3092,191 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}

+/*
+ * Find the rb-tree node with the minimum (or maximum) timestamp among
+ * the nodes whose key matches the non-timestamp bits of @type_key.
+ */
+static struct perf_event *
+rbtree_find_bound(struct perf_event_context *ctx, u64 type_key, bool find_min)
+{
+ struct rb_node *pos = ctx->rbtree_root.rb_node;
+ struct perf_event *pos_event, *m = NULL;
+ u64 k, k_h, bound_h, bound;
+
+ bound = type_key | RBTREE_KEY_STAMP_MASK;
+ if (find_min)
+ bound &= ~RBTREE_KEY_STAMP_MASK;
+ while (pos) {
+ pos_event = rb_entry(pos, struct perf_event, rbtree_node);
+ WARN_ON(!pos_event);
+ k = pos_event->rbtree_key;
+ /*
+ * Bits higher than tstamp indicate the node type, look for
+ * node of same type as bound that is min (or max).
+ */
+ k_h = k & ~RBTREE_KEY_STAMP_MASK;
+ bound_h = bound & ~RBTREE_KEY_STAMP_MASK;
+
+ if (k_h == bound_h) {
+ if (!m)
+ m = pos_event;
+ else if ((!find_min && k > m->rbtree_key) ||
+ (find_min && k < m->rbtree_key))
+ m = pos_event;
+ }
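+ /*
+ * Descend towards @bound; @m keeps the best matching node seen so
+ * far, which at the end of the walk is the wanted min (or max).
+ */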
+ if (bound < k)
+ pos = pos->rb_left;
+ else
+ pos = pos->rb_right;
+ }
+ return m;
+}
+
+/*
+ * Update start and stamp_max so that all events in @ctx with matching @key
+ * are contained in the [start->rbtree_stamp, stamp_max] interval.
+ */
+static void
+accumulate_ctx_interval(struct perf_event_context *ctx, u64 key_type,
+ struct perf_event **start, u32 *stamp_max)
+{
+ u64 start_stamp, new_start_stamp, new_end_stamp;
+ struct perf_event *new_start, *new_end;
+
+ WARN_ON(!ctx);
+ WARN_ON(!start);
+ WARN_ON(!stamp_max);
+
+ /* Find node with smallest stamp and key type (CPU/cgroup,flexible). */
+ new_start = rbtree_find_bound(ctx, key_type, true);
+ if (!new_start)
+ return;
+
+ /* Find node with largest stamp and key type (CPU/cgroup,flexible). */
+ new_end = rbtree_find_bound(ctx, key_type, false);
+ WARN_ON(!new_end);
+ new_end_stamp = rbtree_key_stamp(new_end->rbtree_key);
+
+ if (!(*start)) {
+ *start = new_start;
+ *stamp_max = new_end_stamp;
+ } else {
+ start_stamp = rbtree_key_stamp((*start)->rbtree_key);
+ new_start_stamp = rbtree_key_stamp(new_start->rbtree_key);
+
+ if (start_stamp > new_start_stamp)
+ *start = new_start;
+ if (*stamp_max < new_end_stamp)
+ *stamp_max = new_end_stamp;
+ }
+}
+
+static void
+task_ctx_interval(struct perf_event_context *ctx, enum event_type_t type,
+ struct perf_event **start, u32 *stamp_max)
+{
+ u64 k;
+
+ WARN_ON(!ctx);
+ /* Task events not bound to a CPU. */
+ if (type & EVENT_PINNED) {
+ k = taskctx_rbtree_key(-1, false);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+ if (type & EVENT_FLEXIBLE) {
+ k = taskctx_rbtree_key(-1, true);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+
+ /* Task events bound to current CPU. */
+ if (type & EVENT_PINNED) {
+ k = taskctx_rbtree_key(smp_processor_id(), false);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+ if (type & EVENT_FLEXIBLE) {
+ k = taskctx_rbtree_key(smp_processor_id(), true);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+}
+
+static void
+cpu_ctx_interval(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t type,
+ struct perf_event **start, u32 *stamp_max)
+{
+ struct cgroup_subsys_state *css;
+ struct perf_cgroup *cgrp;
+ u64 k;
+
+ WARN_ON(!ctx);
+ WARN_ON(!cpuctx);
+ if (type & EVENT_PINNED) {
+ k = cpuctx_rbtree_key(NULL, false);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+ if (type & EVENT_FLEXIBLE) {
+ k = cpuctx_rbtree_key(NULL, true);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+
+#ifdef CONFIG_CGROUP_PERF
+ if (!cpuctx->cgrp)
+ return;
+ /*
+ * Accumulate the interval over all schedulable cgroups; these include
+ * the ancestors, since cgroups are hierarchical.
+ */
+ css = &cpuctx->cgrp->css;
+ while (css) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ if (type & EVENT_PINNED) {
+ k = cpuctx_rbtree_key(cgrp, false);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+ if (type & EVENT_FLEXIBLE) {
+ k = cpuctx_rbtree_key(cgrp, true);
+ accumulate_ctx_interval(ctx, k, start, stamp_max);
+ }
+ css = css->parent;
+ }
+#endif
+}
+
+static void
+ctx_inactive_groups_interval(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t type,
+ struct perf_event **start,
+ u32 *stamp_max)
+{
+ if (WARN_ON(!ctx->task && !cpuctx))
+ return;
+ if (ctx->task)
+ task_ctx_interval(ctx, type, start, stamp_max);
+ else
+ cpu_ctx_interval(ctx, cpuctx, type, start, stamp_max);
+}
+
static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
struct perf_event *event = NULL, *tmp;
+ u32 stamp_max;

- list_for_each_entry_safe(
+ ctx_inactive_groups_interval(ctx, cpuctx, EVENT_PINNED, &event, &stamp_max);
+ if (!event)
+ return;
+
+ list_for_each_entry_safe_from(
event, tmp, &ctx->inactive_groups, ctx_active_entry) {
if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
continue;
+ /* Stop based on the tstamp, not on the event pointer. */
+ if (rbtree_key_stamp(event->rbtree_key) > stamp_max)
+ break;
+
if (!event_filter_match(event))
continue;

@@ -3125,11 +3305,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
{
struct perf_event *event = NULL, *tmp;
int can_add_hw = 1;
+ u32 stamp_max;

- list_for_each_entry_safe(
+ ctx_inactive_groups_interval(ctx, cpuctx, EVENT_FLEXIBLE, &event, &stamp_max);
+ if (!event)
+ return;
+
+ list_for_each_entry_safe_from(
event, tmp, &ctx->inactive_groups, ctx_active_entry) {
if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
continue;
+ /* Stop based on the tstamp, not on the event pointer. */
+ if (rbtree_key_stamp(event->rbtree_key) > stamp_max)
+ break;
/*
* Listen to the 'cpu' scheduling filter constraint
* of events:
--
2.11.0.390.gc69c2f50cf-goog