[PATCH v4 06/10] perf/cgroup: Order events in RB tree by cgroup id

From: Ian Rogers
Date: Fri Nov 15 2019 - 20:19:13 EST


If one is monitoring 6 events on 20 cgroups the per-CPU RB tree will
hold 120 events. The scheduling in of the events currently iterates
over all events looking to see which events match the task's cgroup or
its cgroup hierarchy. If a task is in 1 cgroup with 6 events, then 114
events are considered unnecessarily.

This change orders events in the RB tree by cgroup id if it is present.
This means scheduling in may go directly to events associated with the
task's cgroup if one is present. The per-CPU iterator storage in
visit_groups_merge is sized sufficent for an iterator per cgroup depth,
where different iterators are needed for the task's cgroup and parent
cgroups. By considering the set of iterators when visiting, the lowest
group_index event may be selected and the insertion order group_index
property is maintained. This also allows event rotation to function
correctly, as although events are grouped into a cgroup, rotation always
selects the lowest group_index event to rotate (delete/insert into the
tree) and the min heap of iterators make it so that the group_index order
is maintained.

Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Cc: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Link: https://lkml.kernel.org/r/20190724223746.153620-3-irogers@xxxxxxxxxx
---
kernel/events/core.c | 97 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 87 insertions(+), 10 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8817c645bef9..f26871ef352a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1576,6 +1576,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
if (left->cpu > right->cpu)
return false;

+#ifdef CONFIG_CGROUP_PERF
+ if (left->cgrp != right->cgrp) {
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
+ /*
+ * Left has no cgroup but right does, no cgroups come
+ * first.
+ */
+ return true;
+ }
+ if (!right->cgrp || right->cgrp->css.cgroup) {
+ /*
+ * Right has no cgroup but left does, no cgroups come
+ * first.
+ */
+ return false;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (left->cgrp->css.cgroup->id < right->cgrp->css.cgroup->id)
+ return true;
+
+ return false;
+ }
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1655,25 +1679,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}

/*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
*/
static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+ struct cgroup *cgrp)
{
struct perf_event *node_event = NULL, *match = NULL;
struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+ int node_cgrp_id, cgrp_id = 0;
+
+ if (cgrp)
+ cgrp_id = cgrp->id;
+#endif

while (node) {
node_event = container_of(node, struct perf_event, group_node);

if (cpu < node_event->cpu) {
node = node->rb_left;
- } else if (cpu > node_event->cpu) {
+ continue;
+ }
+ if (cpu > node_event->cpu) {
node = node->rb_right;
- } else {
- match = node_event;
+ continue;
+ }
+#ifdef CONFIG_CGROUP_PERF
+ node_cgrp_id = 0;
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
+ node_cgrp_id = node_event->cgrp->css.cgroup->id;
+
+ if (cgrp_id < node_cgrp_id) {
node = node->rb_left;
+ continue;
+ }
+ if (cgrp_id > node_cgrp_id) {
+ node = node->rb_right;
+ continue;
}
+#endif
+ match = node_event;
+ node = node->rb_left;
}

return match;
@@ -1686,12 +1733,26 @@ static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+ int curr_cgrp_id = 0;
+ int next_cgrp_id = 0;
+#endif

next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next && next->cpu == event->cpu)
- return next;
+ if (next == NULL || next->cpu != event->cpu)
+ return NULL;

- return NULL;
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp && event->cgrp->css.cgroup)
+ curr_cgrp_id = event->cgrp->css.cgroup->id;
+
+ if (next->cgrp && next->cgrp->css.cgroup)
+ next_cgrp_id = next->cgrp->css.cgroup->id;
+
+ if (curr_cgrp_id != next_cgrp_id)
+ return NULL;
+#endif
+ return next;
}

/*
@@ -3453,6 +3514,9 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
int (*func)(struct perf_event *, void *),
void *data)
{
+#ifdef CONFIG_CGROUP_PERF
+ struct cgroup_subsys_state *css = NULL;
+#endif
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_max_heap event_heap;
@@ -3468,6 +3532,11 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
};

lockdep_assert_held(&cpuctx->ctx.lock);
+
+#ifdef CONFIG_CGROUP_PERF
+ if (cpuctx->cgrp)
+ css = &cpuctx->cgrp->css;
+#endif
} else {
event_heap = (struct min_max_heap){
.data = itrs,
@@ -3475,11 +3544,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.cap = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1,
+ NULL));
}
evt = event_heap.data;

- __heap_add(&event_heap, perf_event_groups_first(groups, cpu));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+ for (; css; css = css->parent) {
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu,
+ css->cgroup));
+ }
+#endif

min_max_heapify_all(&event_heap, &perf_min_heap);

--
2.24.0.432.g9d3f5f5b63-goog