[PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators

From: Ian Rogers
Date: Tue Jul 02 2019 - 03:00:47 EST


Create a per-cpu array of iterators that is resized when cgroup events
are added. The array is sized for the maximum cgroup depth seen, even
though not every cgroup at that depth will have events monitored within
it. This approach avoids adding storage cost to struct perf_event.
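
As an illustration only (not part of the patch), the sizing rule used
below can be sketched in user-space C. required_iterators() is a
hypothetical helper written here for exposition; it simply mirrors the
"cgroup depth plus one, never fewer than two" rule:

/* Illustrative sketch, not kernel code. */
#include <stdio.h>

/* Without cgroups, two iterators cover the per-CPU and any-CPU lists. */
#define MIN_VISIT_GROUP_MERGE_ITERATORS 2

static int required_iterators(int cgroup_depth)
{
	/*
	 * One iterator per cgroup in the hierarchy plus one for
	 * system-wide events; never fewer than the task-context minimum.
	 */
	int needed = cgroup_depth + 1;

	if (needed < MIN_VISIT_GROUP_MERGE_ITERATORS)
		needed = MIN_VISIT_GROUP_MERGE_ITERATORS;
	return needed;
}

int main(void)
{
	/* A cgroup hierarchy of depth 3 needs 3 + 1 = 4 iterator slots. */
	printf("iterators needed: %d\n", required_iterators(3));
	return 0;
}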

Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
---
include/linux/perf_event.h | 2 +
kernel/events/core.c | 94 ++++++++++++++++++++++++++++----------
2 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 16e38c286d46..5c479f61622c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,6 +802,8 @@ struct perf_cpu_context {
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
+ struct perf_event **visit_groups_merge_iterator_storage;
+ int visit_groups_merge_iterator_storage_size;
#endif

struct list_head sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 396b5ac6dcd4..a2c5ea868de9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1711,6 +1711,20 @@ perf_event_groups_next(struct perf_event *event)
return next;
}

+#ifdef CONFIG_CGROUP_PERF
+int perf_event_cgroup_depth(struct perf_event *event)
+{
+ struct cgroup_subsys_state *css;
+ struct perf_cgroup *cgrp = event->cgrp;
+ int depth = 0;
+
+ if (cgrp)
+ for (css = &cgrp->css; css; css = css->parent)
+ depth++;
+ return depth;
+}
+#endif
+
/*
* Iterate through the whole groups tree.
*/
@@ -2592,6 +2606,7 @@ static int __perf_install_in_context(void *info)

#ifdef CONFIG_CGROUP_PERF
if (is_cgroup_event(event)) {
+ int max_iterators;
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2599,6 +2614,30 @@ static int __perf_install_in_context(void *info)
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
reprogram = cgroup_is_descendant(cgrp->css.cgroup,
event->cgrp->css.cgroup);
+
+ /*
+ * Ensure space for visit_groups_merge iterator storage. With
+ * cgroup profiling, we may have an event at each cgroup depth plus
+ * system-wide events.
+ */
+ max_iterators = perf_event_cgroup_depth(event) + 1;
+ if (max_iterators >
+ cpuctx->visit_groups_merge_iterator_storage_size) {
+ struct perf_event **storage =
+ krealloc(cpuctx->visit_groups_merge_iterator_storage,
+ sizeof(struct perf_event *) * max_iterators,
+ GFP_KERNEL);
+ if (storage) {
+ cpuctx->visit_groups_merge_iterator_storage
+ = storage;
+ cpuctx->visit_groups_merge_iterator_storage_size
+ = max_iterators;
+ } else {
+ WARN_ONCE(1, "Unable to increase iterator "
+ "storage for perf events with cgroups");
+ ret = -ENOMEM;
+ }
+ }
}
#endif

@@ -3389,6 +3428,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
}
}

+
+/*
+ * Without cgroups, with a task context, there may be per-CPU and any
+ * CPU events.
+ */
+#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+
static int visit_groups_merge(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
struct perf_event_groups *groups,
@@ -3398,35 +3444,27 @@ static int visit_groups_merge(struct perf_event_context *ctx,
int *),
int *data)
{
-#ifndef CONFIG_CGROUP_PERF
- /*
- * Without cgroups, with a task context, iterate over per-CPU and any
- * CPU events.
- */
- const int max_itrs = 2;
-#else
- /*
- * The depth of cgroups is limited by MAX_PATH. It is unlikely that this
- * many parent-child related cgroups will have perf events
- * monitored. Limit the number of cgroup iterators to 16.
- */
- const int max_cgroups_with_events_depth = 16;
- /*
- * With cgroups we either iterate for a task context (per-CPU or any CPU
- * events) or for per CPU the global and per cgroup events.
- */
- const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);
-#endif
/*
* A set of iterators, the iterator for the visit is chosen by the
* group_index.
*/
- struct perf_event *itrs[max_itrs];
+#ifndef CONFIG_CGROUP_PERF
+ struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
struct perf_event_heap heap = {
.storage = itrs,
.num_elements = 0,
- .max_elements = max_itrs
+ .max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
};
+#else
+ /*
+ * With cgroups, use the space reserved in the CPU context for iterators.
+ */
+ struct perf_event_heap heap = {
+ .storage = cpuctx->visit_groups_merge_iterator_storage,
+ .num_elements = 0,
+ .max_elements = cpuctx->visit_groups_merge_iterator_storage_size
+ };
+#endif
int ret, cpu = smp_processor_id();

heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
@@ -3461,9 +3499,8 @@ static int visit_groups_merge(struct perf_event_context *ctx,
heap.num_elements++;
if (heap.num_elements ==
heap.max_elements) {
- WARN_ONCE(
- max_cgroups_with_events_depth,
- "Insufficient iterators for cgroup depth");
+ WARN_ONCE(1,
+ "per-CPU min-heap under sized");
break;
}
}
@@ -10155,7 +10192,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
+#ifdef CONFIG_CGROUP_PERF
+ cpuctx->visit_groups_merge_iterator_storage =
+ kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+ sizeof(struct perf_event *),
+ GFP_KERNEL);
+ cpuctx->visit_groups_merge_iterator_storage_size =
+ MIN_VISIT_GROUP_MERGE_ITERATORS;
+#endif
__perf_mux_hrtimer_init(cpuctx, cpu);
}

--
2.22.0.410.gd8fdbe21b5-goog