[PATCH v3 19/46] perf/x86/intel/cmt: add support for cgroup events

From: David Carrillo-Cisneros
Date: Sat Oct 29 2016 - 20:45:58 EST


First part of cgroup support for CMT.

A monr's position in the monr hierarchy depends on the position of its
target cgroup or thread in the cgroup hierarchy.
(See code comments for details).

A monr that monitors a cgroup keeps a reference to it in monr->mon_cgrp.
This reference is used in future patches to add support for cgroup
monitoring without requiring an active perf_event at all times.

Signed-off-by: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
---
arch/x86/events/intel/cmt.c | 293 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/events/intel/cmt.h | 2 +
2 files changed, 295 insertions(+)

diff --git a/arch/x86/events/intel/cmt.c b/arch/x86/events/intel/cmt.c
index 3883cb4..a5b7d2d 100644
--- a/arch/x86/events/intel/cmt.c
+++ b/arch/x86/events/intel/cmt.c
@@ -125,6 +125,14 @@ static inline struct pmonr *pkgd_pmonr(struct pkg_data *pkgd, struct monr *monr)
return rcu_dereference_check(monr->pmonrs[pkgd->pkgid], safe);
}

+#ifdef CONFIG_CGROUP_PERF
+/*
+ * Return the perf_event css of the root cgroup, read from the subsys[] table
+ * of init_css_set.
+ */
+static inline struct cgroup_subsys_state *get_root_perf_css(void)
+{
+ return init_css_set.subsys[perf_event_cgrp_id];
+}
+#endif
+
static inline void pmonr_set_rmids(struct pmonr *pmonr,
u32 sched_rmid, u32 read_rmid)
{
@@ -416,6 +424,7 @@ static void monr_dealloc(struct monr *monr)

if (WARN_ON_ONCE(monr->nr_has_user) ||
WARN_ON_ONCE(monr->nr_nolazy_rmid) ||
+ WARN_ON_ONCE(monr->mon_cgrp) ||
WARN_ON_ONCE(monr->mon_events))
return;

@@ -639,6 +648,7 @@ static int monr_apply_uflags(struct monr *monr, enum cmt_user_flags *puflags)
goto exit;
}

+/* can be NULL if the monr was for a cgroup that has gone offline. */
static inline struct monr *monr_from_event(struct perf_event *event)
{
return (struct monr *) READ_ONCE(event->hw.cmt_monr);
@@ -727,6 +737,75 @@ static int monr_append_event(struct monr *monr, struct perf_event *event)
return err;
}

+#ifdef CONFIG_CGROUP_PERF
+/*
+ * Return the monr stored in @cgrp->arch_info. This is the monr @cgrp currently
+ * uses; it is @cgrp's own monr only when monr->mon_cgrp == @cgrp.
+ */
+static inline struct monr *monr_from_perf_cgroup(struct perf_cgroup *cgrp)
+{
+ return (struct monr *)READ_ONCE(cgrp->arch_info);
+}
+
+/* Store @monr in @cgrp->arch_info, making it the monr that @cgrp uses. */
+static inline void perf_cgroup_set_monr(struct perf_cgroup *cgrp,
+ struct monr *monr)
+{
+ WRITE_ONCE(cgrp->arch_info, monr);
+}
+
+/*
+ * Get cgroup for both task and cgroup event.
+ *
+ * The result is the perf_cgroup embedding the perf_event css of
+ * event->hw.target.
+ */
+static struct perf_cgroup *perf_cgroup_from_task_event(struct perf_event *event)
+{
+#ifdef CONFIG_LOCKDEP
+ /* Holding cmt_mutex is passed as the extra condition to task_css_check(). */
+ bool rcu_safe = lockdep_is_held(&cmt_mutex);
+#endif
+
+ /*
+ * NOTE(review): rcu_safe is only declared under CONFIG_LOCKDEP but is named
+ * unconditionally below; this presumably relies on task_css_check() not
+ * expanding its last argument in non-lockdep builds -- confirm.
+ */
+ return container_of(
+ task_css_check(event->hw.target, perf_event_cgrp_id, rcu_safe),
+ struct perf_cgroup, css);
+}
+
+/* Return the perf_cgroup that embeds @css as its css member. */
+static struct perf_cgroup *perf_cgroup_from_css(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct perf_cgroup, css);
+}
+
+/**
+ * perf_cgroup_mon_started() - Tell if cgroup is monitored by its own monr.
+ * @cgrp: cgroup to test.
+ *
+ * A perf_cgroup is being monitored when it is referenced back by
+ * its monr's mon_cgrp. Otherwise, the cgroup only uses the monr used to
+ * monitor another cgroup (the one that is referenced back by monr's mon_cgrp).
+ *
+ * Return: true if the monr used by @cgrp has @cgrp as its mon_cgrp.
+ */
+static inline bool perf_cgroup_mon_started(struct perf_cgroup *cgrp)
+{
+ struct monr *monr;
+
+ /*
+ * monr can be referenced by a cgroup other than the one in its
+ * mon_cgrp, be careful.
+ */
+ monr = monr_from_perf_cgroup(cgrp);
+
+ /*
+ * The root monr does not have a cgroup associated before initialization.
+ * NOTE(review): monr is dereferenced unchecked; this assumes
+ * cgrp->arch_info is never NULL when this is called -- confirm.
+ */
+ return monr->mon_cgrp == cgrp;
+}
+
+/**
+ * perf_cgroup_find_lma() - Find the lowest monitored ancestor of @cgrp.
+ * @cgrp: cgroup whose ancestors are searched; @cgrp itself is never returned.
+ *
+ * Walks up the css parent chain and stops at the first ancestor for which
+ * perf_cgroup_mon_started() is true.
+ *
+ * Return: lma or NULL if no ancestor is monitored.
+ */
+static struct perf_cgroup *perf_cgroup_find_lma(struct perf_cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css;
+
+ do {
+ parent_css = cgrp->css.parent;
+ cgrp = parent_css ? perf_cgroup_from_css(parent_css) : NULL;
+ } while (cgrp && !perf_cgroup_mon_started(cgrp));
+ return cgrp;
+}
+
+#endif
+
/**
* pmonr_update_sched_rmid() - Update sched_rmid for @pmonr in current package.
*
@@ -815,6 +894,214 @@ static void monr_hrchy_remove_leaf(struct monr *monr)
monr_hrchy_release_locks(&flags);
}

+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * Similar to css_next_descendant_pre but skips the subtree rooted by pos.
+ *
+ * Starting at @pos, climb towards @root until a css for which
+ * css_next_child() reports a next sibling is found, and return that sibling.
+ * Return NULL once @root is reached without finding one.
+ */
+static struct cgroup_subsys_state *
+css_skip_subtree_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+ return NULL;
+}
+
+/*
+ * Make all monrs of css descendants of css to depend on new_monr.
+ *
+ * Walks the descendants of @css in pre-order:
+ * - a css that is not online is skipped;
+ * - a cgroup that does not own a monr is pointed at @new_monr;
+ * - a cgroup that owns a monr has that monr re-parented under @new_monr,
+ * and the rest of its subtree is skipped.
+ *
+ * Must be called with cmt_mutex held.
+ */
+static inline void
+css_subtree_update_monr_dependants(struct cgroup_subsys_state *css,
+ struct monr *new_monr)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct perf_cgroup *pos_cgrp;
+ struct monr *pos_monr;
+ unsigned long flags;
+
+ lockdep_assert_held(&cmt_mutex);
+
+ rcu_read_lock();
+
+ pos_css = css_next_descendant_pre(css, css);
+ while (pos_css) {
+ pos_cgrp = perf_cgroup_from_css(pos_css);
+ pos_monr = monr_from_perf_cgroup(pos_cgrp);
+
+ /* Skip css that are not online, sync'ed with cmt_mutex. */
+ if (!(pos_css->flags & CSS_ONLINE)) {
+ pos_css = css_next_descendant_pre(pos_css, css);
+ continue;
+ }
+ /* This cgroup only borrows a monr: switch it over to new_monr. */
+ if (!perf_cgroup_mon_started(pos_cgrp)) {
+ perf_cgroup_set_monr(pos_cgrp, new_monr);
+ pos_css = css_next_descendant_pre(pos_css, css);
+ continue;
+ }
+ /*
+ * The RCU read section is left while the hierarchy locks are taken.
+ * NOTE(review): pos_css is used again after rcu_read_lock() below;
+ * presumably cmt_mutex keeps it alive across the gap -- confirm.
+ */
+ rcu_read_unlock();
+
+ monr_hrchy_acquire_locks(&flags);
+ pos_monr->parent = new_monr;
+ list_move_tail(&pos_monr->parent_entry, &new_monr->children);
+ monr_hrchy_release_locks(&flags);
+
+ rcu_read_lock();
+ /*
+ * Skip subtrees rooted by a css that owns a monr, since the
+ * css in those subtrees use the monr at their subtree root.
+ */
+ pos_css = css_skip_subtree_pre(pos_css, css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Start monitoring the cgroup of @css with a monr of its own.
+ *
+ * If no ancestor is monitored, the cgroup is bound to monr_hrchy_root.
+ * Otherwise a new monr is allocated, inserted as a leaf under the monr of the
+ * lowest monitored ancestor and attached to the cgroup; the monrs of
+ * descendant cgroups and of descendant task events are then moved under it.
+ *
+ * Must be called with cmt_mutex held.
+ * Return: 0 on success, or the error encoded by monr_alloc().
+ */
+static inline int __css_start_monitoring(struct cgroup_subsys_state *css)
+{
+ struct perf_cgroup *cgrp, *cgrp_lma, *pos_cgrp;
+ struct monr *monr, *monr_parent, *pos_monr, *tmp_monr;
+ unsigned long flags;
+
+ lockdep_assert_held(&cmt_mutex);
+
+ cgrp = perf_cgroup_from_css(css);
+
+ cgrp_lma = perf_cgroup_find_lma(cgrp);
+ if (!cgrp_lma) {
+ /* No monitored ancestor: reuse the root monr, nothing to allocate. */
+ perf_cgroup_set_monr(cgrp, monr_hrchy_root);
+ monr_hrchy_root->mon_cgrp = cgrp;
+ return 0;
+ }
+ /*
+ * The monr of the lowest monitored ancestor is the direct ancestor
+ * of monr in the monr hierarchy.
+ */
+ monr_parent = monr_from_perf_cgroup(cgrp_lma);
+
+ monr = monr_alloc();
+ if (IS_ERR(monr))
+ return PTR_ERR(monr);
+ /*
+ * New monr has no children yet so it can be inserted in hierarchy as
+ * a leaf. Since all monr's pmonr are in Off state, there is no risk
+ * of pmonr state transitions in the scheduler path.
+ */
+ monr_hrchy_acquire_locks(&flags);
+ monr_hrchy_insert_leaf(monr, monr_parent);
+ monr_hrchy_release_locks(&flags);
+
+ /*
+ * Previous lock also works as a barrier to prevent attaching
+ * the monr to cgrp before it is in monr hierarchy.
+ */
+ perf_cgroup_set_monr(cgrp, monr);
+ monr->mon_cgrp = cgrp;
+ css_subtree_update_monr_dependants(css, monr);
+
+ monr_hrchy_acquire_locks(&flags);
+ /* Move task-event monrs that are descendant from css's cgroup. */
+ list_for_each_entry_safe(pos_monr, tmp_monr,
+ &monr_parent->children, parent_entry) {
+ /* Children that own a cgroup are cgroup monrs, not task-event monrs. */
+ if (pos_monr->mon_cgrp)
+ continue;
+ /*
+ * All events in an event group have the same cgroup.
+ * No RCU read lock necessary for task_css_check since calling
+ * inside critical section.
+ * NOTE(review): assumes every child without mon_cgrp has a non-NULL
+ * mon_events with a valid hw.target -- confirm.
+ */
+ pos_cgrp = perf_cgroup_from_task_event(pos_monr->mon_events);
+ if (!cgroup_is_descendant(pos_cgrp->css.cgroup,
+ cgrp->css.cgroup))
+ continue;
+ pos_monr->parent = monr;
+ list_move_tail(&pos_monr->parent_entry, &monr->children);
+ }
+ monr_hrchy_release_locks(&flags);
+
+ return 0;
+}
+
+/*
+ * Stop monitoring the cgroup of @css with its own monr.
+ *
+ * The cgroup, and everything that depended on its monr, falls back to the
+ * monr of the lowest monitored ancestor, and the monr is removed from the
+ * monr hierarchy. This function does not call monr_dealloc() on it.
+ *
+ * Must be called with cmt_mutex held.
+ */
+static inline void __css_stop_monitoring(struct cgroup_subsys_state *css)
+{
+ struct perf_cgroup *cgrp, *cgrp_lma;
+ struct monr *monr, *monr_parent, *pos_monr;
+ unsigned long flags;
+
+ lockdep_assert_held(&cmt_mutex);
+
+ cgrp = perf_cgroup_from_css(css);
+ monr = monr_from_perf_cgroup(cgrp);
+ /*
+ * When css is root cgroup's css, detach cgroup but do not
+ * destroy monr.
+ */
+ cgrp_lma = perf_cgroup_find_lma(cgrp);
+ if (!cgrp_lma) {
+ /* monr of root cgrp must be monr_hrchy_root. */
+ monr->mon_cgrp = NULL;
+ return;
+ }
+
+ monr_parent = monr_from_perf_cgroup(cgrp_lma);
+ css_subtree_update_monr_dependants(css, monr_parent);
+
+ monr_hrchy_acquire_locks(&flags);
+
+ /* Move the children monrs that are not cgroups. */
+ list_for_each_entry(pos_monr, &monr->children, parent_entry)
+ pos_monr->parent = monr_parent;
+ list_splice_tail_init(&monr->children, &monr_parent->children);
+
+ perf_cgroup_set_monr(cgrp, monr_parent);
+ monr->mon_cgrp = NULL;
+
+ monr_hrchy_release_locks(&flags);
+
+ /*
+ * monr_hrchy_remove_leaf() does its own hierarchy locking (it ends by
+ * releasing the locks), so it must not run while they are held here.
+ */
+ monr_hrchy_remove_leaf(monr);
+}
+
+/* Tell whether @event is a cgroup event, i.e. whether event->cgrp is set. */
+static bool is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+/*
+ * Attach cgroup @event to the monr of its cgroup, starting monitoring for the
+ * cgroup first if it does not own a monr yet.
+ *
+ * Return: 0 on success, otherwise the error from __css_start_monitoring() or
+ * monr_append_event().
+ */
+static int monr_hrchy_attach_cgroup_event(struct perf_event *event)
+{
+ struct monr *monr;
+ struct perf_cgroup *cgrp = event->cgrp;
+ int err;
+ bool started = false;
+
+ if (!perf_cgroup_mon_started(cgrp)) {
+ /* Hold a css reference for the duration of the start operation. */
+ css_get(&cgrp->css);
+ err = __css_start_monitoring(&cgrp->css);
+ css_put(&cgrp->css);
+ if (err)
+ return err;
+ started = true;
+ }
+
+ monr = monr_from_perf_cgroup(cgrp);
+ err = monr_append_event(monr, event);
+ if (err && started) {
+ /*
+ * Undo the start done above, but only if this call performed it.
+ * NOTE(review): __css_stop_monitoring() does not itself call
+ * monr_dealloc() on the monr that __css_start_monitoring() allocated;
+ * confirm that monr is released elsewhere and not leaked.
+ */
+ css_get(&cgrp->css);
+ __css_stop_monitoring(&cgrp->css);
+ css_put(&cgrp->css);
+ }
+
+ return err;
+}
+
+/* Return the monr used by the cgroup that contains the task @event monitors. */
+static struct monr *monr_hrchy_get_monr_parent(struct perf_event *event)
+{
+ return monr_from_perf_cgroup(perf_cgroup_from_task_event(event));
+}
+
+#else /* CONFIG_CGROUP_PERF */
+
static bool is_cgroup_event(struct perf_event *event)
{
return false;
@@ -834,6 +1121,8 @@ static struct monr *monr_hrchy_get_monr_parent(struct perf_event *event)
return monr_hrchy_root;
}

+#endif
+
static int monr_hrchy_attach_cpu_event(struct perf_event *event)
{
return monr_append_event(monr_hrchy_root, event);
@@ -883,6 +1172,10 @@ static int monr_hrchy_attach_event(struct perf_event *event)

static void monr_destroy(struct monr *monr)
{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * monr_dealloc() warns and bails out if mon_cgrp is still set, so detach
+ * the cgroup first.
+ * NOTE(review): for a cgroup with a monitored ancestor,
+ * __css_stop_monitoring() already calls monr_hrchy_remove_leaf(), so the
+ * call below is a second removal of the same monr -- confirm this is safe.
+ */
+ if (monr->mon_cgrp)
+ __css_stop_monitoring(&monr->mon_cgrp->css);
+#endif
monr_hrchy_remove_leaf(monr);
monr_dealloc(monr);
}
diff --git a/arch/x86/events/intel/cmt.h b/arch/x86/events/intel/cmt.h
index 754a9c8..dc52641 100644
--- a/arch/x86/events/intel/cmt.h
+++ b/arch/x86/events/intel/cmt.h
@@ -252,6 +252,7 @@ enum cmt_user_flags {

/**
* struct monr - MONitored Resource.
+ * @mon_cgrp: The cgroup associated with this monr, if any.
* @mon_events: The head of event's group that use this monr, if any.
* @entry: List entry into cmt_event_monrs.
* @pmonrs: Per-package pmonrs.
@@ -271,6 +272,7 @@ enum cmt_user_flags {
* On initialization, all monr's pmonrs start in Off state.
*/
struct monr {
+ struct perf_cgroup *mon_cgrp;
struct perf_event *mon_events;
struct list_head entry;
struct pmonr **pmonrs;
--
2.8.0.rc3.226.g39d4020