[PATCH 06/12] x86/cqm: Add cgroup hierarchical monitoring support

From: Vikas Shivappa
Date: Fri Jan 06 2017 - 17:41:05 EST


From: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>

Patch adds support for monitoring cgroup hierarchy. The
arch_info that was introduced in the perf_cgroup is used to maintain the
cgroup related rmid and hierarchy information.

Since cgroup supports hierarchical monitoring, a cgroup is always
monitoring for some ancestor. By default root is always monitored with
RMID 0 and hence when any cgroup is first created it always reports data
to the root. mfa or 'monitor for ancestor' is used to keep track of
this information. Basically which ancestor the cgroup is actually
monitoring for or has to report the data to.

By default, all cgroup's mfa points to root.
1.event init: When ever a new cgroup x would start to be monitored,
the mfa of the monitored cgroup's descendants point towards the cgroup
x.
2.switch_to: task finds the cgroup its associated with and if the
cgroup itself is being monitored cgroup uses its own rmid(a) else it uses
the rmid of the mfa(b).
3.read: During the read call, the cgroup x just adds the
counts of its descendants who had cgroup x as mfa and were also
monitored(To count the scenario (a) in switch_to).

Locking: cgroup traversal: rcu_readlock. cgroup->arch_info: css_alloc,
css_free, event terminate, init hold mutex.

Tests: Cgroup monitoring should work. Monitoring multiple cgroups in the
same hierarchy works. monitoring cgroup and a task within same cgroup
doesnt work yet.

Patch modified/refactored by Vikas Shivappa
<vikas.shivappa@xxxxxxxxxxxxxxx> to support recycling removal.

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/events/intel/cqm.c | 227 +++++++++++++++++++++++++++-----
arch/x86/include/asm/intel_rdt_common.h | 64 +++++++++
2 files changed, 257 insertions(+), 34 deletions(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index a9bd7bd..c6479ae 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -85,6 +85,7 @@ struct sample {
static cpumask_t cqm_cpumask;

struct pkg_data **cqm_pkgs_data;
+struct cgrp_cqm_info cqm_rootcginfo;

#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
@@ -193,6 +194,11 @@ static void __put_rmid(u32 rmid, int domain)
list_add_tail(&entry->list, &cqm_pkgs_data[domain]->cqm_rmid_limbo_lru);
}

+static bool is_task_event(struct perf_event *e)
+{
+ return (e->attach_state & PERF_ATTACH_TASK);
+}
+
static void cqm_cleanup(void)
{
int i;
@@ -209,7 +215,6 @@ static void cqm_cleanup(void)
kfree(cqm_pkgs_data);
}

-
/*
* Determine if @a and @b measure the same set of tasks.
*
@@ -224,20 +229,18 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
return false;

#ifdef CONFIG_CGROUP_PERF
- if (a->cgrp != b->cgrp)
- return false;
-#endif
-
- /* If not task event, we're machine wide */
- if (!(b->attach_state & PERF_ATTACH_TASK))
+ if ((is_cgroup_event(a) && is_cgroup_event(b)) &&
+ (a->cgrp == b->cgrp))
return true;
+#endif

/*
* Events that target same task are placed into the same cache group.
* Mark it as a multi event group, so that we update ->count
* for every event rather than just the group leader later.
*/
- if (a->hw.target == b->hw.target) {
+ if ((is_task_event(a) && is_task_event(b)) &&
+ (a->hw.target == b->hw.target)) {
b->hw.is_group_event = true;
return true;
}
@@ -365,6 +368,63 @@ static void init_mbm_sample(u32 *rmid, u32 evt_type)
on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
}

+static inline void cqm_enable_mon(struct cgrp_cqm_info *cqm_info, u32 *rmid)
+{
+ if (rmid != NULL) {
+ cqm_info->mon_enabled = true;
+ cqm_info->rmid = rmid;
+ } else {
+ cqm_info->mon_enabled = false;
+ cqm_info->rmid = NULL;
+ }
+}
+
+static void cqm_assign_hier_rmid(struct cgroup_subsys_state *rcss, u32 *rmid)
+{
+ struct cgrp_cqm_info *ccqm_info, *rcqm_info;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+
+ rcqm_info = css_to_cqm_info(rcss);
+
+ /* Enable or disable monitoring based on rmid.*/
+ cqm_enable_mon(rcqm_info, rmid);
+
+ pos_css = css_next_descendant_pre(rcss, rcss);
+ while (pos_css) {
+ ccqm_info = css_to_cqm_info(pos_css);
+
+ /*
+ * Monitoring is being enabled.
+ * Update the descendents to monitor for you, unless
+ * they were already monitoring for a descendent of yours.
+ */
+ if (rmid && (rcqm_info->level > ccqm_info->mfa->level))
+ ccqm_info->mfa = rcqm_info;
+
+ /*
+ * Monitoring is being disabled.
+ * Update the descendents who were monitoring for you
+ * to monitor for the ancestor you were monitoring.
+ */
+ if (!rmid && (ccqm_info->mfa == rcqm_info))
+ ccqm_info->mfa = rcqm_info->mfa;
+ pos_css = css_next_descendant_pre(pos_css, rcss);
+ }
+ rcu_read_unlock();
+}
+
+static int cqm_assign_rmid(struct perf_event *event, u32 *rmid)
+{
+#ifdef CONFIG_CGROUP_PERF
+ if (is_cgroup_event(event)) {
+ cqm_assign_hier_rmid(&event->cgrp->css, rmid);
+ }
+#endif
+ return 0;
+}
+
/*
* Find a group and setup RMID.
*
@@ -402,11 +462,14 @@ static int intel_cqm_setup_event(struct perf_event *event,
return 0;
}

+static u64 cqm_read_subtree(struct perf_event *event, struct rmid_read *rr);
+
static void intel_cqm_event_read(struct perf_event *event)
{
- unsigned long flags;
- u32 rmid;
- u64 val;
+ struct rmid_read rr = {
+ .evt_type = event->attr.config,
+ .value = ATOMIC64_INIT(0),
+ };

/*
* Task events are handled by intel_cqm_event_count().
@@ -414,26 +477,9 @@ static void intel_cqm_event_read(struct perf_event *event)
if (event->cpu == -1)
return;

- raw_spin_lock_irqsave(&cache_lock, flags);
- rmid = event->hw.cqm_rmid[pkg_id];
-
- if (!__rmid_valid(rmid))
- goto out;
-
- if (is_mbm_event(event->attr.config))
- val = rmid_read_mbm(rmid, event->attr.config);
- else
- val = __rmid_read(rmid);
-
- /*
- * Ignore this reading on error states and do not update the value.
- */
- if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- goto out;
+ rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

- local64_set(&event->count, val);
-out:
- raw_spin_unlock_irqrestore(&cache_lock, flags);
+ cqm_read_subtree(event, &rr);
}

static void __intel_cqm_event_count(void *info)
@@ -545,6 +591,55 @@ static void mbm_hrtimer_init(void)
}
}

+static void cqm_mask_call_local(struct rmid_read *rr)
+{
+ if (is_mbm_event(rr->evt_type))
+ __intel_mbm_event_count(rr);
+ else
+ __intel_cqm_event_count(rr);
+}
+
+static inline void
+ delta_local(struct perf_event *event, struct rmid_read *rr, u32 *rmid)
+{
+ atomic64_set(&rr->value, 0);
+ rr->rmid = ACCESS_ONCE(rmid);
+
+ cqm_mask_call_local(rr);
+ local64_add(atomic64_read(&rr->value), &event->count);
+}
+
+/*
+ * Since cgroup follows hierarchy, add the count of
+ * the descendents who were being monitored as well.
+ */
+static u64 cqm_read_subtree(struct perf_event *event, struct rmid_read *rr)
+{
+#ifdef CONFIG_CGROUP_PERF
+
+ struct cgroup_subsys_state *rcss, *pos_css;
+ struct cgrp_cqm_info *ccqm_info;
+
+ cqm_mask_call_local(rr);
+ local64_set(&event->count, atomic64_read(&(rr->value)));
+
+ if (is_task_event(event))
+ return __perf_event_count(event);
+
+ rcu_read_lock();
+ rcss = &event->cgrp->css;
+ css_for_each_descendant_pre(pos_css, rcss) {
+ ccqm_info = (css_to_cqm_info(pos_css));
+
+ /* Add the descendent 'monitored cgroup' counts */
+ if (pos_css != rcss && ccqm_info->mon_enabled)
+ delta_local(event, rr, ccqm_info->rmid);
+ }
+ rcu_read_unlock();
+#endif
+ return __perf_event_count(event);
+}
+
static u64 intel_cqm_event_count(struct perf_event *event)
{
struct rmid_read rr = {
@@ -603,7 +698,7 @@ void alloc_needed_pkg_rmid(u32 *cqm_rmid)
if (WARN_ON(!cqm_rmid))
return;

- if (cqm_rmid[pkg_id])
+ if (cqm_rmid == cqm_rootcginfo.rmid || cqm_rmid[pkg_id])
return;

raw_spin_lock_irqsave(&cache_lock, flags);
@@ -661,9 +756,11 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
__put_rmid(rmid[d], d);
}
kfree(event->hw.cqm_rmid);
+ cqm_assign_rmid(event, NULL);
list_del(&event->hw.cqm_groups_entry);
}
-static void intel_cqm_event_destroy(struct perf_event *event)
+
+static void intel_cqm_event_terminate(struct perf_event *event)
{
struct perf_event *group_other = NULL;
unsigned long flags;
@@ -917,6 +1014,7 @@ static int intel_cqm_event_init(struct perf_event *event)
.attr_groups = intel_cqm_attr_groups,
.task_ctx_nr = perf_sw_context,
.event_init = intel_cqm_event_init,
+ .event_terminate = intel_cqm_event_terminate,
.add = intel_cqm_event_add,
.del = intel_cqm_event_stop,
.start = intel_cqm_event_start,
@@ -924,12 +1022,67 @@ static int intel_cqm_event_init(struct perf_event *event)
.read = intel_cqm_event_read,
.count = intel_cqm_event_count,
};
+
#ifdef CONFIG_CGROUP_PERF
int perf_cgroup_arch_css_alloc(struct cgroup_subsys_state *parent_css,
struct cgroup_subsys_state *new_css)
-{}
+{
+ struct cgrp_cqm_info *cqm_info, *pcqm_info;
+ struct perf_cgroup *new_cgrp;
+
+ if (!parent_css) {
+ cqm_rootcginfo.level = 0;
+
+ cqm_rootcginfo.mon_enabled = true;
+ cqm_rootcginfo.cont_mon = true;
+ cqm_rootcginfo.mfa = NULL;
+ INIT_LIST_HEAD(&cqm_rootcginfo.tskmon_rlist);
+
+ if (new_css) {
+ new_cgrp = css_to_perf_cgroup(new_css);
+ new_cgrp->arch_info = &cqm_rootcginfo;
+ }
+ return 0;
+ }
+
+ mutex_lock(&cache_mutex);
+
+ new_cgrp = css_to_perf_cgroup(new_css);
+
+ cqm_info = kzalloc(sizeof(struct cgrp_cqm_info), GFP_KERNEL);
+ if (!cqm_info) {
+ mutex_unlock(&cache_mutex);
+ return -ENOMEM;
+ }
+
+ pcqm_info = (css_to_cqm_info(parent_css));
+ cqm_info->level = pcqm_info->level + 1;
+ cqm_info->rmid = pcqm_info->rmid;
+
+ cqm_info->cont_mon = false;
+ cqm_info->mon_enabled = false;
+ INIT_LIST_HEAD(&cqm_info->tskmon_rlist);
+ if (!pcqm_info->mfa)
+ cqm_info->mfa = pcqm_info;
+ else
+ cqm_info->mfa = pcqm_info->mfa;
+
+ new_cgrp->arch_info = cqm_info;
+ mutex_unlock(&cache_mutex);
+
+ return 0;
+}
+
void perf_cgroup_arch_css_free(struct cgroup_subsys_state *css)
-{}
+{
+ struct perf_cgroup *cgrp = css_to_perf_cgroup(css);
+
+ mutex_lock(&cache_mutex);
+ kfree(cgrp_to_cqm_info(cgrp));
+ cgrp->arch_info = NULL;
+ mutex_unlock(&cache_mutex);
+}
+
void perf_cgroup_arch_attach(struct cgroup_taskset *tset)
{}
int perf_cgroup_arch_can_attach(struct cgroup_taskset *tset)
@@ -1053,6 +1206,12 @@ static int pkg_data_init_cpu(int cpu)
entry = __rmid_entry(0, curr_pkgid);
list_del(&entry->list);

+ cqm_rootcginfo.rmid = kzalloc(sizeof(u32) * cqm_socket_max, GFP_KERNEL);
+ if (!cqm_rootcginfo.rmid) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
return 0;
fail:
kfree(ccqm_rmid_ptrs);
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
index b31081b..e11ed5e 100644
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -24,4 +24,68 @@ struct intel_pqr_state {

DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);

+/**
+ * struct cgrp_cqm_info - perf_event cgroup metadata for cqm
+ * @cont_mon Continuous monitoring flag
+ * @mon_enabled Whether monitoring is enabled
+ * @level Level in the cgroup tree. Root is level 0.
+ * @rmid The rmids of the cgroup.
+ * @mfa 'Monitoring for ancestor' points to the cqm_info
+ * of the ancestor the cgroup is monitoring for. 'Monitoring for ancestor'
+ * means you will use an ancestors RMID at sched_in if you are
+ * not monitoring yourself.
+ *
+ * Due to the hierarchical nature of cgroups, every cgroup just
+ * monitors for the 'nearest monitored ancestor' at all times.
+ * Since root cgroup is always monitored, all descendents
+ * at boot time monitor for root and hence all mfa points to root except
+ * for root->mfa which is NULL.
+ * 1. RMID setup: When cgroup x start monitoring:
+ * for each descendent y, if y's mfa->level < x->level, then
+ * y->mfa = x. (Where level of root node = 0...)
+ * 2. sched_in: During sched_in for x
+ * if (x->mon_enabled) choose x->rmid
+ * else choose x->mfa->rmid.
+ * 3. read: for each descendent of cgroup x
+ * if (x->monitored) count += rmid_read(x->rmid).
+ * 4. evt_destroy: for each descendent y of x, if (y->mfa == x) then
+ * y->mfa = x->mfa. Meaning if any descendent was monitoring for x,
+ * set that descendent to monitor for the cgroup which x was monitoring for.
+ *
+ * @tskmon_rlist List of tasks being monitored in the cgroup
+ * When a task which belongs to a cgroup x is being monitored, it always uses
+ * its own task->rmid even if cgroup x is monitored during sched_in.
+ * To account for the counts of such tasks, cgroup keeps this list
+ * and parses it during read.
+ *
+ * Perf handles hierarchy for other events, but because RMIDs are per pkg
+ * this is handled here.
+*/
+struct cgrp_cqm_info {
+ bool cont_mon;
+ bool mon_enabled;
+ int level;
+ u32 *rmid;
+ struct cgrp_cqm_info *mfa;
+ struct list_head tskmon_rlist;
+};
+
+struct tsk_rmid_entry {
+ u32 *rmid;
+ struct list_head list;
+};
+
+#ifdef CONFIG_CGROUP_PERF
+
+# define css_to_perf_cgroup(css_) container_of(css_, struct perf_cgroup, css)
+# define cgrp_to_cqm_info(cgrp_) ((struct cgrp_cqm_info *)cgrp_->arch_info)
+# define css_to_cqm_info(css_) cgrp_to_cqm_info(css_to_perf_cgroup(css_))
+
+#else
+
+# define css_to_perf_cgroup(css_) NULL
+# define cgrp_to_cqm_info(cgrp_) NULL
+# define css_to_cqm_info(css_) NULL
+
+#endif
#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
--
1.9.1