[PATCH V4 2/6] perf: attach/detach PMU specific data

From: kan . liang
Date: Wed May 19 2021 - 11:20:01 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

The LBR call stack data has to be saved/restored during context switch
to fix the truncated LBR call stack issue in system-wide mode.
Allocate PMU specific data and attach them to the corresponding
task_struct during LBR call stack monitoring.

When an LBR call stack event is accounted, the perf_ctx_data for the
related tasks is allocated/attached by attach_perf_ctx_data().
When an LBR call stack event is unaccounted, the perf_ctx_data for the
related tasks is detached/freed by detach_perf_ctx_data().

The LBR call stack event could be a per-task event or a system-wide
event.
- For a per-task event, perf only allocates the perf_ctx_data for the
target task of the event. If the allocation fails, perf will error out.
- For a system-wide event, perf has to allocate the perf_ctx_data for
both the existing tasks and the upcoming tasks.
The allocation for the existing tasks is done in perf_event_alloc().
The allocation for the new tasks will be done in perf_event_fork().
If any allocation fails, perf doesn't error out for the system-wide
event. A debug message will be dumped to system log instead. LBR
callstack may be cutoff for the task which doesn't have the space
allocated.
- The perf_ctx_data is only freed when its last LBR call stack event goes away.
The number of the per-task events is tracked by refcount of each task.
Since the system-wide events impact all tasks, it's not practical to
go through the whole task list to update the refcount for each
system-wide event. The number of system-wide events is tracked by a
global variable nr_task_data_sys_wide_events.
Introduce a macro TASK_DATA_SYS_WIDE for refcount to indicate the
PMU specific data is used by the system-wide events.

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---

Changes since V3:
- Rebase for the Arch LBR
- Use kvcalloc to replace kcalloc (Andi)

Changes since V2:
- Remove global spin lock task_data_sys_wide_events_lock
Since the global spin lock has been removed, we cannot guarantee
that the allocation/assignments for existing threads and free are
serialized.
To fix it, in V3, we go through the task list when accounting for
each system-wide event, and assign the perf_ctx_data pointer if needed.
(In V2, we only do the assignment for the first system-wide event).
In V3, we also add a breaker in free process for system-wide event.
If there is new system-wide event accounted, stop the free process
immediately.
- Add a macro TASK_DATA_SYS_WIDE to indicate the PMU specific data
is used by system-wide events.

kernel/events/core.c | 380 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 380 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9bb9bee..bb1b27e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,7 @@
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
@@ -401,6 +402,39 @@ static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

+/* Number of active system-wide events which require PMU specific data */
+static atomic_t nr_task_data_sys_wide_events;
+
+/*
+ * There are two types of users of the PMU specific data: system-wide
+ * events and per-task events.
+ *
+ * The number of system-wide events is tracked by the global variable
+ * nr_task_data_sys_wide_events. The TASK_DATA_SYS_WIDE bit is set in a
+ * task's refcount to indicate that its PMU specific data is used by
+ * system-wide events.
+ *
+ * The number of per-task event users is tracked by the remaining bits of
+ * the refcount. Since TASK_DATA_SYS_WIDE is reserved for system-wide
+ * events, the number of per-task users is capped at about half of
+ * TASK_DATA_SYS_WIDE (see exceed_task_data_events_limit()).
+ */
+#define TASK_DATA_SYS_WIDE 0x1000000
+#define MAX_NR_TASK_DATA_EVENTS (TASK_DATA_SYS_WIDE >> 1)
+
+/* Tell whether @perf_ctx_data carries the TASK_DATA_SYS_WIDE marker. */
+static inline bool has_task_data_sys_wide(struct perf_ctx_data *perf_ctx_data)
+{
+	unsigned int users = refcount_read(&perf_ctx_data->refcount);
+
+	return (users & TASK_DATA_SYS_WIDE) != 0;
+}
+
+/*
+ * Tell whether the per-task share of the refcount (i.e. the refcount
+ * without the TASK_DATA_SYS_WIDE marker) is above MAX_NR_TASK_DATA_EVENTS.
+ */
+static inline bool exceed_task_data_events_limit(struct perf_ctx_data *perf_ctx_data)
+{
+	unsigned int users = refcount_read(&perf_ctx_data->refcount);
+
+	/* Strip the system-wide marker so only per-task users are counted */
+	if (has_task_data_sys_wide(perf_ctx_data))
+		users -= TASK_DATA_SYS_WIDE;
+
+	return users > MAX_NR_TASK_DATA_EVENTS;
+}
+
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
@@ -4768,6 +4802,288 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}

+/*
+ * Allocate a zeroed perf_ctx_data together with its PMU specific payload.
+ *
+ * @ctx_cache:     slab cache the payload (->data) is taken from
+ * @flags:         GFP flags used for both allocations
+ * @task_ctx_data: where the new object is stored on success
+ *
+ * Return: 0 on success, -EINVAL if @ctx_cache is NULL, -ENOMEM if either
+ * allocation fails. @task_ctx_data is only written on success.
+ */
+static int
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, gfp_t flags,
+		    struct perf_ctx_data **task_ctx_data)
+{
+	struct perf_ctx_data *cd;
+
+	if (!ctx_cache)
+		return -EINVAL;
+
+	cd = kzalloc(sizeof(struct perf_ctx_data), flags);
+	if (!cd)
+		return -ENOMEM;
+
+	cd->data = kmem_cache_zalloc(ctx_cache, flags);
+	if (!cd->data)
+		goto err_free;
+
+	/* Remember the cache so the payload can be matched to it later */
+	cd->ctx_cache = ctx_cache;
+	*task_ctx_data = cd;
+	return 0;
+
+err_free:
+	kfree(cd);
+	return -ENOMEM;
+}
+
+/*
+ * Release a perf_ctx_data obtained from alloc_perf_ctx_data().
+ *
+ * The payload was taken from ctx_data->ctx_cache with kmem_cache_zalloc(),
+ * so it is handed back to that cache; the container itself came from
+ * kzalloc() and is released with kfree().
+ */
+static void
+free_perf_ctx_data(struct perf_ctx_data *ctx_data)
+{
+	kmem_cache_free(ctx_data->ctx_cache, ctx_data->data);
+	kfree(ctx_data);
+}
+
+/* RCU callback: free the perf_ctx_data that embeds @rcu_head. */
+static void
+free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+	free_perf_ctx_data(container_of(rcu_head, struct perf_ctx_data, rcu_head));
+}
+
+/*
+ * Attach PMU specific data to @task on behalf of a per-task event.
+ *
+ * @task:      task the event targets
+ * @ctx_cache: slab cache the PMU specific payload is allocated from
+ *
+ * If @task already owns a perf_ctx_data, only its refcount is bumped;
+ * otherwise a new one is installed with a refcount of 1, plus
+ * TASK_DATA_SYS_WIDE when system-wide events are active.
+ *
+ * Return: 0 on success, the alloc_perf_ctx_data() error if the
+ * allocation fails, -EINVAL if the per-task user limit is exceeded.
+ */
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache)
+{
+	struct perf_ctx_data *ctx_data, *tsk_data;
+	int ret;
+
+	/*
+	 * To make the code RT friendly, keep both the allocation and the
+	 * release of an unused allocation out of the raw spinlock.
+	 */
+	ret = alloc_perf_ctx_data(ctx_cache, GFP_KERNEL, &ctx_data);
+	if (ret)
+		return ret;
+
+	raw_spin_lock(&task->perf_ctx_data_lock);
+
+	tsk_data = rcu_dereference_protected(task->perf_ctx_data,
+			lockdep_is_held(&task->perf_ctx_data_lock));
+	if (!tsk_data) {
+		refcount_set(&ctx_data->refcount, 1);
+		/* System-wide event is active as well */
+		if (atomic_read(&nr_task_data_sys_wide_events))
+			refcount_add(TASK_DATA_SYS_WIDE, &ctx_data->refcount);
+
+		rcu_assign_pointer(task->perf_ctx_data, ctx_data);
+		/* Now owned by the task; nothing left to free below */
+		ctx_data = NULL;
+	} else if (WARN_ON_ONCE(exceed_task_data_events_limit(tsk_data))) {
+		ret = -EINVAL;
+	} else {
+		refcount_inc(&tsk_data->refcount);
+	}
+
+	raw_spin_unlock(&task->perf_ctx_data_lock);
+
+	/* The task already had its data: drop the spare allocation */
+	if (ctx_data)
+		free_perf_ctx_data(ctx_data);
+
+	return ret;
+}
+
+/*
+ * Attach PMU specific data to every existing thread on behalf of a
+ * system-wide event.
+ *
+ * @ctx_cache: slab cache the PMU specific payload is allocated from
+ *
+ * Increments nr_task_data_sys_wide_events, then walks all threads and
+ * either sets TASK_DATA_SYS_WIDE on the perf_ctx_data a thread already
+ * owns or installs a freshly allocated one. The objects are allocated
+ * ahead of the walk, outside of the per-task raw spinlock: a first round
+ * with GFP_ATOMIC and, if some threads were left without data, a second
+ * round with GFP_KERNEL.
+ *
+ * Return: 0 on success, also when some per-task allocations failed (a
+ * debug message is printed once); -ENOMEM if the temporary pointer array
+ * cannot be allocated, in which case the counter is left untouched.
+ */
+static int
+attach_system_wide_ctx_data(struct kmem_cache *ctx_cache)
+{
+	int i, num_thread, max_thread, pos, nr_failed_alloc;
+	struct perf_ctx_data *tsk_data;
+	struct perf_ctx_data **data;
+	struct task_struct *g, *p;
+	gfp_t flags = GFP_ATOMIC;
+	bool re_alloc = true;
+
+	/* Snapshot of the thread count; it is also the capacity of data[] */
+	max_thread = nr_threads;
+	num_thread = max_thread;
+
+	data = kvcalloc(max_thread, sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		printk_once(KERN_DEBUG
+			    "Failed to allocate space for LBR callstack. "
+			    "The LBR callstack for all tasks may be cutoff.\n");
+		return -ENOMEM;
+	}
+
+	atomic_inc(&nr_task_data_sys_wide_events);
+
+repeat:
+	/*
+	 * Allocate perf_ctx_data for all existing threads.
+	 * The perf_ctx_data for new threads will be allocated in
+	 * perf_event_fork().
+	 * Do a quick allocation in first round with GFP_ATOMIC.
+	 */
+	for (i = 0; i < num_thread; i++) {
+		if (alloc_perf_ctx_data(ctx_cache, flags, &data[i]))
+			break;
+	}
+	/* Only data[0..num_thread) hold objects we still own */
+	num_thread = i;
+	nr_failed_alloc = 0;
+	pos = 0;
+
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		raw_spin_lock(&p->perf_ctx_data_lock);
+		tsk_data = rcu_dereference_protected(p->perf_ctx_data,
+				lockdep_is_held(&p->perf_ctx_data_lock));
+		if (tsk_data) {
+			/*
+			 * The perf_ctx_data for this thread may have been
+			 * allocated by a per-task event.
+			 * Only update the refcount in that case.
+			 */
+			if (!has_task_data_sys_wide(tsk_data))
+				refcount_add(TASK_DATA_SYS_WIDE, &tsk_data->refcount);
+			raw_spin_unlock(&p->perf_ctx_data_lock);
+			continue;
+		}
+
+		if (pos < num_thread) {
+			refcount_set(&data[pos]->refcount, TASK_DATA_SYS_WIDE);
+			rcu_assign_pointer(p->perf_ctx_data, data[pos++]);
+		} else {
+			/*
+			 * The quick allocation in the first round may have
+			 * failed, or there are more threads than expected.
+			 * Track the number in nr_failed_alloc.
+			 */
+			nr_failed_alloc++;
+		}
+		raw_spin_unlock(&p->perf_ctx_data_lock);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Some threads are still without data: retry once with GFP_KERNEL.
+	 * A failure is only counted once every pre-allocated object has
+	 * been handed to a thread, so the slots of data[] can be reused.
+	 */
+	if (re_alloc && nr_failed_alloc) {
+		num_thread = nr_failed_alloc;
+		/* Never allocate past the end of data[] */
+		if (num_thread > max_thread)
+			num_thread = max_thread;
+		flags = GFP_KERNEL;
+		re_alloc = false;
+		goto repeat;
+	}
+
+	if (nr_failed_alloc) {
+		printk_once(KERN_DEBUG
+			    "Failed to allocate space for LBR callstack. "
+			    "The LBR callstack for some tasks may be cutoff.\n");
+	}
+
+	/* Release the pre-allocated objects no thread needed */
+	for (; pos < num_thread; pos++)
+		free_perf_ctx_data(data[pos]);
+
+	kvfree(data);
+	return 0;
+}
+
+/*
+ * Attach PMU specific data for @event: to its target task for a per-task
+ * event, to all tasks for a system-wide event (no target task).
+ *
+ * Return: result of the per-task or system-wide attach helper.
+ */
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+	struct kmem_cache *cache = event->pmu->task_ctx_cache;
+	struct task_struct *target = event->hw.target;
+
+	if (!target)
+		return attach_system_wide_ctx_data(cache);
+
+	return attach_task_ctx_data(target, cache);
+}
+
+/**
+ * detach_task_ctx_data - Drop a per-task reference on a task's
+ *			   perf_ctx_data
+ * @task: Target Task
+ * @force: Release the perf_ctx_data regardless of its refcount
+ *
+ * Without @force, one reference is dropped and the perf_ctx_data is
+ * released only when that was the last one. With @force, the refcount
+ * is not consulted at all.
+ * The release clears the task's RCU pointer and frees the object from an
+ * RCU callback. The per-task lock serializes the writers of the pointer.
+ */
+static void
+detach_task_ctx_data(struct task_struct *task, bool force)
+{
+	struct perf_ctx_data *cd;
+
+	raw_spin_lock(&task->perf_ctx_data_lock);
+
+	cd = rcu_dereference_protected(task->perf_ctx_data,
+			lockdep_is_held(&task->perf_ctx_data_lock));
+	if (cd) {
+		bool release = force;
+
+		if (!force) {
+			/* Only the system-wide marker left: no per-task user */
+			WARN_ON_ONCE(refcount_read(&cd->refcount) == TASK_DATA_SYS_WIDE);
+			release = refcount_dec_and_test(&cd->refcount);
+		}
+
+		if (release) {
+			RCU_INIT_POINTER(task->perf_ctx_data, NULL);
+			call_rcu(&cd->rcu_head, free_perf_ctx_data_rcu);
+		}
+	}
+
+	raw_spin_unlock(&task->perf_ctx_data_lock);
+}
+
+/**
+ * detach_task_ctx_data_sys_wide - Drop the system-wide reference on a
+ *				    task's perf_ctx_data
+ * @task: Target Task
+ *
+ * The caller must hold task->perf_ctx_data_lock.
+ * Remove TASK_DATA_SYS_WIDE from the refcount and, if no per-task users
+ * remain, clear the task's RCU pointer and free the perf_ctx_data from
+ * an RCU callback.
+ */
+static void
+detach_task_ctx_data_sys_wide(struct task_struct *task)
+{
+	struct perf_ctx_data *ctx_data;
+
+	lockdep_assert_held(&task->perf_ctx_data_lock);
+
+	ctx_data = rcu_dereference_protected(task->perf_ctx_data,
+			lockdep_is_held(&task->perf_ctx_data_lock));
+	if (!ctx_data)
+		return;
+
+	/*
+	 * Without the marker the refcount only counts per-task users, and
+	 * subtracting TASK_DATA_SYS_WIDE would underflow it. Warn and leave
+	 * those references alone.
+	 */
+	if (WARN_ON_ONCE(!has_task_data_sys_wide(ctx_data)))
+		return;
+
+	if (!refcount_sub_and_test(TASK_DATA_SYS_WIDE, &ctx_data->refcount))
+		return;
+
+	RCU_INIT_POINTER(task->perf_ctx_data, NULL);
+	call_rcu(&ctx_data->rcu_head, free_perf_ctx_data_rcu);
+}
+
+/*
+ * Drop one system-wide user of the PMU specific data.
+ *
+ * Only when the last system-wide event goes away (the counter reaches
+ * zero) are all threads walked and their system-wide reference dropped
+ * via detach_task_ctx_data_sys_wide(), each under the thread's
+ * perf_ctx_data_lock.
+ *
+ * NOTE(review): the early exit uses goto instead of break, presumably
+ * because for_each_process_thread() is a nested-loop macro - confirm
+ * before restructuring.
+ */
+static void detach_system_wide_ctx_data(void)
+{
+ struct task_struct *g, *p;
+
+ if (!atomic_dec_and_test(&nr_task_data_sys_wide_events))
+ return;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ raw_spin_lock(&p->perf_ctx_data_lock);
+
+ /*
+ * A new system-wide event may be attached while freeing
+ * everything for the old event.
+ * If so, stop the free process immediately.
+ * For the freed threads, attach_system_wide_ctx_data()
+ * will re-allocate the space.
+ */
+ if (unlikely(atomic_read(&nr_task_data_sys_wide_events))) {
+ raw_spin_unlock(&p->perf_ctx_data_lock);
+ goto unlock;
+ }
+
+ detach_task_ctx_data_sys_wide(p);
+ raw_spin_unlock(&p->perf_ctx_data_lock);
+ }
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Counterpart of attach_perf_ctx_data(): drop the reference @event holds,
+ * on its target task for a per-task event, system-wide otherwise.
+ */
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *target = event->hw.target;
+
+	if (!target) {
+		detach_system_wide_ctx_data();
+		return;
+	}
+
+	detach_task_ctx_data(target, false);
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -4805,6 +5121,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_bpf_events);
if (event->attr.text_poke)
atomic_dec(&nr_text_poke_events);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);

if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7841,10 +8159,63 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}

+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ *
+ * @child:  the new task
+ * @parent: task whose perf_ctx_data supplies the slab cache to use
+ *
+ * Nothing is done when no system-wide event is active, when @parent has
+ * no perf_ctx_data, or when the allocation fails.
+ */
+static void perf_event_alloc_task_data(struct task_struct *child,
+				       struct task_struct *parent)
+{
+	struct kmem_cache *ctx_cache = NULL;
+	struct perf_ctx_data *ctx_data, *child_data;
+
+	if (!atomic_read(&nr_task_data_sys_wide_events))
+		return;
+
+	rcu_read_lock();
+	ctx_data = rcu_dereference(parent->perf_ctx_data);
+	if (ctx_data)
+		ctx_cache = ctx_data->ctx_cache;
+	rcu_read_unlock();
+
+	if (!ctx_cache)
+		return;
+
+	/* Allocate outside of the raw spinlock */
+	if (alloc_perf_ctx_data(ctx_cache, GFP_KERNEL, &ctx_data))
+		return;
+
+	raw_spin_lock(&child->perf_ctx_data_lock);
+
+	child_data = rcu_dereference_protected(child->perf_ctx_data,
+			lockdep_is_held(&child->perf_ctx_data_lock));
+	if (child_data) {
+		/*
+		 * The child already has its data, e.g. from a per-task
+		 * event. As in attach_system_wide_ctx_data(), only mark it
+		 * as used by system-wide events.
+		 */
+		if (!has_task_data_sys_wide(child_data))
+			refcount_add(TASK_DATA_SYS_WIDE, &child_data->refcount);
+	} else {
+		refcount_set(&ctx_data->refcount, TASK_DATA_SYS_WIDE);
+		rcu_assign_pointer(child->perf_ctx_data, ctx_data);
+		/* Now owned by the child; nothing left to free below */
+		ctx_data = NULL;
+	}
+
+	/*
+	 * System-wide event may be unaccounted while attaching the
+	 * perf_ctx_data.
+	 * For example,
+	 *		CPU A				CPU B
+	 *	perf_event_alloc_task_data():
+	 *	read(nr_task_data_sys_wide_events)
+	 *					detach_system_wide_ctx_data()
+	 *	alloc_perf_ctx_data()
+	 *	rcu_assign_pointer(perf_ctx_data);
+	 *
+	 * Without the check below, the perf_ctx_data may never be freed
+	 * until the task is terminated.
+	 */
+	if (unlikely(!atomic_read(&nr_task_data_sys_wide_events)))
+		detach_task_ctx_data_sys_wide(child);
+
+	raw_spin_unlock(&child->perf_ctx_data_lock);
+
+	/* Unused spare: free it outside of the raw spinlock */
+	if (ctx_data)
+		free_perf_ctx_data(ctx_data);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ /* Give the new task PMU specific data if system-wide events need it; current is passed as the parent */
+ perf_event_alloc_task_data(task, current);
}

/*
@@ -11614,11 +11985,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
goto err_callchain_buffer;

+	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+		/*
+		 * err is 0 after the check above; record the attach result
+		 * so the failure is not reported with err == 0.
+		 */
+		err = attach_perf_ctx_data(event);
+		if (err)
+			goto err_task_ctx_data;
+	}
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);

return event;

+err_task_ctx_data:
+	/*
+	 * The callchain buffers are put by err_callchain_buffer right below,
+	 * so they must not be put here as well.
+	 * NOTE(review): assumes the step preceding the attach is
+	 * security_perf_event_alloc() - confirm against the full function.
+	 */
+	security_perf_event_free(event);
err_callchain_buffer:
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
@@ -12696,6 +13074,8 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ detach_task_ctx_data(child, true);
}

static void perf_free_event(struct perf_event *event,
--
2.7.4