+static int
+attach_system_wide_ctx_data(size_t ctx_size)
+{
+	int i, num_thread, pos, nr_failed_alloc;
+	unsigned long flags = GFP_ATOMIC;
+	struct perf_ctx_data *tsk_data;
+	struct perf_ctx_data **data;
+	struct task_struct *g, *p;
+	bool re_alloc = true;
+
+	/* Retrieve total number of threads */
+	num_thread = nr_threads;
+
+	data = kcalloc(num_thread, sizeof(*data), GFP_KERNEL);
This probably needs kvcalloc() for reliability and to avoid allocation stalls.
+	if (!data) {
+		printk_once(KERN_DEBUG
+			    "Failed to allocate space for LBR callstack. "
+			    "The LBR callstack for all tasks may be cut off.\n");
+		return -ENOMEM;
+	}
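
A minimal sketch of the kvcalloc() variant suggested above (illustration only,
not part of the patch): kvcalloc() tries a regular kmalloc() first and falls
back to vmalloc() if that fails, so a pointer array sized by nr_threads is far
less likely to fail or to stall in reclaim/compaction. The matching free of
"data" at the end of the function (not quoted here) would presumably have to
become kvfree():

	data = kvcalloc(num_thread, sizeof(*data), GFP_KERNEL);
	if (!data) {
		printk_once(KERN_DEBUG "Failed to allocate space for LBR callstack.\n");
		return -ENOMEM;
	}

	/* ... */

	kvfree(data);		/* counterpart of kvcalloc() */
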
+
+	atomic_inc(&nr_task_data_sys_wide_events);
+
+repeat:
+	/*
+	 * Allocate perf_ctx_data for all existing threads.
+	 * The perf_ctx_data for new threads will be allocated in
+	 * perf_event_fork().
+	 * Do a quick allocation in first round with GFP_ATOMIC.
+	 */
+	for (i = 0; i < num_thread; i++) {
+		if (alloc_perf_ctx_data(ctx_size, flags, &data[i]))
+			break;
+	}
+	num_thread = i;
+	nr_failed_alloc = 0;
+	pos = 0;
+
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		raw_spin_lock(&p->perf_ctx_data_lock);

That will be a lot of locks even for tasks that don't use perf, but I guess we
really need it, and it's bounded by the number of tasks.

+		tsk_data = p->perf_ctx_data;
+		if (tsk_data) {
+			/* Already allocated, e.g. by a per-task event. */
+			refcount_set(&tsk_data->refcount, TASK_DATA_SYS_WIDE);
+			raw_spin_unlock(&p->perf_ctx_data_lock);
+			continue;
+		}
+
+		if (pos < num_thread) {
+			refcount_set(&data[pos]->refcount, TASK_DATA_SYS_WIDE);
+			rcu_assign_pointer(p->perf_ctx_data, data[pos++]);
+		} else {
+			/*
+			 * The quick allocation in the first round may have failed.
+			 * Track the number in nr_failed_alloc.
+			 */
+			nr_failed_alloc++;
+		}
+		raw_spin_unlock(&p->perf_ctx_data_lock);
+	}
+	rcu_read_unlock();
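
The function as quoted stops here, but the "quick allocation in first round
with GFP_ATOMIC" comment and the repeat/re_alloc/nr_failed_alloc machinery
imply a second, sleeping pass. A hypothetical sketch of how that retry
presumably continues (reconstructed from the quoted context, not taken from
the patch):

	/*
	 * Some tasks were left without perf_ctx_data because the
	 * GFP_ATOMIC pass came up short.  Retry once with GFP_KERNEL,
	 * which may sleep but is far more likely to succeed.
	 */
	if (nr_failed_alloc && re_alloc) {
		num_thread = nr_failed_alloc;
		flags = GFP_KERNEL;
		re_alloc = false;
		goto repeat;
	}

	/* Free any data[] entries that were never handed out, then data itself. */

On the second pass, tasks that already have a perf_ctx_data are skipped by the
tsk_data check above, so only the previously uncovered tasks consume the fresh
allocations.
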
-Andi