Re: [PATCH v6 1/4] perf: Support PERF_SAMPLE_READ with inherit

From: Namhyung Kim
Date: Thu May 23 2024 - 21:39:17 EST


Hello,

On Tue, May 21, 2024 at 6:30 AM Ben Gainey <ben.gainey@xxxxxxx> wrote:
>
> This change allows events to use PERF_SAMPLE READ with inherit
> so long as PERF_SAMPLE_TID is also set.
>
> In this configuration, an event will be inherited into any
> child processes / threads, allowing convenient profiling of a
> multiprocess or multithreaded application, whilst allowing
> profiling tools to collect per-thread samples, in particular
> of groups of counters.
>
> The read_format field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE
> are changed by this new configuration, but calls to `read()` on the same
> event file descriptor are unaffected and continue to return the
> cumulative total.
>
> Signed-off-by: Ben Gainey <ben.gainey@xxxxxxx>

Acked-by: Namhyung Kim <namhyung@xxxxxxxxxx>

Thanks,
Namhyung

> ---
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 78 ++++++++++++++++++++++++++++----------
> 2 files changed, 58 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d2a15c0c6f8a9..e7eed33c50f16 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -932,6 +932,7 @@ struct perf_event_context {
>
> int nr_task_data;
> int nr_stat;
> + int nr_inherit_read;
> int nr_freq;
> int rotate_disable;
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 724e6d7e128f3..e7c847956ebff 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1767,6 +1767,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
> event = rb_entry_safe(rb_next(&event->group_node), \
> typeof(*event), group_node))
>
> +/*
> + * Does the event attribute request inherit with PERF_SAMPLE_READ
> + */
> +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
> +{
> + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
> +}
> +
> /*
> * Add an event from the lists for its context.
> * Must be called with ctx->mutex and ctx->lock held.
> @@ -1797,6 +1805,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
> ctx->nr_user++;
> if (event->attr.inherit_stat)
> ctx->nr_stat++;
> + if (has_inherit_and_sample_read(&event->attr))
> + ctx->nr_inherit_read++;
>
> if (event->state > PERF_EVENT_STATE_OFF)
> perf_cgroup_event_enable(event, ctx);
> @@ -2021,6 +2031,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
> ctx->nr_user--;
> if (event->attr.inherit_stat)
> ctx->nr_stat--;
> + if (has_inherit_and_sample_read(&event->attr))
> + ctx->nr_inherit_read--;
>
> list_del_rcu(&event->event_entry);
>
> @@ -3529,11 +3541,18 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> perf_ctx_disable(ctx, false);
>
> /* PMIs are disabled; ctx->nr_pending is stable. */
> - if (local_read(&ctx->nr_pending) ||
> + if (ctx->nr_inherit_read ||
> + next_ctx->nr_inherit_read ||
> + local_read(&ctx->nr_pending) ||
> local_read(&next_ctx->nr_pending)) {
> /*
> * Must not swap out ctx when there's pending
> * events that rely on the ctx->task relation.
> + *
> + * Likewise, when a context contains inherit +
> + * SAMPLE_READ events they should be switched
> + * out using the slow path so that they are
> + * treated as if they were distinct contexts.
> */
> raw_spin_unlock(&next_ctx->lock);
> rcu_read_unlock();
> @@ -4533,11 +4552,19 @@ static void __perf_event_read(void *info)
> raw_spin_unlock(&ctx->lock);
> }
>
> -static inline u64 perf_event_count(struct perf_event *event)
> +static inline u64 perf_event_count_cumulative(struct perf_event *event)
> {
> return local64_read(&event->count) + atomic64_read(&event->child_count);
> }
>
> +static inline u64 perf_event_count(struct perf_event *event, bool self_value_only)
> +{
> + if (self_value_only && has_inherit_and_sample_read(&event->attr))
> + return local64_read(&event->count);
> +
> + return perf_event_count_cumulative(event);
> +}
> +
> static void calc_timer_values(struct perf_event *event,
> u64 *now,
> u64 *enabled,
> @@ -5454,7 +5481,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
> mutex_lock(&event->child_mutex);
>
> (void)perf_event_read(event, false);
> - total += perf_event_count(event);
> + total += perf_event_count_cumulative(event);
>
> *enabled += event->total_time_enabled +
> atomic64_read(&event->child_total_time_enabled);
> @@ -5463,7 +5490,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
>
> list_for_each_entry(child, &event->child_list, child_list) {
> (void)perf_event_read(child, false);
> - total += perf_event_count(child);
> + total += perf_event_count_cumulative(child);
> *enabled += child->total_time_enabled;
> *running += child->total_time_running;
> }
> @@ -5545,14 +5572,14 @@ static int __perf_read_group_add(struct perf_event *leader,
> /*
> * Write {count,id} tuples for every sibling.
> */
> - values[n++] += perf_event_count(leader);
> + values[n++] += perf_event_count_cumulative(leader);
> if (read_format & PERF_FORMAT_ID)
> values[n++] = primary_event_id(leader);
> if (read_format & PERF_FORMAT_LOST)
> values[n++] = atomic64_read(&leader->lost_samples);
>
> for_each_sibling_event(sub, leader) {
> - values[n++] += perf_event_count(sub);
> + values[n++] += perf_event_count_cumulative(sub);
> if (read_format & PERF_FORMAT_ID)
> values[n++] = primary_event_id(sub);
> if (read_format & PERF_FORMAT_LOST)
> @@ -6132,7 +6159,7 @@ void perf_event_update_userpage(struct perf_event *event)
> ++userpg->lock;
> barrier();
> userpg->index = perf_event_index(event);
> - userpg->offset = perf_event_count(event);
> + userpg->offset = perf_event_count_cumulative(event);
> if (userpg->index)
> userpg->offset -= local64_read(&event->hw.prev_count);
>
> @@ -7194,13 +7221,14 @@ void perf_event__output_id_sample(struct perf_event *event,
>
> static void perf_output_read_one(struct perf_output_handle *handle,
> struct perf_event *event,
> - u64 enabled, u64 running)
> + u64 enabled, u64 running,
> + bool from_sample)
> {
> u64 read_format = event->attr.read_format;
> u64 values[5];
> int n = 0;
>
> - values[n++] = perf_event_count(event);
> + values[n++] = perf_event_count(event, from_sample);
> if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> values[n++] = enabled +
> atomic64_read(&event->child_total_time_enabled);
> @@ -7218,8 +7246,9 @@ static void perf_output_read_one(struct perf_output_handle *handle,
> }
>
> static void perf_output_read_group(struct perf_output_handle *handle,
> - struct perf_event *event,
> - u64 enabled, u64 running)
> + struct perf_event *event,
> + u64 enabled, u64 running,
> + bool from_sample)
> {
> struct perf_event *leader = event->group_leader, *sub;
> u64 read_format = event->attr.read_format;
> @@ -7245,7 +7274,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
> (leader->state == PERF_EVENT_STATE_ACTIVE))
> leader->pmu->read(leader);
>
> - values[n++] = perf_event_count(leader);
> + values[n++] = perf_event_count(leader, from_sample);
> if (read_format & PERF_FORMAT_ID)
> values[n++] = primary_event_id(leader);
> if (read_format & PERF_FORMAT_LOST)
> @@ -7260,7 +7289,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
> (sub->state == PERF_EVENT_STATE_ACTIVE))
> sub->pmu->read(sub);
>
> - values[n++] = perf_event_count(sub);
> + values[n++] = perf_event_count(sub, from_sample);
> if (read_format & PERF_FORMAT_ID)
> values[n++] = primary_event_id(sub);
> if (read_format & PERF_FORMAT_LOST)
> @@ -7281,9 +7310,14 @@ static void perf_output_read_group(struct perf_output_handle *handle,
> * The problem is that its both hard and excessively expensive to iterate the
> * child list, not to mention that its impossible to IPI the children running
> * on another CPU, from interrupt/NMI context.
> + *
> + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
> + * counts rather than attempting to accumulate some value across all children on
> + * all cores.
> */
> static void perf_output_read(struct perf_output_handle *handle,
> - struct perf_event *event)
> + struct perf_event *event,
> + bool from_sample)
> {
> u64 enabled = 0, running = 0, now;
> u64 read_format = event->attr.read_format;
> @@ -7301,9 +7335,9 @@ static void perf_output_read(struct perf_output_handle *handle,
> calc_timer_values(event, &now, &enabled, &running);
>
> if (event->attr.read_format & PERF_FORMAT_GROUP)
> - perf_output_read_group(handle, event, enabled, running);
> + perf_output_read_group(handle, event, enabled, running, from_sample);
> else
> - perf_output_read_one(handle, event, enabled, running);
> + perf_output_read_one(handle, event, enabled, running, from_sample);
> }
>
> void perf_output_sample(struct perf_output_handle *handle,
> @@ -7343,7 +7377,7 @@ void perf_output_sample(struct perf_output_handle *handle,
> perf_output_put(handle, data->period);
>
> if (sample_type & PERF_SAMPLE_READ)
> - perf_output_read(handle, event);
> + perf_output_read(handle, event, true);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> int size = 1;
> @@ -7944,7 +7978,7 @@ perf_event_read_event(struct perf_event *event,
> return;
>
> perf_output_put(&handle, read_event);
> - perf_output_read(&handle, event);
> + perf_output_read(&handle, event, false);
> perf_event__output_id_sample(event, &handle, &sample);
>
> perf_output_end(&handle);
> @@ -12006,10 +12040,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> local64_set(&hwc->period_left, hwc->sample_period);
>
> /*
> - * We currently do not support PERF_SAMPLE_READ on inherited events.
> + * We do not support PERF_SAMPLE_READ on inherited events unless
> + * PERF_SAMPLE_TID is also selected, which allows inherited events to
> + * collect per-thread samples.
> * See perf_output_read().
> */
> - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
> + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
> goto err_ns;
>
> if (!has_branch_stack(event))
> @@ -13033,7 +13069,7 @@ static void sync_child_event(struct perf_event *child_event)
> perf_event_read_event(child_event, task);
> }
>
> - child_val = perf_event_count(child_event);
> + child_val = perf_event_count_cumulative(child_event);
>
> /*
> * Add back the child's count to the parent's count:
> --
> 2.45.1
>