Re: [PATCH v12] perf: Sharing PMU counters across compatible events
From: Song Liu
Date: Mon Apr 20 2020 - 21:13:48 EST
> On Apr 20, 2020, at 4:04 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Tue, Mar 31, 2020 at 12:55:33AM -0700, Song Liu wrote:
>> @@ -1778,6 +1775,246 @@ perf_event_groups_next(struct perf_event *event)
>> event = rb_entry_safe(rb_next(&event->group_node), \
>> typeof(*event), group_node))
>>
>> +static inline bool perf_event_can_share(struct perf_event *event)
>> +{
>> + /* only share hardware counting events */
>> + return !is_sampling_event(event);
>> + return !is_software_event(event) && !is_sampling_event(event);
>> +}
>
> ISTR pointing out before that one of those returns is superfluous.
This is totally my fault. Will fix in next version.
>
>> +/*
>> + * Returns whether the two events can share a PMU counter.
>> + *
>> + * Note: This function does NOT check perf_event_can_share() for
>> + * the two events, they should be checked before this function
>> + */
[...]
>> +
>> + barrier();
>> +
>> + WRITE_ONCE(event->dup_master, event);
>> + event->dup_master = event;
>> +}
>
> OK, set up master_count, dup_count, barrier, set dup_master.
>
>> +/* tear down dup_master, no more sharing for this event */
>> +static void perf_event_exit_dup_master(struct perf_event *event)
>> +{
>> + WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
>> + event->state > PERF_EVENT_STATE_INACTIVE);
>> +
>> + /* restore event->count and event->child_count */
>> + local64_set(&event->count, local64_read(&event->master_count));
>> +
>> + event->dup_active = 0;
>> + WRITE_ONCE(event->dup_master, NULL);
>> +
>> + barrier();
>> +}
>
> XXX ordering doesn't make sense, I think you want the barrier() between
> setting count and dup_master, that way you'll get the correct result out
> of perf_event_count() no matter the timing.
Will fix.
>
>> +#define EVENT_TOMBSTONE ((void *)-1L)
>> +
>> +/*
>> + * sync data count from dup_master to event, called on event_pmu_read()
>> + * and event_pmu_del()
>> + */
>> +static void
>> +event_pmu_read_dup(struct perf_event *event, struct perf_event *master)
>> +{
>> + u64 prev_count, new_count;
>> +
>> + if (master == EVENT_TOMBSTONE)
>> + return;
>> +
>> +again:
>> + prev_count = local64_read(&event->dup_count);
>> + if (master->state > PERF_EVENT_STATE_INACTIVE)
>> + master->pmu->read(master);
>> + new_count = local64_read(&master->count);
>> + if (local64_cmpxchg(&event->dup_count, prev_count, new_count) != prev_count)
>> + goto again;
>> +
>> + if (event == master)
>> + local64_add(new_count - prev_count, &event->master_count);
>> + else
>> + local64_add(new_count - prev_count, &event->count);
>> +}
>> +
>> +/* After adding a event to the ctx, try find compatible event(s). */
>> +static void
>> +perf_event_setup_dup(struct perf_event *event, struct perf_event_context *ctx)
>> +{
>> + struct perf_event *tmp;
>> +
>> + if (event->state < PERF_EVENT_STATE_OFF ||
>> + !perf_event_can_share(event))
>> + return;
>> +
>> + /* look for dup with other events */
>> + list_for_each_entry(tmp, &ctx->event_list, event_entry) {
>> + if (tmp == event ||
>> + !perf_event_can_share(tmp) ||
>> + !perf_event_compatible(event, tmp))
>> + continue;
>> + if (tmp->state < PERF_EVENT_STATE_INACTIVE)
>> + continue;
>> +
>> + /* first dup, pick tmp as the master */
>> + if (!tmp->dup_master) {
>> + if (tmp->state == PERF_EVENT_STATE_ACTIVE)
>> + tmp->pmu->read(tmp);
>> + perf_event_init_dup_master(tmp);
>> + }
>> +
>> + event->dup_master = tmp->dup_master;
>> +
>> + break;
>> + }
>> +}
>> +
>> +/* Remove dup_master for the event */
>> +static void
>> +perf_event_remove_dup(struct perf_event *event, struct perf_event_context *ctx)
>> +
>> +{
>> + struct perf_event *tmp, *new_master;
>> + int dup_count, active_count;
>> + int ret;
>> +
>> + /* no sharing */
>> + if (!event->dup_master)
>> + return;
>> +
>> + WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
>> + event->state > PERF_EVENT_STATE_ENABLED);
>> +
>> + /* this event is not the master */
>> + if (event->dup_master != event) {
>> + event->dup_master = NULL;
>> + return;
>> + }
>> +
>> + active_count = event->dup_active;
>> + if (active_count) {
>> + perf_pmu_disable(event->pmu);
>> + event->pmu->del(event, 0);
>> + }
>> +
>> + /* this event is the master */
>> + dup_count = 0;
>> + new_master = NULL;
>> + list_for_each_entry(tmp, &ctx->event_list, event_entry) {
>> + u64 count;
>> +
>> + if (tmp->dup_master != event || tmp == event)
>> + continue;
>> + if (!new_master) {
>> + new_master = tmp;
>> + list_del_init(&new_master->dup_active_list);
>> + }
>> +
>> + if (tmp->state == PERF_EVENT_STATE_ACTIVE) {
>> + /* sync read from old master */
>> + event_pmu_read_dup(tmp, event);
>> + }
>> + /*
>> + * Flip an active event to a new master; this is tricky
>> + * because for an active event event_pmu_read() can be
>> + * called at any time from NMI context.
>> + *
>> + * This means we need to have ->dup_master and ->dup_count
>> + * consistent at all times. Of course we cannot do two
>> + * writes at once :/
>> + *
>> + * Instead, flip ->dup_master to EVENT_TOMBSTONE, this will
>> + * make event_pmu_read_dup() NOP. Then we can set
>> + * ->dup_count and finally set ->dup_master to the
>> + * new_master to let event_pmu_read_dup() rip.
>> + */
>> + WRITE_ONCE(tmp->dup_master, EVENT_TOMBSTONE);
>> + barrier();
>> +
>> + count = local64_read(&new_master->count);
>> + local64_set(&tmp->dup_count, count);
>> +
>> + if (tmp == new_master)
>> + local64_set(&tmp->master_count, count);
>> + else if (tmp->state == PERF_EVENT_STATE_ACTIVE)
>> + list_move_tail(&tmp->dup_active_list,
>> + &new_master->dup_active_list);
>> +
>> + barrier();
>> + WRITE_ONCE(tmp->dup_master, new_master);
>> + dup_count++;
>> + }
>> +
>> + perf_event_exit_dup_master(event);
>> +
>> + if (!dup_count)
>> + return;
>> +
>> + if (active_count) {
>> + /* copy hardware configure to switch faster */
>> + if (event->pmu->copy_hw_config)
>> + event->pmu->copy_hw_config(event, new_master);
>> +
>> + ret = new_master->pmu->add(new_master, PERF_EF_START);
>> + /*
>> + * Since we just removed the old master (@event), it should be
>> + * impossible to fail to schedule the new master, an identical
>> + * event.
>> + */
>> + WARN_ON_ONCE(ret);
>> + if (new_master->state == PERF_EVENT_STATE_INACTIVE) {
>> + /*
>> + * We don't need to update time, so don't call
>> + * perf_event_set_state().
>> + */
>> + new_master->state = PERF_EVENT_STATE_ENABLED;
>> + }
>> + perf_pmu_enable(new_master->pmu);
>> + }
>> +
>> + if (dup_count == 1) {
>> + /*
>> + * We set up as a master, but there aren't any more duplicates.
>> + * Simply clear ->dup_master, as ->master_count == ->count per
>> + * the above.
>> + */
>> + WRITE_ONCE(new_master->dup_master, NULL);
>> + } else {
>> + new_master->dup_active = active_count;
>> + }
>> +}
>> +
>> /*
>> * Add an event from the lists for its context.
>> * Must be called with ctx->mutex and ctx->lock held.
>
>> @@ -4223,7 +4534,14 @@ static void __perf_event_read(void *info)
>>
>> static inline u64 perf_event_count(struct perf_event *event)
>> {
>> - return local64_read(&event->count) + atomic64_read(&event->child_count);
>> + u64 count;
>> +
>> + if (likely(event->dup_master != event))
>> + count = local64_read(&event->count);
>> + else
>> + count = local64_read(&event->master_count);
>> +
>> + return count + atomic64_read(&event->child_count);
>> }
>
> So lsat time I said something about SMP ordering here. Where did that
> go?
I guess we will need something with EVENT_TOMBSTONE? That's not clean, I guess.
>
> Specifically, without ordering it is possible to observe dup_master and
> dup_count out of order. So while we might then see dup_master, we might
> then also see an old dup_count, which would give 'interesting' effects.
>
> Granted, adding memory barriers all over this will suck :/
Can we somehow guarantee dup_master and the counts sits in the same
cache line? In that way, we can get rid of most of these barriers.
Thanks,
Song