Re: [PATCH v12] perf: Sharing PMU counters across compatible events
From: Peter Zijlstra
Date: Mon Apr 20 2020 - 19:05:00 EST
On Tue, Mar 31, 2020 at 12:55:33AM -0700, Song Liu wrote:
> @@ -1778,6 +1775,246 @@ perf_event_groups_next(struct perf_event *event)
> event = rb_entry_safe(rb_next(&event->group_node), \
> typeof(*event), group_node))
>
> +static inline bool perf_event_can_share(struct perf_event *event)
> +{
> + /* only share hardware counting events */
> + return !is_sampling_event(event);
> + return !is_software_event(event) && !is_sampling_event(event);
> +}
ISTR pointing out before that one of those returns is superfluous.
> +/*
> + * Returns whether the two events can share a PMU counter.
> + *
> + * Note: This function does NOT check perf_event_can_share() for
> + * the two events, they should be checked before this function
> + */
> +static inline bool perf_event_compatible(struct perf_event *event_a,
> + struct perf_event *event_b)
> +{
> + return memcmp(&event_a->attr, &event_b->attr, event_a->attr.size) == 0;
> +}
> +
> +static void perf_event_init_dup_master(struct perf_event *event)
> +{
> + bool is_active = event->state == PERF_EVENT_STATE_ACTIVE;
> + s64 count;
> +
> + WARN_ON_ONCE(event->dup_active != 0);
> + /*
> + * The event sharing scheme allows for duplicate events to be ACTIVE
> + * while the master is not. In order to facilitate this, the master
> + * will be put in the ENABLED state whenever it has active duplicates
> + * but is itself *not* ACTIVE.
> + *
> + * When ENABLED the master event is scheduled, but its counter must
> + * appear stalled. Since the PMU driver updates event->count, the
> + * master must keep a shadow counter for itself, this is
> + * event->master_count.
> + */
> + count = local64_read(&event->count);
> + local64_set(&event->master_count, count);
> +
> + if (is_active) {
> + local64_set(&event->dup_count, count);
> + event->dup_active = 1;
> + }
> +
> + barrier();
> +
> + WRITE_ONCE(event->dup_master, event);
> + event->dup_master = event;
> +}
OK, set up master_count, dup_count, barrier, set dup_master.
> +/* tear down dup_master, no more sharing for this event */
> +static void perf_event_exit_dup_master(struct perf_event *event)
> +{
> + WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> + event->state > PERF_EVENT_STATE_INACTIVE);
> +
> + /* restore event->count and event->child_count */
> + local64_set(&event->count, local64_read(&event->master_count));
> +
> + event->dup_active = 0;
> + WRITE_ONCE(event->dup_master, NULL);
> +
> + barrier();
> +}
XXX ordering doesn't make sense, I think you want the barrier() between
setting count and dup_master, that way you'll get the correct result out
of perf_event_count() no matter the timing.
> +#define EVENT_TOMBSTONE ((void *)-1L)
> +
> +/*
> + * sync data count from dup_master to event, called on event_pmu_read()
> + * and event_pmu_del()
> + */
> +static void
> +event_pmu_read_dup(struct perf_event *event, struct perf_event *master)
> +{
> + u64 prev_count, new_count;
> +
> + if (master == EVENT_TOMBSTONE)
> + return;
> +
> +again:
> + prev_count = local64_read(&event->dup_count);
> + if (master->state > PERF_EVENT_STATE_INACTIVE)
> + master->pmu->read(master);
> + new_count = local64_read(&master->count);
> + if (local64_cmpxchg(&event->dup_count, prev_count, new_count) != prev_count)
> + goto again;
> +
> + if (event == master)
> + local64_add(new_count - prev_count, &event->master_count);
> + else
> + local64_add(new_count - prev_count, &event->count);
> +}
> +
> +/* After adding a event to the ctx, try find compatible event(s). */
> +static void
> +perf_event_setup_dup(struct perf_event *event, struct perf_event_context *ctx)
> +{
> + struct perf_event *tmp;
> +
> + if (event->state < PERF_EVENT_STATE_OFF ||
> + !perf_event_can_share(event))
> + return;
> +
> + /* look for dup with other events */
> + list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> + if (tmp == event ||
> + !perf_event_can_share(tmp) ||
> + !perf_event_compatible(event, tmp))
> + continue;
> + if (tmp->state < PERF_EVENT_STATE_INACTIVE)
> + continue;
> +
> + /* first dup, pick tmp as the master */
> + if (!tmp->dup_master) {
> + if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> + tmp->pmu->read(tmp);
> + perf_event_init_dup_master(tmp);
> + }
> +
> + event->dup_master = tmp->dup_master;
> +
> + break;
> + }
> +}
> +
> +/* Remove dup_master for the event */
> +static void
> +perf_event_remove_dup(struct perf_event *event, struct perf_event_context *ctx)
> +
> +{
> + struct perf_event *tmp, *new_master;
> + int dup_count, active_count;
> + int ret;
> +
> + /* no sharing */
> + if (!event->dup_master)
> + return;
> +
> + WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> + event->state > PERF_EVENT_STATE_ENABLED);
> +
> + /* this event is not the master */
> + if (event->dup_master != event) {
> + event->dup_master = NULL;
> + return;
> + }
> +
> + active_count = event->dup_active;
> + if (active_count) {
> + perf_pmu_disable(event->pmu);
> + event->pmu->del(event, 0);
> + }
> +
> + /* this event is the master */
> + dup_count = 0;
> + new_master = NULL;
> + list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> + u64 count;
> +
> + if (tmp->dup_master != event || tmp == event)
> + continue;
> + if (!new_master) {
> + new_master = tmp;
> + list_del_init(&new_master->dup_active_list);
> + }
> +
> + if (tmp->state == PERF_EVENT_STATE_ACTIVE) {
> + /* sync read from old master */
> + event_pmu_read_dup(tmp, event);
> + }
> + /*
> + * Flip an active event to a new master; this is tricky
> + * because for an active event event_pmu_read() can be
> + * called at any time from NMI context.
> + *
> + * This means we need to have ->dup_master and ->dup_count
> + * consistent at all times. Of course we cannot do two
> + * writes at once :/
> + *
> + * Instead, flip ->dup_master to EVENT_TOMBSTONE, this will
> + * make event_pmu_read_dup() NOP. Then we can set
> + * ->dup_count and finally set ->dup_master to the
> + * new_master to let event_pmu_read_dup() rip.
> + */
> + WRITE_ONCE(tmp->dup_master, EVENT_TOMBSTONE);
> + barrier();
> +
> + count = local64_read(&new_master->count);
> + local64_set(&tmp->dup_count, count);
> +
> + if (tmp == new_master)
> + local64_set(&tmp->master_count, count);
> + else if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> + list_move_tail(&tmp->dup_active_list,
> + &new_master->dup_active_list);
> +
> + barrier();
> + WRITE_ONCE(tmp->dup_master, new_master);
> + dup_count++;
> + }
> +
> + perf_event_exit_dup_master(event);
> +
> + if (!dup_count)
> + return;
> +
> + if (active_count) {
> + /* copy hardware configure to switch faster */
> + if (event->pmu->copy_hw_config)
> + event->pmu->copy_hw_config(event, new_master);
> +
> + ret = new_master->pmu->add(new_master, PERF_EF_START);
> + /*
> + * Since we just removed the old master (@event), it should be
> + * impossible to fail to schedule the new master, an identical
> + * event.
> + */
> + WARN_ON_ONCE(ret);
> + if (new_master->state == PERF_EVENT_STATE_INACTIVE) {
> + /*
> + * We don't need to update time, so don't call
> + * perf_event_set_state().
> + */
> + new_master->state = PERF_EVENT_STATE_ENABLED;
> + }
> + perf_pmu_enable(new_master->pmu);
> + }
> +
> + if (dup_count == 1) {
> + /*
> + * We set up as a master, but there aren't any more duplicates.
> + * Simply clear ->dup_master, as ->master_count == ->count per
> + * the above.
> + */
> + WRITE_ONCE(new_master->dup_master, NULL);
> + } else {
> + new_master->dup_active = active_count;
> + }
> +}
> +
> /*
> * Add an event from the lists for its context.
> * Must be called with ctx->mutex and ctx->lock held.
> @@ -4223,7 +4534,14 @@ static void __perf_event_read(void *info)
>
> static inline u64 perf_event_count(struct perf_event *event)
> {
> - return local64_read(&event->count) + atomic64_read(&event->child_count);
> + u64 count;
> +
> + if (likely(event->dup_master != event))
> + count = local64_read(&event->count);
> + else
> + count = local64_read(&event->master_count);
> +
> + return count + atomic64_read(&event->child_count);
> }
So lsat time I said something about SMP ordering here. Where did that
go?
Specifically, without ordering it is possible to observe dup_master and
dup_count out of order. So while we might then see dup_master, we might
then also see an old dup_count, which would give 'interesting' effects.
Granted, adding memory barriers all over this will suck :/