Re: [PATCH v1 4/5] perf: Introduce address range filtering

From: Peter Zijlstra
Date: Fri Apr 22 2016 - 03:46:07 EST


On Thu, Apr 21, 2016 at 06:17:02PM +0300, Alexander Shishkin wrote:
> +struct addr_filter_setup_data {
> + struct perf_event *event;
> + unsigned long *offs;
> + unsigned long gen;
> +};
> +
> +static int __perf_event_addr_filters_setup(void *info)
> +{
> + struct addr_filter_setup_data *id = info;
> + struct perf_event *event = id->event;
> + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
> + unsigned long flags;
> +
> + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
> + return -EAGAIN;
> +
> + /* matches smp_wmb() in event_sched_in() */
> + smp_rmb();
> +
> + /*
> + * There is a window with interrupts enabled before we get here,
> + * so we need to check again lest we try to stop another cpu's event.
> + */
> + if (READ_ONCE(event->oncpu) != smp_processor_id())
> + return -EAGAIN;
> +
> + raw_spin_lock_irqsave(&ifh->lock, flags);

Since we only ever use this from cpu_function_call(), IRQs are guaranteed
to be off already -- you even rely on that in the above ->oncpu test -- so
the _irqsave() thing is entirely redundant, no?

> + /*
> + * In case of generations' mismatch, we don't have to do anything for
> + * this instance anymore; there will be another one with the *right* gen.
> + * If called to clear filters, always let it through.
> + */
> + if (id->gen == event->addr_filters_gen || !id->offs)
> + event->pmu->addr_filters_setup(event, id->offs, PERF_EF_RELOAD);
> + raw_spin_unlock_irqrestore(&ifh->lock, flags);
> +
> + return 0;
> +}
> +
> +static int perf_event_addr_filters_setup(struct perf_event *event,
> + unsigned long *offs,
> + unsigned long gen)
> +{
> + struct addr_filter_setup_data id = {
> + .event = event,
> + .offs = offs,
> + .gen = gen,
> + };
> + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
> + unsigned long flags;
> + int ret = 0;
> +
> + /*
> + * We can't use event_function_call() here, because that would
> + * require ctx::mutex, but one of our callers is called with
> + * mm::mmap_sem down, which would cause an inversion, see bullet
> + * (2) in put_event().
> + */
> + do {
> + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) {
> + raw_spin_lock_irqsave(&ifh->lock, flags);

And here, like in all the other sites, IRQs must be enabled, no?

> + /* see __perf_event_addr_filters_setup */

How is this not racy? The event could have gotten enabled between that
test and getting here, right?

> + if (gen == event->addr_filters_gen || !offs)
> + event->pmu->addr_filters_setup(event, offs, 0);
> + raw_spin_unlock_irqrestore(&ifh->lock, flags);
> +
> + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
> + break;

I am confused: calling ->addr_filters_setup() on an active event, from
the wrong cpu, is badness, no?

Is this something the callback has to handle?

> + /* otherwise, fall through to the cross-call */
> + }
> +
> + /* matches smp_wmb() in event_sched_in() */
> + smp_rmb();
> +
> + ret = cpu_function_call(READ_ONCE(event->oncpu),
> + __perf_event_addr_filters_setup, &id);
> + } while (ret == -EAGAIN);
> +
> + return ret;
> +}

This whole thing seems rather full of tricky bits :/


> @@ -6398,6 +6629,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
> /* .flags (attr_mmap2 only) */
> };
>
> + perf_addr_filters_adjust(vma);
> perf_event_mmap_event(&mmap_event);
> }

And this is the 'offending' site that requires all the trickiness...

> +/*
> + * Calculate event's address filters' ranges based on the
> + * task's existing mappings; if any of the existing mappings
> + * match the filters, update event's hw configuration and
> + * restart it if it's running.
> + */
> +static void perf_event_addr_filters_apply(struct perf_event *event)
> +{
> + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
> + struct perf_addr_filter *filter;
> + struct task_struct *task = READ_ONCE(event->ctx->task);
> + struct mm_struct *mm = NULL;
> + unsigned int restart = 0, count = 0;
> + unsigned long *offs, flags, gen;
> +
> + offs = event->hw.addr_filters_offs;
> +
> + /*
> + * We may observe TASK_TOMBSTONE, which means that the event tear-down
> + * will stop on the parent's child_mutex that our caller is also holding
> + */
> + if (task == TASK_TOMBSTONE)
> + return;
> +
> + mm = get_task_mm(event->ctx->task);
> + if (!mm)
> + return;

kernel threads may not have a filter?

> +
> + /* establish the initial hw configuration for this set of filters */
> + perf_event_addr_filters_setup(event, NULL, 0);
> +
> + down_read(&mm->mmap_sem);
> +
> + raw_spin_lock_irqsave(&ifh->lock, flags);
> + list_for_each_entry(filter, &ifh->list, entry) {
> + offs[count] = 0;
> +
> + if (perf_addr_filter_needs_mmap(filter)) {
> + offs[count] = perf_addr_filter_apply(filter, mm);
> +
> + if (offs[count])
> + restart++;
> + }
> +
> + count++;
> + }
> +
> + gen = ++event->addr_filters_gen;
> + raw_spin_unlock_irqrestore(&ifh->lock, flags);
> +
> + up_read(&mm->mmap_sem);
> +
> + if (restart)
> + perf_event_addr_filters_setup(event, offs, gen);
> +
> + mmput(mm);
> +}

> +/*
> + * Address range filtering: limiting the data to certain
> + * instruction address ranges. Filters are ioctl()ed to us from
> + * userspace as ascii strings.
> + *
> + * Filter string format:
> + *
> + * ACTION SOURCE:RANGE_SPEC
> + * where ACTION is one of the
> + * * "filter": limit the trace to this region
> + * * "start": start tracing from this address
> + * * "stop": stop tracing at this address/region;
> + * SOURCE is either "file" or "kernel"
> + * RANGE_SPEC is
> + * * for "kernel": <start address>[/<size>]
> + * * for "file": <start address>[/<size>]@</path/to/object/file>
> + *
> + * if <size> is not specified, the range is treated as a single address.
> + */

Scary bit of string manipulation there.. have you tried fuzzing it? ;-)