[PATCH v0 3/5] perf: Introduce instruction trace filtering

From: Alexander Shishkin
Date: Fri Dec 11 2015 - 08:39:31 EST


Many instruction trace PMUs out there support address range-based
filtering, that is, generating trace data only for given ranges of
instruction addresses. This is useful for tracing individual
functions, modules or libraries.

This patch introduces the interface for userspace to specify these
filters and for the PMU drivers to apply them to the hardware
configuration.

The user interface is an ASCII string that is passed via an ioctl
and specifies (in a way similar to uprobes) address ranges within
certain object files or within the kernel. There is no special
treatment for kernel modules yet, but it might be a worthy pursuit.
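
For example, to limit the trace to a 512 byte range starting at offset
1024 within an object file (the path below is made up for illustration),
userspace would do something like:

	ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
	      "filter file:1024/512@/usr/lib/libexample.so");

Multiple filters can be given in one string, separated by spaces,
commas or newlines.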

The PMU driver interface adds an extra callback to the pmu structure
that validates the filter configuration proposed by the user against
what the hardware is actually capable of, and translates it into
something that pmu::start can program into the hardware.
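
A driver-side implementation might look roughly like the sketch below
(the my_pmu_* names and MY_PMU_NR_ADDR_RANGES are made up here; a real
driver would match this to its address range comparators):

	static int my_pmu_itrace_filter_setup(struct perf_event *event)
	{
		/* driver-private state kept in event::hw::itrace_filters */
		struct my_pmu_filters *hwf = event->hw.itrace_filters;
		struct perf_itrace_filter *filter;
		int range = 0;

		list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
			/* this (made-up) hardware only does address ranges */
			if (!filter->range || range >= MY_PMU_NR_ADDR_RANGES)
				return -EOPNOTSUPP;

			/* object file not mmapped yet: nothing to program */
			if (!filter->kernel && !filter->end)
				continue;

			hwf->range[range].start = filter->start;
			hwf->range[range].stop = filter->end;
			hwf->range[range].is_filter = filter->filter;
			range++;
		}

		hwf->nr_ranges = range;	/* pmu::start programs these */

		return 0;
	}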

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 40 ++++
kernel/events/core.c | 576 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 613 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a48f1..4ddbedc100 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -127,6 +127,11 @@ struct hw_perf_event {
};
struct { /* itrace */
int itrace_started;
+ /*
+ * PMU would store hardware filter configuration
+ * here.
+ */
+ void *itrace_filters;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */
@@ -388,12 +393,38 @@ struct pmu {
void (*free_aux) (void *aux); /* optional */

/*
+ * Validate instruction tracing filters: make sure hw supports the
+ * requested configuration and number of filters.
+ *
+ * Configure instruction tracing filters: translate hw-agnostic filters
+ * into a hardware configuration in event::hw::itrace_filters.
+ */
+ int (*itrace_filter_setup) (struct perf_event *event); /* optional */
+
+ /*
* Filter events for PMU-specific reasons.
*/
int (*filter_match) (struct perf_event *event); /* optional */
};

/**
+ * Instruction trace (ITRACE) filter
+ */
+struct perf_itrace_filter {
+ struct list_head entry;
+ struct rcu_head rcu_head;
+ struct inode *inode;
+ struct task_struct *task;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long start;
+ unsigned long end;
+ unsigned int range : 1, /* 1: range, 0: addr */
+ filter : 1, /* 1: filter/start, 0: stop */
+ kernel : 1; /* 1: kernel, 0: object file */
+};
+
+/**
* enum perf_event_active_state - the states of a event
*/
enum perf_event_active_state {
@@ -559,6 +590,10 @@ struct perf_event {

atomic_t event_limit;

+ /* instruction trace filters */
+ struct list_head itrace_filters;
+ struct mutex itrace_filters_mutex;
+
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;

@@ -1032,6 +1067,11 @@ static inline bool has_aux(struct perf_event *event)
return event->pmu->setup_aux;
}

+static inline bool has_itrace_filter(struct perf_event *event)
+{
+ return event->pmu->itrace_filter_setup;
+}
+
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2bab4af901..28ce173a28 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -44,6 +44,8 @@
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>

#include "internal.h"

@@ -2335,6 +2337,59 @@ static int __perf_event_stop(void *info)
return 0;
}

+static int __perf_event_itrace_filters_setup(void *info)
+{
+ struct perf_event *event = info;
+ int ret;
+
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return -EAGAIN;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another cpu's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ rcu_read_lock();
+ ret = event->pmu->itrace_filter_setup(event);
+ rcu_read_unlock();
+ event->pmu->start(event, PERF_EF_RELOAD);
+
+ return ret;
+}
+
+static int perf_event_itrace_filters_setup(struct perf_event *event)
+{
+ int ret;
+
+ /*
+ * We can't use event_function_call() here, because that would
+ * require ctx::mutex, but one of our callers is called with
+ * mm::mmap_sem down, which would cause an inversion, see bullet
+ * (2) in put_event().
+ */
+ do {
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) {
+ ret = event->pmu->itrace_filter_setup(event);
+ break;
+ }
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ ret = cpu_function_call(READ_ONCE(event->oncpu),
+ __perf_event_itrace_filters_setup, event);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -3663,6 +3718,8 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}

+static void perf_itrace_filters_clear(struct perf_event *event);
+
static void __free_event(struct perf_event *event)
{
if (!event->parent) {
@@ -3671,6 +3728,7 @@ static void __free_event(struct perf_event *event)
}

perf_event_free_bpf_prog(event);
+ perf_itrace_filters_clear(event);

if (event->destroy)
event->destroy(event);
@@ -5907,6 +5965,37 @@ out:
comm_event->event_id.header.size = size;
}

+/*
+ * Clear all dynamic object-based filters at exec; they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+static void perf_itrace_exec(struct perf_event *event, void *data)
+{
+ struct perf_itrace_filter *filter;
+ unsigned int restart = 0;
+
+ if (!has_itrace_filter(event))
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
+ if (filter->kernel)
+ continue;
+
+ filter->start = filter->end = 0;
+ restart++;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * kernel filters, however, will still be valid
+ */
+ if (restart)
+ perf_event_itrace_filters_setup(event);
+}
+
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
char comm[TASK_COMM_LEN];
@@ -5921,6 +6010,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)

comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;

+ if (comm_event->event_id.header.misc == PERF_RECORD_MISC_COMM_EXEC)
+ perf_event_aux(perf_itrace_exec, comm_event, NULL, true);
perf_event_aux(perf_event_comm_output,
comm_event,
NULL, false);
@@ -6159,6 +6250,77 @@ got_name:
kfree(buf);
}

+/*
+ * Whether this @filter depends on a dynamic object which is not loaded
+ * yet or whose load addresses are not yet known.
+ */
+static bool perf_itrace_filter_needs_mmap(struct perf_itrace_filter *filter)
+{
+ return filter->filter && !filter->kernel && !filter->end;
+}
+
+/*
+ * Check whether inode and address range match filter criteria.
+ */
+static bool perf_itrace_filter_match(struct perf_itrace_filter *filter,
+ struct file *file, unsigned long offset,
+ unsigned long size)
+{
+ if (filter->inode != file->f_inode)
+ return false;
+
+ if (filter->offset > offset + size)
+ return false;
+
+ if (filter->offset + filter->size < offset)
+ return false;
+
+ return true;
+}
+
+/*
+ * Update event's itrace filters
+ */
+static void perf_itrace_filters_update(struct perf_event *event, void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ unsigned long off = mmap_event->vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = mmap_event->vma->vm_file;
+ struct perf_itrace_filter *filter;
+ unsigned int restart = 0;
+
+ if (!has_itrace_filter(event))
+ return;
+
+ if (!file)
+ return;
+
+ /* we do not modify the list or sleep, no need for the mutex */
+ rcu_read_lock();
+ list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
+ if (filter->kernel)
+ continue;
+
+ if (filter->task->mm != mmap_event->vma->vm_mm)
+ continue;
+
+ if (!perf_itrace_filter_match(filter, file, off,
+ mmap_event->event_id.len))
+ continue;
+
+ restart++;
+ filter->start = mmap_event->event_id.start + filter->offset;
+ filter->end = mmap_event->event_id.start + filter->offset +
+ filter->size;
+ }
+ rcu_read_unlock();
+
+ /* reprogram updated filters into hardware */
+ if (restart)
+ perf_event_itrace_filters_setup(event);
+}
+
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -6190,6 +6352,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .flags (attr_mmap2 only) */
};

+ perf_event_aux(perf_itrace_filters_update, &mmap_event, NULL, true);
perf_event_mmap_event(&mmap_event);
}

@@ -7163,13 +7326,405 @@ void perf_bp_event(struct perf_event *bp, void *data)
}
#endif

+/*
+ * Insert an itrace filter into @event's list of filters;
+ * @src is used as a template.
+ */
+static int perf_itrace_filter_insert(struct perf_event *event,
+ struct perf_itrace_filter *src,
+ struct task_struct *task)
+{
+ int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+ struct perf_itrace_filter *filter;
+
+ filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+ if (!filter)
+ return -ENOMEM;
+
+ filter->inode = src->inode;
+ filter->offset = src->offset;
+ filter->size = src->size;
+ filter->range = src->range;
+ filter->filter = src->filter;
+ filter->kernel = src->kernel;
+ /*
+ * We copy the actual address range too: it will remain the same for
+ * kernel addresses and user addresses after fork(); in case of exec
+ * or mmap, it will get cleared or modified by
+ * perf_itrace_filters_clear()/perf_itrace_filters_update().
+ */
+ filter->start = src->start;
+ filter->end = src->end;
+
+ /*
+ * We're already holding a reference to this task_struct from
+ * alloc_perf_context() until the last put_ctx() in __free_event().
+ */
+ filter->task = task;
+
+ /*
+ * If we're called through perf_itrace_filters_clone(), we're already
+ * holding parent's filter mutex.
+ */
+ mutex_lock_nested(&event->itrace_filters_mutex, SINGLE_DEPTH_NESTING);
+ list_add_tail_rcu(&filter->entry, &event->itrace_filters);
+ mutex_unlock(&event->itrace_filters_mutex);
+
+ return 0;
+}
+
+static void perf_itrace_filter_free_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_itrace_filter *filter =
+ container_of(rcu_head, struct perf_itrace_filter, rcu_head);
+
+ if (filter->inode)
+ iput(filter->inode);
+ kfree(filter);
+}
+
+/*
+ * We can do this via task_function_call(), as well as when setting
+ * filters, and maybe even when updating them.
+ */
+static void perf_itrace_filters_clear(struct perf_event *event)
+{
+ struct perf_itrace_filter *filter, *iter;
+
+ if (!has_itrace_filter(event))
+ return;
+
+ mutex_lock(&event->itrace_filters_mutex);
+ list_for_each_entry_safe(filter, iter, &event->itrace_filters, entry) {
+ list_del_rcu(&filter->entry);
+ call_rcu(&filter->rcu_head, perf_itrace_filter_free_rcu);
+ }
+
+ perf_event_itrace_filters_setup(event);
+ mutex_unlock(&event->itrace_filters_mutex);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, adjust filter's address range.
+ * Called with mm::mmap_sem down for reading.
+ */
+static int perf_itrace_filter_apply(struct perf_event *event,
+ struct perf_itrace_filter *filter,
+ struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct file *file = vma->vm_file;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+
+ if (!file)
+ continue;
+
+ if (!perf_itrace_filter_match(filter, file, off,
+ vma_size))
+ continue;
+
+ filter->start = vma->vm_start + filter->offset;
+ filter->end = vma->vm_start + filter->offset +
+ filter->size;
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Adjust event's itrace filters' address ranges based on the
+ * task's existing mappings
+ */
+static void perf_itrace_filters_apply(struct perf_event *event)
+{
+ struct perf_itrace_filter *filter;
+ struct mm_struct *mm = NULL;
+ unsigned int restart = 0;
+
+ lockdep_assert_held(&event->ctx->mutex);
+
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ return;
+
+ down_read(&mm->mmap_sem);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
+ if (!perf_itrace_filter_needs_mmap(filter))
+ continue;
+
+ restart += perf_itrace_filter_apply(event, filter, mm);
+ }
+ rcu_read_unlock();
+
+ up_read(&mm->mmap_sem);
+
+ if (restart)
+ perf_event_itrace_filters_setup(event);
+
+ mmput(mm);
+}
+
+/*
+ * Instruction trace filtering: limiting the trace to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ASCII strings.
+ *
+ * Filter string format:
+ *
+ * ACTION SOURCE:RANGE_SPEC
+ * where ACTION is one of:
+ * * "filter": limit the trace to this region
+ * * "start": start tracing from this address
+ * * "stop": stop tracing at this address/region;
+ * SOURCE is either "file" or "kernel"
+ * RANGE_SPEC is
+ * * for "kernel": <start address>[/<size>]
+ * * for "file": <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
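+ *
+ * Examples (path and numbers are illustrative):
+ *
+ * filter file:1024/512@/usr/lib/libexample.so
+ * stop kernel:65536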
+ */
+enum {
+ IF_ACT_NONE = -1,
+ IF_ACT_FILTER,
+ IF_ACT_START,
+ IF_ACT_STOP,
+ IF_SRC_FILE,
+ IF_SRC_KERNEL,
+ IF_SRC_FILEADDR,
+ IF_SRC_KERNELADDR,
+};
+
+enum {
+ IF_STATE_ACTION = 0,
+ IF_STATE_SOURCE,
+ IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+ { IF_ACT_FILTER, "filter" },
+ { IF_ACT_START, "start" },
+ { IF_ACT_STOP, "stop" },
+ { IF_SRC_FILE, "file:%u/%u@%s" },
+ { IF_SRC_KERNEL, "kernel:%u/%u" },
+ { IF_SRC_FILEADDR, "file:%u@%s" },
+ { IF_SRC_KERNELADDR, "kernel:%u" },
+ { IF_ACT_NONE, NULL },
+};
+
+/*
+ * Itrace filter string parser
+ */
+static int
+perf_event_parse_itrace_filter(struct perf_event *event, char *fstr)
+{
+ struct perf_itrace_filter filter;
+ char *start, *orig, *filename = NULL;
+ struct path path;
+ substring_t args[MAX_OPT_ARGS];
+ int state = IF_STATE_ACTION, token;
+ int ret = -EINVAL;
+
+ orig = fstr = kstrdup(fstr, GFP_KERNEL);
+ if (!fstr)
+ return -ENOMEM;
+
+ while ((start = strsep(&fstr, " ,\n")) != NULL) {
+ ret = -EINVAL;
+
+ if (!*start)
+ continue;
+
+ /* filter definition begins */
+ if (state == IF_STATE_ACTION)
+ memset(&filter, 0, sizeof(filter));
+
+ token = match_token(start, if_tokens, args);
+ switch (token) {
+ case IF_ACT_FILTER:
+ case IF_ACT_START:
+ filter.filter = 1;
+ /* fall through */
+ case IF_ACT_STOP:
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ state = IF_STATE_SOURCE;
+ break;
+
+ case IF_SRC_KERNELADDR:
+ case IF_SRC_KERNEL:
+ filter.kernel = 1;
+ /* fall through */
+ case IF_SRC_FILEADDR:
+ case IF_SRC_FILE:
+ if (state != IF_STATE_SOURCE)
+ goto fail;
+
+ if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+ filter.range = 1;
+
+ *args[0].to = 0;
+ ret = kstrtoul(args[0].from, 0, &filter.offset);
+ if (ret)
+ goto fail;
+
+ if (filter.range) {
+ *args[1].to = 0;
+ ret = kstrtoul(args[1].from, 0, &filter.size);
+ if (ret)
+ goto fail;
+ }
+
+ if (token == IF_SRC_FILE) {
+ filename = match_strdup(&args[2]);
+ if (!filename) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ state = IF_STATE_END;
+ break;
+
+ default:
+ goto fail;
+ }
+
+ /*
+ * Filter definition is fully parsed, validate and install it.
+ * Make sure that it doesn't contradict itself or the event's
+ * attribute.
+ */
+ if (state == IF_STATE_END) {
+ if (filter.kernel && event->attr.exclude_kernel)
+ goto fail;
+
+ if (!filter.kernel) {
+ if (!filename)
+ goto fail;
+
+ /* look up the path and grab its inode */
+ ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto fail_free_name;
+
+ filter.inode = igrab(d_inode(path.dentry));
+ path_put(&path);
+ kfree(filename);
+ filename = NULL;
+ }
+
+ ret = perf_itrace_filter_insert(event, &filter,
+ event->ctx->task);
+ if (ret)
+ goto fail;
+
+ /* ready to consume more filters */
+ state = IF_STATE_ACTION;
+ }
+ }
+
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ kfree(orig);
+
+ return 0;
+
+fail_free_name:
+ kfree(filename);
+fail:
+ perf_itrace_filters_clear(event);
+ kfree(orig);
+
+ return ret;
+}
+
+/*
+ * Filters are cloned in the inherit_event() path to make sure child
+ * tracing is consistent with the parent's.
+ */
+static int
+perf_itrace_filters_clone(struct perf_event *to, struct perf_event *from,
+ struct task_struct *task)
+{
+ struct perf_itrace_filter *filter;
+ int ret = -ENOMEM;
+
+ mutex_lock(&from->itrace_filters_mutex);
+ list_for_each_entry_rcu(filter, &from->itrace_filters, entry) {
+ /* parent's filter must hold a reference to this inode */
+ if (WARN_ON_ONCE(!igrab(filter->inode)))
+ goto fail;
+
+ ret = perf_itrace_filter_insert(to, filter, task);
+ if (ret) {
+ iput(filter->inode);
+ goto fail;
+ }
+ }
+
+ ret = 0;
+fail:
+ mutex_unlock(&from->itrace_filters_mutex);
+
+ if (!ret)
+ ret = perf_event_itrace_filters_setup(to);
+ else
+ perf_itrace_filters_clear(to);
+
+ return ret;
+}
+
+static int
+perf_event_set_itrace_filter(struct perf_event *event, char *filter_str)
+{
+ int ret = 0;
+
+ /*
+ * Since this is called from the perf_ioctl() path, we're already holding
+ * ctx::mutex.
+ */
+ lockdep_assert_held(&event->ctx->mutex);
+
+ /*
+ * For now, we only support filtering in per-task events; doing so
+ * for cpu-wide events requires additional context switching trickery,
+ * since the same object code will be mapped at different virtual
+ * addresses in different processes.
+ */
+ if (!event->ctx->task)
+ return -EOPNOTSUPP;
+
+ /* remove existing filters, if any */
+ perf_itrace_filters_clear(event);
+
+ ret = perf_event_parse_itrace_filter(event, filter_str);
+ if (!ret) {
+ perf_itrace_filters_apply(event);
+
+ ret = perf_event_itrace_filters_setup(event);
+ if (ret)
+ perf_itrace_filters_clear(event);
+ }
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
int ret = -EINVAL;

- if (event->attr.type != PERF_TYPE_TRACEPOINT ||
- !IS_ENABLED(CONFIG_EVENT_TRACING))
+ if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
+ !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+ !has_itrace_filter(event))
return -EINVAL;

filter_str = strndup_user(arg, PAGE_SIZE);
@@ -7180,6 +7735,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
event->attr.type == PERF_TYPE_TRACEPOINT)
ret = ftrace_profile_set_filter(event, event->attr.config,
filter_str);
+ else if (has_itrace_filter(event))
+ ret = perf_event_set_itrace_filter(event, filter_str);

kfree(filter_str);
return ret;
@@ -7921,13 +8478,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->itrace_filters);
INIT_HLIST_NODE(&event->hlist_entry);

-
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);

mutex_init(&event->mmap_mutex);
+ mutex_init(&event->itrace_filters_mutex);

atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
@@ -9063,6 +9621,18 @@ inherit_event(struct perf_event *parent_event,
get_ctx(child_ctx);

/*
+ * Clone itrace filters from the parent, if any
+ */
+ if (has_itrace_filter(child_event)) {
+ if (perf_itrace_filters_clone(child_event, parent_event,
+ child)) {
+ put_ctx(child_ctx);
+ free_event(child_event);
+ return NULL;
+ }
+ }
+
+ /*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
* so we won't race with perf_event_{en, dis}able_family.
--
2.6.2
