[patch] trace: Add user-space event tracing/injection

From: Ingo Molnar
Date: Wed Nov 17 2010 - 07:08:11 EST



* Pekka Enberg <penberg@xxxxxxxxxx> wrote:

> (Trimming CC for sanity.)

[ Added back Linus (in case he wants to object to the new prctl()) and lkml. ]

> On 11/16/10 11:04 PM, Thomas Gleixner wrote:
> >'trace' is our shot at improving the situation: it aims at providing a
> >simple to use and straightforward tracing tool based on the perf
> >infrastructure and on the well-known perf profiling workflow:
> >
> > trace record firefox # trace firefox and all child tasks,
> > put it into trace.data
> >
> > trace summary # show quick overview of the trace,
> > based on trace.data
> >
> > trace report # detailed traces, based on trace.data
>
> Nice work guys!
>
> Does this concept lend itself to tracing latencies in userspace applications that
> run in virtual machines (e.g. the Java kind)? I'm of course interested in this
> because of Jato [1] where bunch of interesting things can cause jitter: JIT
> compilation, GC, kernel, and the actual application doing something (in either
> native code or JIT'd code). It's important to be able to measure where "slowness"
> to desktop applications and certain class of server applications comes from to be
> able to improve things.

Makes quite a bit of sense.

How about the attached patch? It works fine with the simple testcase included in the
changelog. There's a common-sense limit on the message size - but otherwise it adds
support for apps to generate a free-form string trace event.

Thanks,

Ingo

---------------------------------->
Subject: trace: Add user-space event tracing/injection
From: Ingo Molnar <mingo@xxxxxxx>
Date: Wed Nov 17 10:11:53 CET 2010

This feature (suggested by Darren Hart and Pekka Engberg) allows user-space
programs to print trace events in a very simple and self-contained way:

#include <sys/prctl.h>
#include <string.h>

#define PR_TASK_PERF_USER_TRACE 35

int main(void)
{
char *msg = "Hello World!\n";

prctl(PR_TASK_PERF_USER_TRACE, msg, strlen(msg));

return 0;
}

These show up in 'trace' output as:

$ trace report
#
# trace events of 'sleep 1':
#
testit/ 6006 ( 0.002 ms): <"Hello World!">
testit/ 6006 ( 0.002 ms): <"Hello World!">

Suggested-by: Darren Hart <dvhart@xxxxxxxxxxxxxxx>
Suggested-by: Pekka Enberg <penberg@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
---
include/linux/prctl.h | 2 ++
include/trace/events/user.h | 32 ++++++++++++++++++++++++++++++++
kernel/sys.c | 23 +++++++++++++++++++++++
kernel/trace/trace_events.c | 34 +++++++++++-----------------------
tools/perf/builtin-trace.c | 41 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 109 insertions(+), 23 deletions(-)

Index: linux/include/linux/prctl.h
===================================================================
--- linux.orig/include/linux/prctl.h
+++ linux/include/linux/prctl.h
@@ -102,4 +102,6 @@

#define PR_MCE_KILL_GET 34

+#define PR_TASK_PERF_USER_TRACE 35
+
#endif /* _LINUX_PRCTL_H */
Index: linux/include/trace/events/user.h
===================================================================
--- /dev/null
+++ linux/include/trace/events/user.h
@@ -0,0 +1,32 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM user
+
+#if !defined(_TRACE_USER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_USER_H_
+
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+TRACE_EVENT(user,
+
+ TP_PROTO(const char *message),
+
+ TP_ARGS(message),
+
+ TP_STRUCT__entry(
+ __string( message, message);
+ ),
+
+ TP_fast_assign(
+ __assign_str(message, message);
+ ),
+
+ TP_printk("user %s", __get_str(message))
+);
+
+#undef NO_DEV
+
+#endif /* _TRACE_USER_H_ */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
Index: linux/kernel/sys.c
===================================================================
--- linux.orig/kernel/sys.c
+++ linux/kernel/sys.c
@@ -47,6 +47,11 @@
#include <asm/io.h>
#include <asm/unistd.h>

+#define MAX_USER_TRACE_SIZE 128
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/user.h>
+
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a,b) (-EINVAL)
#endif
@@ -1681,6 +1686,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
case PR_TASK_PERF_EVENTS_ENABLE:
error = perf_event_task_enable();
break;
+ /*
+ * Inject a trace event into the current tracing context:
+ */
+ case PR_TASK_PERF_USER_TRACE:
+ {
+ void __user *uevent_ptr = (void *)arg2;
+ char kstring[MAX_USER_TRACE_SIZE+1];
+ unsigned long uevent_len = arg3;
+
+ if (uevent_len > MAX_USER_TRACE_SIZE)
+ return -EINVAL;
+ if (copy_from_user(kstring, uevent_ptr, uevent_len))
+ return -EFAULT;
+ kstring[uevent_len] = 0;
+
+ trace_user(kstring);
+ return 0;
+ }
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
break;
Index: linux/kernel/trace/trace_events.c
===================================================================
--- linux.orig/kernel/trace/trace_events.c
+++ linux/kernel/trace/trace_events.c
@@ -989,8 +989,7 @@ static struct dentry *event_trace_events

d_events = debugfs_create_dir("events", d_tracer);
if (!d_events)
- pr_warning("Could not create debugfs "
- "'events' directory\n");
+ pr_warning("Could not create debugfs 'events' directory\n");

return d_events;
}
@@ -1014,15 +1013,13 @@ event_subsystem_dir(const char *name, st
/* need to create new entry */
system = kmalloc(sizeof(*system), GFP_KERNEL);
if (!system) {
- pr_warning("No memory to create event subsystem %s\n",
- name);
+ pr_warning("No memory to create event subsystem %s\n", name);
return d_events;
}

system->entry = debugfs_create_dir(name, d_events);
if (!system->entry) {
- pr_warning("Could not create event subsystem %s\n",
- name);
+ pr_warning("Could not create event subsystem %s\n", name);
kfree(system);
return d_events;
}
@@ -1041,8 +1038,7 @@ event_subsystem_dir(const char *name, st

system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
if (!system->filter) {
- pr_warning("Could not allocate filter for subsystem "
- "'%s'\n", name);
+ pr_warning("Could not allocate filter for subsystem '%s'\n", name);
return system->entry;
}

@@ -1051,8 +1047,7 @@ event_subsystem_dir(const char *name, st
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs "
- "'%s/filter' entry\n", name);
+ pr_warning("Could not create debugfs '%s/filter' entry\n", name);
}

trace_create_file("enable", 0644, system->entry,
@@ -1081,19 +1076,16 @@ event_create_dir(struct ftrace_event_cal

call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
- pr_warning("Could not create debugfs "
- "'%s' directory\n", call->name);
+ pr_warning("Could not create debugfs '%s' directory\n", call->name);
return -1;
}

if (call->class->reg)
- trace_create_file("enable", 0644, call->dir, call,
- enable);
+ trace_create_file("enable", 0644, call->dir, call, enable);

#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, call->dir, call,
- id);
+ trace_create_file("id", 0444, call->dir, call, id);
#endif

/*
@@ -1104,16 +1096,12 @@ event_create_dir(struct ftrace_event_cal
if (list_empty(head)) {
ret = call->class->define_fields(call);
if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
+ pr_warning("Could not initialize trace point events/%s\n", call->name);
return ret;
}
}
- trace_create_file("filter", 0644, call->dir, call,
- filter);
-
- trace_create_file("format", 0444, call->dir, call,
- format);
+ trace_create_file("filter", 0644, call->dir, call, filter);
+ trace_create_file("format", 0444, call->dir, call, format);

return 0;
}
Index: linux/tools/perf/builtin-trace.c
===================================================================
--- linux.orig/tools/perf/builtin-trace.c
+++ linux/tools/perf/builtin-trace.c
@@ -719,6 +719,44 @@ static void vfs_getname(void *data,
}
}

+static void user_event(void *data,
+ struct event *event __used,
+ int cpu __used,
+ u64 timestamp __used,
+ struct thread *thread __used)
+{
+ struct thread_data *tdata = get_thread_data(thread);
+ unsigned int i;
+ char *message;
+ u64 t = 0;
+
+ if (!tdata->enabled)
+ return;
+ if (time_filtered(timestamp))
+ return;
+ if (cpu_filtered(cpu))
+ return;
+ if (tdata->entry_time) {
+ t = timestamp - tdata->entry_time;
+ if (filter_duration(t))
+ return;
+ } else if (duration_filter)
+ return;
+
+ print_entry_head(thread, timestamp, t, cpu);
+ message = strdup(raw_field_ptr(event, "message", data));
+
+ /* Sanitize the string a bit before printing it: */
+ for (i = 0; i < strlen(message); i++) {
+ if (message[i] == '\n')
+ message[i] = ' ';
+ }
+
+ color_fprintf(stdout, PERF_COLOR_GREEN, "<\"%s\">\n", message);
+
+ free(message);
+}
+
static int pagefault_preprocess_sample(const event_t *self,
struct addr_location *al,
struct sample_data *data,
@@ -1036,6 +1074,8 @@ process_raw_event(event_t *self, void *d
pagefault_exit(data, event, cpu, timestamp, thread);
if (!strcmp(event->name, "vfs_getname"))
vfs_getname(data, event, cpu, timestamp, thread);
+ if (!strcmp(event->name, "user"))
+ user_event(data, event, cpu, timestamp, thread);

if (!scheduler_events && !scheduler_all_events)
return;
@@ -1226,6 +1266,7 @@ static const char *record_args[] = {
"-e", "sched:sched_stat_sleep:r",
"-e", "sched:sched_stat_iowait:r",
"-e", "sched:sched_stat_runtime:r",
+ "-e", "user:user:r",
};

static int __cmd_record(int argc, const char **argv)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/