[RFC 1/3] Linux Kernel Markers - Support Multiple Probes

From: Mathieu Desnoyers
Date: Tue Nov 13 2007 - 14:07:49 EST


RCU style multiple probes support for the Linux Kernel Markers.
Common case (one probe) is still fast and does not require dynamic allocation
or a supplementary pointer dereference on the fast path.

- Move preempt disable from the marker site to the callback.

Since we now have an internal callback, move the preempt disable/enable to the
callback instead of the marker site.

Since the callback change is done asynchronously (passing from a handler that
supports arguments to a handler that does not setup the arguments is no
arguments are passed), we can safely update it even if it is outside the preempt
disable section.

- Move probe arm to probe connection. Now, a connected probe is automatically
armed.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxx>
---
include/linux/marker.h | 51 +--
include/linux/module.h | 2
kernel/marker.c | 663 +++++++++++++++++++++++++++++-----------
kernel/module.c | 7
samples/markers/probe-example.c | 33 +
5 files changed, 546 insertions(+), 210 deletions(-)

Index: linux-2.6-lttng/include/linux/marker.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/marker.h 2007-11-13 09:48:35.000000000 -0500
+++ linux-2.6-lttng/include/linux/marker.h 2007-11-13 09:48:41.000000000 -0500
@@ -19,16 +19,23 @@ struct marker;

/**
* marker_probe_func - Type of a marker probe function
- * @mdata: pointer of type struct marker
- * @private_data: caller site private data
+ * @probe_data: probe private data
+ * @private_data: call site private data
* @fmt: format string
- * @...: variable argument list
+ * @args: variable argument list pointer. Use a pointer to overcome C's
+ * inability to pass this around as a pointer in a portable manner in
+ * the callee otherwise.
*
* Type of marker probe functions. They receive the mdata and need to parse the
* format string to recover the variable argument list.
*/
-typedef void marker_probe_func(const struct marker *mdata,
- void *private_data, const char *fmt, ...);
+typedef void marker_probe_func(void *probe_data, void *call_data,
+ const char *fmt, va_list *args);
+
+struct marker_probe_closure {
+ marker_probe_func *func; /* Callback */
+ void *private; /* Private probe data */
+};

struct marker {
const char *name; /* Marker name */
@@ -36,8 +43,11 @@ struct marker {
* variable argument list.
*/
char state; /* Marker state. */
- marker_probe_func *call;/* Probe handler function pointer */
- void *private; /* Private probe data */
+ char ptype; /* probe type : 0 : single, 1 : multi */
+ void (*call)(const struct marker *mdata, /* Probe wrapper */
+ void *private_data, const char *fmt, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_closure *multi;
} __attribute__((aligned(8)));

#ifdef CONFIG_MARKERS
@@ -60,24 +70,23 @@ struct marker {
static struct marker __mark_##name \
__attribute__((section("__markers"), aligned(8))) = \
{ __mstrtab_name_##name, __mstrtab_format_##name, \
- 0, __mark_empty_function, NULL }; \
+ 0, 0, marker_probe_cb, \
+ { __mark_empty_function, NULL}, NULL }; \
__mark_check_format(format, ## args); \
if (unlikely(__mark_##name.state)) { \
- preempt_disable(); \
(*__mark_##name.call) \
(&__mark_##name, call_data, \
format, ## args); \
- preempt_enable(); \
} \
} while (0)

extern void marker_update_probe_range(struct marker *begin,
- struct marker *end, struct module *probe_module, int *refcount);
+ struct marker *end);
#else /* !CONFIG_MARKERS */
#define __trace_mark(name, call_data, format, args...) \
__mark_check_format(format, ## args)
static inline void marker_update_probe_range(struct marker *begin,
- struct marker *end, struct module *probe_module, int *refcount)
+ struct marker *end)
{ }
#endif /* CONFIG_MARKERS */

@@ -92,8 +101,6 @@ static inline void marker_update_probe_r
#define trace_mark(name, format, args...) \
__trace_mark(name, NULL, format, ## args)

-#define MARK_MAX_FORMAT_LEN 1024
-
/**
* MARK_NOARGS - Format string for a marker with no argument.
*/
@@ -106,6 +113,11 @@ static inline void __printf(1, 2) __mark

extern marker_probe_func __mark_empty_function;

+extern void marker_probe_cb(const struct marker *mdata, void *private,
+ const char *fmt, ...);
+extern void marker_probe_cb_noarg(const struct marker *mdata,
+ void *private, const char *fmt, ...);
+
/*
* Connect a probe to a marker.
* private data pointer must be a valid allocated memory address, or NULL.
@@ -116,14 +128,15 @@ extern int marker_probe_register(const c
/*
* Returns the private data given to marker_probe_register.
*/
-extern void *marker_probe_unregister(const char *name);
+extern int marker_probe_unregister(const char *name,
+ marker_probe_func *probe, void *private);
/*
* Unregister a marker by providing the registered private data.
*/
-extern void *marker_probe_unregister_private_data(void *private);
+extern int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *private);

-extern int marker_arm(const char *name);
-extern int marker_disarm(const char *name);
-extern void *marker_get_private_data(const char *name);
+extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
+ int num);

#endif
Index: linux-2.6-lttng/kernel/marker.c
===================================================================
--- linux-2.6-lttng.orig/kernel/marker.c 2007-11-13 09:48:36.000000000 -0500
+++ linux-2.6-lttng/kernel/marker.c 2007-11-13 09:48:41.000000000 -0500
@@ -27,35 +27,41 @@
extern struct marker __start___markers[];
extern struct marker __stop___markers[];

+/* Set to 1 to enable marker debug output */
+const int marker_debug;
+
/*
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers, the hash table and deferred_sync.
+ * and module markers and the hash table.
*/
static DEFINE_MUTEX(markers_mutex);

/*
- * Marker deferred synchronization.
- * Upon marker probe_unregister, we delay call to synchronize_sched() to
- * accelerate mass unregistration (only when there is no more reference to a
- * given module do we call synchronize_sched()). However, we need to make sure
- * every critical region has ended before we re-arm a marker that has been
- * unregistered and then registered back with a different probe data.
- */
-static int deferred_sync;
-
-/*
* Marker hash table, containing the active markers.
* Protected by module_mutex.
*/
#define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)

+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ */
struct marker_entry {
struct hlist_node hlist;
char *format;
- marker_probe_func *probe;
- void *private;
+ void (*call)(const struct marker *mdata, /* Probe wrapper */
+ void *private_data, const char *fmt, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_closure *multi;
int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu;
+ void *oldptr;
+ char rcu_pending:1;
+ char ptype:1;
char name[0]; /* Contains name'\0'format'\0' */
};

@@ -63,7 +69,8 @@ static struct hlist_head marker_table[MA

/**
* __mark_empty_function - Empty probe callback
- * @mdata: pointer of type const struct marker
+ * @probe_data: probe private data
+ * @call_call: call site private data
* @fmt: format string
* @...: variable argument list
*
@@ -72,13 +79,258 @@ static struct hlist_head marker_table[MA
* though the function pointer change and the marker enabling are two distinct
* operations that modifies the execution flow of preemptible code.
*/
-void __mark_empty_function(const struct marker *mdata, void *private,
- const char *fmt, ...)
+void __mark_empty_function(void *probe_data, void *call_data,
+ const char *fmt, va_list *args)
{
}
EXPORT_SYMBOL_GPL(__mark_empty_function);

/*
+ * marker_probe_cb Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *private,
+ const char *fmt, ...)
+{
+ va_list args;
+ char ptype;
+
+ preempt_disable();
+ ptype = ACCESS_ONCE(mdata->ptype);
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependant,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = ACCESS_ONCE(mdata->single.func);
+ /* Must read the ptr before private data. They are not data
+ * dependant, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, fmt);
+ func(mdata->single.private, private, fmt, &args);
+ va_end(args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must insure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ multi = ACCESS_ONCE(mdata->multi);
+ for (i = 0; multi[i].func; i++) {
+ va_start(args, fmt);
+ multi[i].func(multi[i].private, private, fmt, &args);
+ va_end(args);
+ }
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+
+void marker_probe_cb_noarg(const struct marker *mdata,
+ void *private, const char *fmt, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ preempt_disable();
+ ptype = ACCESS_ONCE(mdata->ptype);
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependant,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = ACCESS_ONCE(mdata->single.func);
+ /* Must read the ptr before private data. They are not data
+ * dependant, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata->single.private, private, fmt, &args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must insure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ multi = ACCESS_ONCE(mdata->multi);
+ for (i = 0; multi[i].func; i++)
+ multi[i].func(multi[i].private, private, fmt, &args);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_entry *entry = container_of(head,
+ struct marker_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+static inline void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.private);
+ } else {
+ for (i = 0; entry->multi[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi[i].func, entry->multi[i].private);
+ }
+}
+
+static struct marker_probe_closure *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *private)
+{
+ int nr_probes = 0;
+ struct marker_probe_closure *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.private == private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.private = private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe &&
+ old[nr_probes].private == private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new[0] = entry->single;
+ else
+ memcpy(new, old,
+ nr_probes * sizeof(struct marker_probe_closure));
+ new[nr_probes].func = probe;
+ new[nr_probes].private = private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_closure *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_closure *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.private != private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if ((!probe || old[nr_probes].func == probe)
+ && old[nr_probes].private == private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].private != private)
+ entry->single = old[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].private != private)
+ new[j++] = old[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
* Get marker if the marker is present in the marker hash table.
* Must be called with markers_mutex held.
* Returns NULL if not present.
@@ -102,8 +354,7 @@ static struct marker_entry *get_marker(c
* Add the marker to the marker hash table. Must be called with markers_mutex
* held.
*/
-static int add_marker(const char *name, const char *format,
- marker_probe_func *probe, void *private)
+static struct marker_entry *add_marker(const char *name, const char *format)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -118,9 +369,8 @@ static int add_marker(const char *name,
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
- "Marker %s busy, probe %p already installed\n",
- name, e->probe);
- return -EBUSY; /* Already there */
+ "Marker %s busy\n", name);
+ return ERR_PTR(-EBUSY); /* Already there */
}
}
/*
@@ -130,34 +380,42 @@ static int add_marker(const char *name,
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memcpy(&e->name[0], name, name_len);
if (format) {
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
trace_mark(core_marker_format, "name %s format %s",
e->name, e->format);
- } else
+ } else {
e->format = NULL;
- e->probe = probe;
- e->private = private;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
e->refcount = 0;
+ e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
- return 0;
+ return e;
}

/*
* Remove the marker from the marker hash table. Must be called with mutex_lock
* held.
*/
-static void *remove_marker(const char *name)
+static int remove_marker(const char *name)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
int found = 0;
size_t len = strlen(name) + 1;
- void *private = NULL;
u32 hash = jhash(name, len-1, 0);

head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +425,16 @@ static void *remove_marker(const char *n
break;
}
}
- if (found) {
- private = e->private;
- hlist_del(&e->hlist);
- kfree(e);
- }
- return private;
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+ hlist_del(&e->hlist);
+ /* Make sure the call_rcu has been executed */
+ if (e->rcu_pending)
+ rcu_barrier();
+ kfree(e);
+ return 0;
}

/*
@@ -184,6 +446,7 @@ static int marker_set_format(struct mark
size_t name_len = strlen((*entry)->name) + 1;
size_t format_len = strlen(format) + 1;

+
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
@@ -191,11 +454,20 @@ static int marker_set_format(struct mark
memcpy(&e->name[0], (*entry)->name, name_len);
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
- e->probe = (*entry)->probe;
- e->private = (*entry)->private;
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ e->single = (*entry)->single;
+ e->multi = (*entry)->multi;
+ e->ptype = (*entry)->ptype;
e->refcount = (*entry)->refcount;
+ e->rcu_pending = 0;
hlist_add_before(&e->hlist, &(*entry)->hlist);
hlist_del(&(*entry)->hlist);
+ /* Make sure the call_rcu has been executed */
+ if ((*entry)->rcu_pending)
+ rcu_barrier();
kfree(*entry);
*entry = e;
trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +478,8 @@ static int marker_set_format(struct mark
/*
* Sets the probe callback corresponding to one marker.
*/
-static int set_marker(struct marker_entry **entry, struct marker *elem)
+static int set_marker(struct marker_entry **entry, struct marker *elem,
+ int active)
{
int ret;
WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +499,43 @@ static int set_marker(struct marker_entr
if (ret)
return ret;
}
- elem->call = (*entry)->probe;
- elem->private = (*entry)->private;
- elem->state = 1;
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = (*entry)->call;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.private
+ != (*entry)->single.private &&
+ !elem->ptype);
+ elem->single.private = (*entry)->single.private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = (*entry)->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, (*entry)->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = (*entry)->ptype;
+ elem->state = active;
+
return 0;
}

@@ -240,8 +547,12 @@ static int set_marker(struct marker_entr
*/
static void disable_marker(struct marker *elem)
{
+ /* leave "call" as is. It is known statically. */
elem->state = 0;
- elem->call = __mark_empty_function;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
/*
* Leave the private data and id there, because removal is racy and
* should be done only after a synchronize_sched(). These are never used
@@ -253,14 +564,11 @@ static void disable_marker(struct marker
* marker_update_probe_range - Update a probe range
* @begin: beginning of the range
* @end: end of the range
- * @probe_module: module address of the probe being updated
- * @refcount: number of references left to the given probe_module (out)
*
* Updates the probe callback corresponding to a range of markers.
*/
void marker_update_probe_range(struct marker *begin,
- struct marker *end, struct module *probe_module,
- int *refcount)
+ struct marker *end)
{
struct marker *iter;
struct marker_entry *mark_entry;
@@ -268,15 +576,12 @@ void marker_update_probe_range(struct ma
mutex_lock(&markers_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_marker(iter->name);
- if (mark_entry && mark_entry->refcount) {
- set_marker(&mark_entry, iter);
+ if (mark_entry) {
+ set_marker(&mark_entry, iter,
+ !!mark_entry->refcount);
/*
* ignore error, continue
*/
- if (probe_module)
- if (probe_module ==
- __module_text_address((unsigned long)mark_entry->probe))
- (*refcount)++;
} else {
disable_marker(iter);
}
@@ -289,20 +594,27 @@ void marker_update_probe_range(struct ma
* Issues a synchronize_sched() when no reference to the module passed
* as parameter is found in the probes so the probe module can be
* safely unloaded from now on.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Site effect : marker_set_format may delete the marker entry (creating a
+ * replacement).
*/
-static void marker_update_probes(struct module *probe_module)
+static void marker_update_probes(void)
{
- int refcount = 0;
-
/* Core kernel markers */
- marker_update_probe_range(__start___markers,
- __stop___markers, probe_module, &refcount);
+ marker_update_probe_range(__start___markers, __stop___markers);
/* Markers in modules. */
- module_update_markers(probe_module, &refcount);
- if (probe_module && refcount == 0) {
- synchronize_sched();
- deferred_sync = 0;
- }
+ module_update_markers();
}

/**
@@ -314,29 +626,45 @@ static void marker_update_probes(struct
*
* private data must be a valid allocated memory address, or NULL.
* Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
*/
int marker_probe_register(const char *name, const char *format,
marker_probe_func *probe, void *private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;

mutex_lock(&markers_mutex);
entry = get_marker(name);
- if (entry && entry->refcount) {
- ret = -EBUSY;
- goto end;
- }
- if (deferred_sync) {
- synchronize_sched();
- deferred_sync = 0;
+ if (!entry) {
+ entry = add_marker(name, format);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto end;
+ }
}
- ret = add_marker(name, format, probe, private);
- if (ret)
+ /*
+ * If we detect that a call_rcu is pending for this marker,
+ * make sure it's executed now.
+ */
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_add_probe(entry, probe, private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
goto end;
+ }
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
end:
mutex_unlock(&markers_mutex);
return ret;
@@ -346,171 +674,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register)
/**
* marker_probe_unregister - Disconnect a probe from a marker
* @name: marker name
+ * @probe: probe function pointer
+ * @private: probe private data
*
* Returns the private data given to marker_probe_register, or an ERR_PTR().
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
*/
-void *marker_probe_unregister(const char *name)
+int marker_probe_unregister(const char *name,
+ marker_probe_func *probe, void *private)
{
- struct module *probe_module;
struct marker_entry *entry;
- void *private;
+ struct marker_probe_closure *old;
+ int ret = 0;

mutex_lock(&markers_mutex);
entry = get_marker(name);
if (!entry) {
- private = ERR_PTR(-ENOENT);
+ ret = -ENOENT;
goto end;
}
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(name);
- deferred_sync = 1;
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, probe, private);
mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
- return private;
+ return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_unregister);

-/**
- * marker_probe_unregister_private_data - Disconnect a probe from a marker
- * @private: probe private data
- *
- * Unregister a marker by providing the registered private data.
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- */
-void *marker_probe_unregister_private_data(void *private)
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *private)
{
- struct module *probe_module;
- struct hlist_head *head;
- struct hlist_node *node;
struct marker_entry *entry;
- int found = 0;
unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;

- mutex_lock(&markers_mutex);
for (i = 0; i < MARKER_TABLE_SIZE; i++) {
head = &marker_table[i];
hlist_for_each_entry(entry, node, head, hlist) {
- if (entry->private == private) {
- found = 1;
- goto iter_end;
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.private
+ == private)
+ return entry;
+ } else {
+ struct marker_probe_closure *closure;
+ closure = entry->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func == probe &&
+ closure[i].private
+ == private)
+ return entry;
+ }
}
}
}
-iter_end:
- if (!found) {
- private = ERR_PTR(-ENOENT);
- goto end;
- }
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(entry->name);
- deferred_sync = 1;
- mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
-end:
- mutex_unlock(&markers_mutex);
- return private;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);

/**
- * marker_arm - Arm a marker
- * @name: marker name
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @private: probe private data
*
- * Activate a marker. It keeps a reference count of the number of
- * arming/disarming done.
- * Returns 0 if ok, error value on error.
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
*/
-int marker_arm(const char *name)
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;

mutex_lock(&markers_mutex);
- entry = get_marker(name);
+ entry = get_marker_from_private_data(probe, private);
if (!entry) {
ret = -ENOENT;
goto end;
}
- /*
- * Only need to update probes when refcount passes from 0 to 1.
- */
- if (entry->refcount++)
- goto end;
-end:
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, NULL, private);
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_arm);
-
-/**
- * marker_disarm - Disarm a marker
- * @name: marker name
- *
- * Disarm a marker. It keeps a reference count of the number of arming/disarming
- * done.
- * Returns 0 if ok, error value on error.
- */
-int marker_disarm(const char *name)
-{
- struct marker_entry *entry;
- int ret = 0;
-
+ marker_update_probes(); /* may update entry */
mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry) {
- ret = -ENOENT;
- goto end;
- }
- /*
- * Only permit decrement refcount if higher than 0.
- * Do probe update only on 1 -> 0 transition.
- */
- if (entry->refcount) {
- if (--entry->refcount)
- goto end;
- } else {
- ret = -EPERM;
- goto end;
- }
+ entry = get_marker_from_private_data(probe, private);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(entry->name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
return ret;
}
-EXPORT_SYMBOL_GPL(marker_disarm);
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);

/**
* marker_get_private_data - Get a marker's probe private data
* @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
*
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR.
* Returns the private data pointer, or an ERR_PTR.
* The private data pointer should _only_ be dereferenced if the caller is the
* owner of the data, or its content could vanish. This is mostly used to
* confirm that a caller is the owner of a registered probe.
*/
-void *marker_get_private_data(const char *name)
+void *marker_get_private_data(const char *name, marker_probe_func *probe,
+ int num)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
size_t name_len = strlen(name) + 1;
u32 hash = jhash(name, name_len-1, 0);
- int found = 0;
+ int i;

head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
- found = 1;
- return e->private;
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.private;
+ else
+ break;
+ } else {
+ struct marker_probe_closure *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure[i].private;
+ }
+ }
}
}
return ERR_PTR(-ENOENT);
Index: linux-2.6-lttng/kernel/module.c
===================================================================
--- linux-2.6-lttng.orig/kernel/module.c 2007-11-13 09:48:36.000000000 -0500
+++ linux-2.6-lttng/kernel/module.c 2007-11-13 09:48:41.000000000 -0500
@@ -2000,7 +2000,7 @@ static struct module *load_module(void _
if (!mod->taints) {
#ifdef CONFIG_MARKERS
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers, NULL, NULL);
+ mod->markers + mod->num_markers);
#endif
#ifdef CONFIG_IMMEDIATE
immediate_update_range(mod->immediate,
@@ -2600,7 +2600,7 @@ EXPORT_SYMBOL(struct_module);
#endif

#ifdef CONFIG_MARKERS
-void module_update_markers(struct module *probe_module, int *refcount)
+void module_update_markers(void)
{
struct module *mod;

@@ -2608,8 +2608,7 @@ void module_update_markers(struct module
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers,
- probe_module, refcount);
+ mod->markers + mod->num_markers);
mutex_unlock(&module_mutex);
}
#endif
Index: linux-2.6-lttng/include/linux/module.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/module.h 2007-11-13 09:48:36.000000000 -0500
+++ linux-2.6-lttng/include/linux/module.h 2007-11-13 09:48:41.000000000 -0500
@@ -467,7 +467,7 @@ int unregister_module_notifier(struct no

extern void print_modules(void);

-extern void module_update_markers(struct module *probe_module, int *refcount);
+extern void module_update_markers(void);

extern void _module_immediate_update(void);
extern void module_immediate_update(void);
Index: linux-2.6-lttng/samples/markers/probe-example.c
===================================================================
--- linux-2.6-lttng.orig/samples/markers/probe-example.c 2007-11-13 09:48:36.000000000 -0500
+++ linux-2.6-lttng/samples/markers/probe-example.c 2007-11-13 09:48:41.000000000 -0500
@@ -20,31 +20,35 @@ struct probe_data {
marker_probe_func *probe_func;
};

-void probe_subsystem_event(const struct marker *mdata, void *private,
- const char *format, ...)
+void __attribute__((__aligned__(2)))
+probe_subsystem_event(void *probe_data, void *call_data,
+ const char *format, va_list *args);
+void
+probe_subsystem_event(void *probe_data, void *call_data,
+ const char *format, va_list *args)
{
- va_list ap;
/* Declare args */
unsigned int value;
const char *mystr;

/* Assign args */
- va_start(ap, format);
- value = va_arg(ap, typeof(value));
- mystr = va_arg(ap, typeof(mystr));
+ value = va_arg(*args, typeof(value));
+ mystr = va_arg(*args, typeof(mystr));

/* Call printk */
- printk(KERN_DEBUG "Value %u, string %s\n", value, mystr);
+ printk(KERN_INFO "Value %u, string %s\n", value, mystr);

/* or count, check rights, serialize data in a buffer */
-
- va_end(ap);
}

atomic_t eventb_count = ATOMIC_INIT(0);

-void probe_subsystem_eventb(const struct marker *mdata, void *private,
- const char *format, ...)
+void
+probe_subsystem_eventb(void *probe_data, void *call_data,
+ const char *format, va_list *args) __attribute__((aligned(2)));
+void
+probe_subsystem_eventb(void *probe_data, void *call_data,
+ const char *format, va_list *args)
{
/* Increment counter */
atomic_inc(&eventb_count);
@@ -72,10 +76,6 @@ static int __init probe_init(void)
if (result)
printk(KERN_INFO "Unable to register probe %s\n",
probe_array[i].name);
- result = marker_arm(probe_array[i].name);
- if (result)
- printk(KERN_INFO "Unable to arm probe %s\n",
- probe_array[i].name);
}
return 0;
}
@@ -85,7 +85,8 @@ static void __exit probe_fini(void)
int i;

for (i = 0; i < ARRAY_SIZE(probe_array); i++)
- marker_probe_unregister(probe_array[i].name);
+ marker_probe_unregister(probe_array[i].name,
+ probe_array[i].probe_func, &probe_array[i]);
printk(KERN_INFO "Number of event b : %u\n",
atomic_read(&eventb_count));
}

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/