On Wed, 30 Jul 2008 17:13:24 -0400 (EDT)
Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:
At OLS there was a lot of interest in being able to have the ftrace buffers
dumped on panic. Usually one would expect to use kexec and examine
the buffers after a new kernel is loaded. But sometimes the resources
do not permit kdump and kexec, so having an option to still see the
sequence of events up to the crash is very advantageous.
This patch adds the option to have the ftrace buffers dumped to the
console in the latency_trace format on a panic. When the option is set,
the default number of entries per CPU buffer is lowered to 16384, since writing
to the serial line (if that is the console) may take an aweful long time
otherwise.
The modern spelling is "awful" ;)
Signed-off-by: Steven Rostedt <srostedt@xxxxxxxxxx>
---
include/linux/ftrace.h | 6 +
kernel/trace/Kconfig | 10 ++
kernel/trace/trace.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 194 insertions(+), 1 deletion(-)
Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c 2008-07-29 10:47:53.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c 2008-07-30 16:46:15.000000000 -0400
@@ -14,6 +14,7 @@
#include <linux/utsrelease.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/notifier.h>
Your email client space-stuffed the patch.
#include <linux/debugfs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
@@ -22,6 +23,7 @@
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
+#include <linux/kdebug.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
@@ -104,7 +106,16 @@ int ftrace_function_enabled;
* for a buffer. Note, the number of entries is always rounded
* to ENTRIES_PER_PAGE.
*/
-static unsigned long trace_nr_entries = 65536UL;
+#ifdef CONFIG_FTRACE_DUMP_ON_OOPS
+/*
+ * If dump on oops is defined, lower the number of entries.
+ * Usually this goes over the serial, and is used for debugging.
I suspect that (netconsole > serial) nowadays.
+ */
+# define TRACE_ENTRIES_DEFAULT 16384UL
+#else
+# define TRACE_ENTRIES_DEFAULT 65536UL
+#endif
+static unsigned long trace_nr_entries = TRACE_ENTRIES_DEFAULT;
/* trace_types holds a link list of available tracers. */
static struct tracer *trace_types __read_mostly;
@@ -2930,6 +2941,166 @@ static __init void tracer_init_debugfs(v
#endif
}
+#ifdef CONFIG_FTRACE_DUMP_ON_OOPS
+static int trace_panic_handler(struct notifier_block *this,
+ unsigned long event, void *unused)
+{
+ ftrace_dump();
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_panic_notifier = {
+ .notifier_call = trace_panic_handler,
+ .next = NULL,
+ .priority = 150 /* priority: INT_MAX >= x >= 0 */
+};
+
+static int trace_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ switch (val) {
+ case DIE_OOPS:
+ ftrace_dump();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_die_notifier = {
+ .notifier_call = trace_die_handler,
+ .priority = 200
+};
+
+static void
+trace_printk_seq(struct trace_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
That's an open-coded min().
Beware that PAGE_SIZE is unsigned long on some architectures and a
different type (unsigned?) on others, so a min() against PAGE_SIZE will
warn on some architectures and not on others. min_t() will fix that.
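Something like this (untested) would do it:

	int len = min_t(int, s->len, PAGE_SIZE - 1);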
+ s->buffer[len] = 0;
+ printk("%s", s->buffer);
If the null-termination of ->buffer[] was actually needed then we have
a big problem here.
I'd suggest a conversion to vsnprintf().
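Or, a length-bounded format would avoid depending on the terminator
at all, e.g. (untested):

	/* prints at most len bytes; no NUL termination required */
	printk("%.*s", len, s->buffer);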
+ trace_seq_reset(s);
+}
+
+
+static struct trace_iterator dump_iter;
+static raw_spinlock_t ftrace_dump_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
argh. Please never do an uncommented raw_spinlock_t. It's just
impossible for your readers to work out ytf it was chosen.
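Something along these lines would help (I'm guessing at the rationale):

	/*
	 * raw_spinlock_t so that taking the lock from the panic/die
	 * notifiers doesn't recurse into lockdep or the tracer itself.
	 */
	static raw_spinlock_t ftrace_dump_lock =
		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;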
+static int dump_running;
+
+void ftrace_dump(void)
+{
+ struct trace_iterator *iter = &dump_iter;
+ struct trace_array_cpu *data;
+ static cpumask_t mask;
+ unsigned long flags;
+ int cpu, ret;
+ int cnt = 0;
+
+ local_irq_save(flags);
+
+ /* only one dump at a time */
+ __raw_spin_lock(&ftrace_dump_lock);
Should we have a __raw_spinlock_irqsave()?
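No such helper exists today, but hypothetically it would just wrap
what this function open-codes:

	#define __raw_spin_lock_irqsave(lock, flags)	\
		do {					\
			local_irq_save(flags);		\
			__raw_spin_lock(lock);		\
		} while (0)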
+ if (dump_running)
+ ret = 1;
+ else {
+ dump_running = 1;
+ ret = 0;
+ }
+ __raw_spin_unlock(&ftrace_dump_lock);
+
+ if (ret)
+ goto out;
+
+ /* No turning back! */
+ ftrace_kill_atomic();
+
+ printk("Dumping ftrace buffer:\n");
didst thou run checkpatch?
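It will complain about the missing facility level, e.g.:

	printk(KERN_EMERG "Dumping ftrace buffer:\n");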
+ memset(iter, 0, sizeof(*iter));
This is zeroing out dump_iter, which is already zeroed. Is it needed?
+ iter->tr = &global_trace;
+ iter->trace = current_trace;
The use of local variable `iter' instead of simply using direct access
to dump_iter declarifies things. Perhaps it generates significantly
better code? If not, I'd suggest doing it the simple way.
dump_iter could be made local to this function.
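i.e. something like:

	void ftrace_dump(void)
	{
		/* static: too large for the stack, and we're dying anyway */
		static struct trace_iterator iter;
		...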
+ /*
+ * We need to stop all tracing on all CPUs to read
+ * the next buffer. This is a bit expensive, but is
+ * not done often. We print everything we can read,
+ * and then release the locks again.
+ */
+
+ cpus_clear(mask);
+
+ for_each_tracing_cpu(cpu) {
+ data = iter->tr->data[cpu];
+
+ if (!head_page(data) || !data->trace_idx)
+ continue;
+
+ atomic_inc(&data->disabled);
+ cpu_set(cpu, mask);
+ }
+
+ for_each_cpu_mask(cpu, mask) {
+ data = iter->tr->data[cpu];
+ __raw_spin_lock(&data->lock);
+
+ if (data->overrun > iter->last_overrun[cpu])
+ iter->overrun[cpu] +=
+ data->overrun - iter->last_overrun[cpu];
+ iter->last_overrun[cpu] = data->overrun;
+ }
+
+ while (!trace_empty(iter)) {
+
+ if (!cnt)
+ printk("---------------------------------\n");
+
+ cnt++;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter->seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+ iter->pos = -1;
+
+ if (find_next_entry_inc(iter) != NULL) {
+ print_trace_line(iter);
+ trace_consume(iter);
+ }
+
+ trace_printk_seq(&iter->seq);
+ }
+
+ if (!cnt)
+ printk(" (ftrace buffer empty)\n");
+ else
+ printk("---------------------------------\n");
+
+ for_each_cpu_mask(cpu, mask) {
+ data = iter->tr->data[cpu];
+ __raw_spin_unlock(&data->lock);
+ }
+
+ for_each_cpu_mask(cpu, mask) {
+ data = iter->tr->data[cpu];
+ atomic_dec(&data->disabled);
+ }
Can we combine those two loops into one?
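Assuming the cross-CPU ordering of unlock versus re-enable doesn't
matter, e.g.:

	for_each_cpu_mask(cpu, mask) {
		data = iter->tr->data[cpu];
		__raw_spin_unlock(&data->lock);
		atomic_dec(&data->disabled);
	}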
+ __raw_spin_lock(&ftrace_dump_lock);
+ dump_running = 0;
+ __raw_spin_unlock(&ftrace_dump_lock);
+
+ out:
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_FTRACE_DUMP_ON_OOPS */
Do we really need a new config option for this? Would it hurt too much
to make it unconditionally available?