Re: [BUG] long freezes on thinkpad t60

From: Miklos Szeredi
Date: Thu May 24 2007 - 10:05:22 EST


> > On some strange workload involving strace and fuse I get occasional
> > long periods (10-100s) of total unresponsiveness, not even SysRq-*
> > working. Then the machine continues as normal. Nothing in dmesg,
> > absolutely no indication about what is happening.
>
> > Any ideas? Possibly something ACPI related?
>
> how reproducible are these lockups - could you possibly trace it? If yes
> then please apply:
>
> http://www.tglx.de/private/tglx/ht-debug/tracer.diff

With this patch applied, boot stops at a segfaulting fsck. I enabled all
the new config options; is that not a good idea? Which one exactly do I need?

Thanks,
Miklos

PS. tracer.diff needed some hacking to make it apply/compile.

Index: linux-2.6.22-rc2/include/linux/linkage.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/linkage.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/linkage.h 2007-05-24 15:57:43.000000000 +0200
@@ -3,6 +3,8 @@

#include <asm/linkage.h>

+#define notrace __attribute ((no_instrument_function))
+
#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
Index: linux-2.6.22-rc2/Documentation/stable_api_nonsense.txt
===================================================================
--- linux-2.6.22-rc2.orig/Documentation/stable_api_nonsense.txt 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/Documentation/stable_api_nonsense.txt 2007-05-24 15:57:42.000000000 +0200
@@ -62,6 +62,9 @@ consider the following facts about the L
- different structures can contain different fields
- Some functions may not be implemented at all, (i.e. some locks
compile away to nothing for non-SMP builds.)
+ - Parameter passing of variables from function to function can be
+ done in different ways (the CONFIG_REGPARM option controls
+ this.)
- Memory within the kernel can be aligned in different ways,
depending on the build options.
- Linux runs on a wide range of different processor architectures.
Index: linux-2.6.22-rc2/arch/i386/Kconfig
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/Kconfig 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/Kconfig 2007-05-24 15:57:42.000000000 +0200
@@ -764,6 +764,14 @@ config BOOT_IOREMAP
depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
default y

+#
+# function tracing might turn this off:
+#
+config REGPARM
+ bool
+ depends on !MCOUNT
+ default y
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
depends on PROC_FS
Index: linux-2.6.22-rc2/arch/i386/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/Makefile 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/Makefile 2007-05-24 15:57:42.000000000 +0200
@@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs
endif
CHECKFLAGS += -D__i386__

-CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
+CFLAGS += -pipe -msoft-float

# prevent gcc from keeping the stack 16 byte aligned
CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -39,6 +39,8 @@ CFLAGS += $(call cc-option,-mpreferred-s
# CPU-specific tuning. Anything which can be shared with UML should go here.
include $(srctree)/arch/i386/Makefile.cpu

+cflags-$(CONFIG_REGPARM) += -mregparm=3 -freg-struct-return
+
# temporary until string.h is fixed
cflags-y += -ffreestanding

Index: linux-2.6.22-rc2/include/asm-i386/module.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-i386/module.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-i386/module.h 2007-05-24 15:57:43.000000000 +0200
@@ -64,12 +64,18 @@ struct mod_arch_specific
#error unknown processor family
#endif

+#ifdef CONFIG_REGPARM
+#define MODULE_REGPARM "REGPARM "
+#else
+#define MODULE_REGPARM ""
+#endif
+
#ifdef CONFIG_4KSTACKS
#define MODULE_STACKSIZE "4KSTACKS "
#else
#define MODULE_STACKSIZE ""
#endif

-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE

#endif /* _ASM_I386_MODULE_H */
Index: linux-2.6.22-rc2/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/Makefile 2007-05-22 16:25:01.000000000 +0200
+++ linux-2.6.22-rc2/Makefile 2007-05-24 15:57:42.000000000 +0200
@@ -490,10 +490,14 @@ endif

include $(srctree)/arch/$(ARCH)/Makefile

-ifdef CONFIG_FRAME_POINTER
-CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
else
-CFLAGS += -fomit-frame-pointer
+ ifdef CONFIG_FRAME_POINTER
+ CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ else
+ CFLAGS += -fomit-frame-pointer
+ endif
endif

ifdef CONFIG_DEBUG_INFO
Index: linux-2.6.22-rc2/arch/i386/lib/delay.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/lib/delay.c 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/lib/delay.c 2007-05-24 15:57:43.000000000 +0200
@@ -23,7 +23,7 @@
#endif

/* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static notrace void delay_loop(unsigned long loops)
{
int d0;

@@ -38,7 +38,7 @@ static void delay_loop(unsigned long loo
}

/* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static notrace void delay_tsc(unsigned long loops)
{
unsigned long bclock, now;

@@ -69,7 +69,7 @@ int read_current_timer(unsigned long *ti
return -1;
}

-void __delay(unsigned long loops)
+void notrace __delay(unsigned long loops)
{
delay_fn(loops);
}
Index: linux-2.6.22-rc2/arch/x86_64/kernel/tsc.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/tsc.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/tsc.c 2007-05-24 15:57:42.000000000 +0200
@@ -176,13 +176,13 @@ __setup("notsc", notsc_setup);


/* clock source code: */
-static cycle_t read_tsc(void)
+static notrace cycle_t read_tsc(void)
{
cycle_t ret = (cycle_t)get_cycles_sync();
return ret;
}

-static cycle_t __vsyscall_fn vread_tsc(void)
+static notrace cycle_t __vsyscall_fn vread_tsc(void)
{
cycle_t ret = (cycle_t)get_cycles_sync();
return ret;
Index: linux-2.6.22-rc2/drivers/clocksource/acpi_pm.c
===================================================================
--- linux-2.6.22-rc2.orig/drivers/clocksource/acpi_pm.c 2007-05-22 16:25:11.000000000 +0200
+++ linux-2.6.22-rc2/drivers/clocksource/acpi_pm.c 2007-05-24 15:57:42.000000000 +0200
@@ -30,13 +30,13 @@
*/
u32 pmtmr_ioport __read_mostly;

-static inline u32 read_pmtmr(void)
+static notrace inline u32 read_pmtmr(void)
{
/* mask the output to 24 bits */
return inl(pmtmr_ioport) & ACPI_PM_MASK;
}

-u32 acpi_pm_read_verified(void)
+u32 notrace acpi_pm_read_verified(void)
{
u32 v1 = 0, v2 = 0, v3 = 0;

@@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void)
return v2;
}

-static cycle_t acpi_pm_read_slow(void)
+static notrace cycle_t acpi_pm_read_slow(void)
{
return (cycle_t)acpi_pm_read_verified();
}

-static cycle_t acpi_pm_read(void)
+static notrace cycle_t acpi_pm_read(void)
{
return (cycle_t)read_pmtmr();
}
Index: linux-2.6.22-rc2/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.22-rc2.orig/fs/proc/proc_misc.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/fs/proc/proc_misc.c 2007-05-24 15:57:42.000000000 +0200
@@ -623,6 +623,20 @@ static int execdomains_read_proc(char *p
return proc_calc_metrics(page, start, off, count, eof, len);
}

+#ifdef CONFIG_EVENT_TRACE
+extern struct seq_operations latency_trace_op;
+static int latency_trace_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &latency_trace_op);
+}
+static struct file_operations proc_latency_trace_operations = {
+ .open = latency_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
/*
* writing 'C' to /proc/sysrq-trigger is like sysrq-C
@@ -716,6 +730,9 @@ void __init proc_misc_init(void)
#ifdef CONFIG_SCHEDSTATS
create_seq_entry("schedstat", 0, &proc_schedstat_operations);
#endif
+#ifdef CONFIG_EVENT_TRACE
+ create_seq_entry("latency_trace", 0, &proc_latency_trace_operations);
+#endif
#ifdef CONFIG_PROC_KCORE
proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
if (proc_root_kcore) {
Index: linux-2.6.22-rc2/include/linux/clocksource.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/clocksource.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/clocksource.h 2007-05-24 15:57:43.000000000 +0200
@@ -21,6 +21,9 @@
typedef u64 cycle_t;
struct clocksource;

+extern unsigned long preempt_max_latency;
+extern unsigned long preempt_thresh;
+
/**
* struct clocksource - hardware abstraction for a free running counter
* Provides mostly state-free accessors to the underlying hardware.
@@ -172,8 +175,20 @@ static inline cycle_t clocksource_read(s
*/
static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
{
- u64 ret = (u64)cycles;
- ret = (ret * cs->mult) >> cs->shift;
+ return ((u64)cycles * cs->mult) >> cs->shift;
+}
+
+/**
+ * ns2cyc - converts nanoseconds to clocksource cycles
+ * @cs: Pointer to clocksource
+ * @nsecs: Nanoseconds
+ */
+static inline cycles_t ns2cyc(struct clocksource *cs, u64 nsecs)
+{
+ cycles_t ret = nsecs << cs->shift;
+
+ do_div(ret, cs->mult + 1);
+
return ret;
}

@@ -221,4 +236,8 @@ static inline void update_vsyscall(struc
}
#endif

+extern cycle_t get_monotonic_cycles(void);
+extern unsigned long cycles_to_usecs(cycle_t);
+extern cycle_t usecs_to_cycles(unsigned long);
+
#endif /* _LINUX_CLOCKSOURCE_H */
Index: linux-2.6.22-rc2/include/linux/kernel.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/kernel.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/kernel.h 2007-05-24 15:57:43.000000000 +0200
@@ -156,6 +156,8 @@ asmlinkage int vprintk(const char *fmt,
__attribute__ ((format (printf, 1, 0)));
asmlinkage int printk(const char * fmt, ...)
__attribute__ ((format (printf, 1, 2)));
+extern void early_printk(const char *fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
#else
static inline int vprintk(const char *s, va_list args)
__attribute__ ((format (printf, 1, 0)));
Index: linux-2.6.22-rc2/include/linux/latency_hist.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/include/linux/latency_hist.h 2007-05-24 15:57:43.000000000 +0200
@@ -0,0 +1,32 @@
+/*
+ * kernel/latency_hist.h
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ * Copyright (C) 2005 MontaVista Software, Inc.
+ * Yi Yang <yyang@xxxxxxxxxxxxx>
+ *
+ */
+#ifndef _LINUX_LATENCY_HIST_H_
+#define _LINUX_LATENCY_HIST_H_
+
+enum {
+ INTERRUPT_LATENCY = 0,
+ PREEMPT_LATENCY,
+ WAKEUP_LATENCY
+};
+
+#define MAX_ENTRY_NUM 10240
+#define LATENCY_TYPE_NUM 3
+
+#ifdef CONFIG_LATENCY_HIST
+extern void latency_hist(int latency_type, int cpu, unsigned long latency);
+# define latency_hist_flag 1
+#else
+# define latency_hist(a,b,c) do { (void)(cpu); } while (0)
+# define latency_hist_flag 0
+#endif /* CONFIG_LATENCY_HIST */
+
+#endif /* ifndef _LINUX_LATENCY_HIST_H_ */
Index: linux-2.6.22-rc2/include/linux/preempt.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/preempt.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/preempt.h 2007-05-24 15:57:43.000000000 +0200
@@ -9,12 +9,26 @@
#include <linux/thread_info.h>
#include <linux/linkage.h>

-#ifdef CONFIG_DEBUG_PREEMPT
- extern void fastcall add_preempt_count(int val);
- extern void fastcall sub_preempt_count(int val);
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+ extern void notrace add_preempt_count(unsigned int val);
+ extern void notrace sub_preempt_count(unsigned int val);
+ extern void notrace mask_preempt_count(unsigned int mask);
+ extern void notrace unmask_preempt_count(unsigned int mask);
#else
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
+# define mask_preempt_count(mask) \
+ do { preempt_count() |= (mask); } while (0)
+# define unmask_preempt_count(mask) \
+ do { preempt_count() &= ~(mask); } while (0)
+#endif
+
+#ifdef CONFIG_CRITICAL_TIMING
+ extern void touch_critical_timing(void);
+ extern void stop_critical_timing(void);
+#else
+# define touch_critical_timing() do { } while (0)
+# define stop_critical_timing() do { } while (0)
#endif

#define inc_preempt_count() add_preempt_count(1)
Index: linux-2.6.22-rc2/include/linux/sched.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/sched.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/sched.h 2007-05-24 15:57:43.000000000 +0200
@@ -215,6 +215,7 @@ static inline void show_state(void)
}

extern void show_regs(struct pt_regs *);
+extern void irq_show_regs_callback(int cpu, struct pt_regs *regs);

/*
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
@@ -251,6 +252,105 @@ static inline void touch_all_softlockup_
}
#endif

+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE)
+ extern void print_traces(struct task_struct *task);
+#else
+# define print_traces(task) do { } while (0)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# ifndef CONFIG_ARM
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+# else
+ extern unsigned long arm_return_addr(int level);
+# define CALLER_ADDR0 arm_return_addr(0)
+# define CALLER_ADDR1 arm_return_addr(1)
+# define CALLER_ADDR2 arm_return_addr(2)
+# define CALLER_ADDR3 arm_return_addr(3)
+# define CALLER_ADDR4 arm_return_addr(4)
+# define CALLER_ADDR5 arm_return_addr(5)
+#endif
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+#endif
+
+#ifdef CONFIG_MCOUNT
+ extern void notrace mcount(void);
+#else
+# define mcount() do { } while (0)
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+ extern int mcount_enabled, trace_enabled, trace_user_triggered,
+ trace_user_trigger_irq, trace_freerunning, trace_verbose,
+ trace_print_on_crash, trace_all_cpus, print_functions,
+ syscall_tracing, stackframe_tracing, trace_use_raw_cycles,
+ trace_all_runnable;
+ extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3);
+ extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2);
+ extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2);
+ extern void notrace trace_special_sym(void);
+ extern void stop_trace(void);
+# define start_trace() do { trace_enabled = 1; } while (0)
+ extern void print_last_trace(void);
+ extern void nmi_trace(unsigned long eip, unsigned long parent_eip,
+ unsigned long flags);
+ extern long user_trace_start(void);
+ extern long user_trace_stop(void);
+ extern void trace_cmdline(void);
+ extern void init_tracer(void);
+#else
+# define mcount_enabled 0
+# define trace_enabled 0
+# define syscall_tracing 0
+# define stackframe_tracing 0
+# define trace_user_triggered 0
+# define trace_freerunning 0
+# define trace_all_cpus 0
+# define trace_verbose 0
+# define trace_special(v1,v2,v3) do { } while (0)
+# define trace_special_pid(pid,v1,v2) do { } while (0)
+# define trace_special_u64(v1,v2) do { } while (0)
+# define trace_special_sym() do { } while (0)
+# define stop_trace() do { } while (0)
+# define start_trace() do { } while (0)
+# define print_last_trace() do { } while (0)
+# define nmi_trace(eip, parent_eip, flags) do { } while (0)
+# define user_trace_start() do { } while (0)
+# define user_trace_stop() do { } while (0)
+# define trace_cmdline() do { } while (0)
+# define init_tracer() do { } while (0)
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+ extern int wakeup_timing;
+ extern void __trace_start_sched_wakeup(struct task_struct *p);
+ extern void trace_stop_sched_switched(struct task_struct *p);
+ extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu);
+#else
+# define wakeup_timing 0
+# define __trace_start_sched_wakeup(p) do { } while (0)
+# define trace_stop_sched_switched(p) do { } while (0)
+# define trace_change_sched_cpu(p, cpu) do { } while (0)
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1);
+ extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1) do { } while (0)
+# define time_hardirqs_off(a0, a1) do { } while (0)
+#endif

/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -1014,6 +1114,13 @@ struct task_struct {
unsigned int lockdep_recursion;
#endif

+#define MAX_PREEMPT_TRACE 16
+
+#ifdef CONFIG_PREEMPT_TRACE
+ unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE];
+ unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE];
+#endif
+
/* journalling filesystem info */
void *journal_info;

@@ -1636,6 +1743,7 @@ static inline unsigned int task_cpu(cons

static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
+ trace_change_sched_cpu(p, cpu);
task_thread_info(p)->cpu = cpu;
}

Index: linux-2.6.22-rc2/init/main.c
===================================================================
--- linux-2.6.22-rc2.orig/init/main.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/init/main.c 2007-05-24 15:57:42.000000000 +0200
@@ -576,6 +576,8 @@ asmlinkage void __init start_kernel(void
if (panic_later)
panic(panic_later, panic_param);

+ init_tracer();
+
lockdep_info();

/*
Index: linux-2.6.22-rc2/kernel/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/kernel/Makefile 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/Makefile 2007-05-24 15:57:42.000000000 +0200
@@ -38,6 +38,11 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_DEBUG_PREEMPT) += latency_trace.o
+obj-$(CONFIG_WAKEUP_TIMING) += latency_trace.o
+obj-$(CONFIG_EVENT_TRACE) += latency_trace.o
+obj-$(CONFIG_CRITICAL_TIMING) += latency_trace.o
+obj-$(CONFIG_LATENCY_HIST) += latency_hist.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
Index: linux-2.6.22-rc2/kernel/fork.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/fork.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/fork.c 2007-05-24 15:57:42.000000000 +0200
@@ -990,7 +990,7 @@ static struct task_struct *copy_process(

rt_mutex_init_task(p);

-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
Index: linux-2.6.22-rc2/kernel/latency_hist.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/kernel/latency_hist.c 2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,267 @@
+/*
+ * kernel/latency_hist.c
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ * Copyright (C) 2005 MontaVista Software, Inc.
+ * Yi Yang <yyang@xxxxxxxxxxxxx>
+ *
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/latency_hist.h>
+#include <linux/calc64.h>
+#include <asm/atomic.h>
+
+typedef struct hist_data_struct {
+ atomic_t hist_mode; /* 0 log, 1 don't log */
+ unsigned long min_lat;
+ unsigned long avg_lat;
+ unsigned long max_lat;
+ unsigned long long beyond_hist_bound_samples;
+ unsigned long long accumulate_lat;
+ unsigned long long total_samples;
+ unsigned long long hist_array[MAX_ENTRY_NUM];
+} hist_data_t;
+
+static struct proc_dir_entry * latency_hist_root = NULL;
+static char * latency_hist_proc_dir_root = "latency_hist";
+
+static char * percpu_proc_name = "CPU";
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist);
+static char * interrupt_off_hist_proc_dir = "interrupt_off_latency";
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, preempt_off_hist);
+static char * preempt_off_hist_proc_dir = "preempt_off_latency";
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist);
+static char * wakeup_latency_hist_proc_dir = "wakeup_latency";
+#endif
+
+static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS];
+
+static inline u64 u64_div(u64 x, u64 y)
+{
+ do_div(x, y);
+ return x;
+}
+
+void latency_hist(int latency_type, int cpu, unsigned long latency)
+{
+ hist_data_t * my_hist;
+
+ if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY)
+ || (latency_type > WAKEUP_LATENCY) || (latency < 0))
+ return;
+
+ switch(latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ case INTERRUPT_LATENCY:
+ my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu);
+ break;
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ case PREEMPT_LATENCY:
+ my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu);
+ break;
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ case WAKEUP_LATENCY:
+ my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu);
+ break;
+#endif
+ default:
+ return;
+ }
+
+ if (atomic_read(&my_hist->hist_mode) == 0)
+ return;
+
+ if (latency >= MAX_ENTRY_NUM)
+ my_hist->beyond_hist_bound_samples++;
+ else
+ my_hist->hist_array[latency]++;
+
+ if (latency < my_hist->min_lat)
+ my_hist->min_lat = latency;
+ else if (latency > my_hist->max_lat)
+ my_hist->max_lat = latency;
+
+ my_hist->total_samples++;
+ my_hist->accumulate_lat += latency;
+ my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat,
+ my_hist->total_samples);
+ return;
+}
+
+static void *l_start(struct seq_file *m, loff_t * pos)
+{
+ loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+ loff_t index = *pos;
+ hist_data_t *my_hist = (hist_data_t *) m->private;
+
+ if (!index_ptr)
+ return NULL;
+
+ if (index == 0) {
+ atomic_dec(&my_hist->hist_mode);
+ seq_printf(m, "#Minimum latency: %lu microseconds.\n"
+ "#Average latency: %lu microseconds.\n"
+ "#Maximum latency: %lu microseconds.\n"
+ "#Total samples: %llu\n"
+ "#There are %llu samples greater or equal than %d microseconds\n"
+ "#usecs\t%16s\n"
+ , my_hist->min_lat
+ , my_hist->avg_lat
+ , my_hist->max_lat
+ , my_hist->total_samples
+ , my_hist->beyond_hist_bound_samples
+ , MAX_ENTRY_NUM, "samples");
+ }
+ if (index >= MAX_ENTRY_NUM)
+ return NULL;
+
+ *index_ptr = index;
+ return index_ptr;
+}
+
+static void *l_next(struct seq_file *m, void *p, loff_t * pos)
+{
+ loff_t *index_ptr = p;
+ hist_data_t *my_hist = (hist_data_t *) m->private;
+
+ if (++*pos >= MAX_ENTRY_NUM) {
+ atomic_inc(&my_hist->hist_mode);
+ return NULL;
+ }
+ *index_ptr = *pos;
+ return index_ptr;
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+ kfree(p);
+}
+
+static int l_show(struct seq_file *m, void *p)
+{
+ int index = *(loff_t *) p;
+ hist_data_t *my_hist = (hist_data_t *) m->private;
+
+ seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]);
+ return 0;
+}
+
+static struct seq_operations latency_hist_seq_op = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show
+};
+
+static int latency_hist_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *entry_ptr = NULL;
+ int ret, i, j, break_flags = 0;
+ struct seq_file *seq;
+
+ entry_ptr = PDE(file->f_dentry->d_inode);
+ for (i = 0; i < LATENCY_TYPE_NUM; i++) {
+ for (j = 0; j < NR_CPUS; j++) {
+ if (entry[i][j] == NULL)
+ continue;
+ if (entry_ptr->low_ino == entry[i][j]->low_ino) {
+ break_flags = 1;
+ break;
+ }
+ }
+ if (break_flags == 1)
+ break;
+ }
+ ret = seq_open(file, &latency_hist_seq_op);
+ if (break_flags == 1) {
+ seq = (struct seq_file *)file->private_data;
+ seq->private = entry[i][j]->data;
+ }
+ return ret;
+}
+
+static struct file_operations latency_hist_seq_fops = {
+ .open = latency_hist_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static __init int latency_hist_init(void)
+{
+ struct proc_dir_entry *tmp_parent_proc_dir;
+ int i = 0, len = 0;
+ hist_data_t *my_hist;
+ char procname[64];
+
+ latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL);
+
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root);
+ for (i = 0; i < NR_CPUS; i++) {
+ len = sprintf(procname, "%s%d", percpu_proc_name, i);
+ procname[len] = '\0';
+ entry[INTERRUPT_LATENCY][i] =
+ create_proc_entry(procname, 0, tmp_parent_proc_dir);
+ entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i);
+ entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+ my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data;
+ atomic_set(&my_hist->hist_mode,1);
+ my_hist->min_lat = 0xFFFFFFFFUL;
+ }
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root);
+ for (i = 0; i < NR_CPUS; i++) {
+ len = sprintf(procname, "%s%d", percpu_proc_name, i);
+ procname[len] = '\0';
+ entry[PREEMPT_LATENCY][i] =
+ create_proc_entry(procname, 0, tmp_parent_proc_dir);
+ entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i);
+ entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+ my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data;
+ atomic_set(&my_hist->hist_mode,1);
+ my_hist->min_lat = 0xFFFFFFFFUL;
+ }
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root);
+ for (i = 0; i < NR_CPUS; i++) {
+ len = sprintf(procname, "%s%d", percpu_proc_name, i);
+ procname[len] = '\0';
+ entry[WAKEUP_LATENCY][i] =
+ create_proc_entry(procname, 0, tmp_parent_proc_dir);
+ entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i);
+ entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+ my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data;
+ atomic_set(&my_hist->hist_mode,1);
+ my_hist->min_lat = 0xFFFFFFFFUL;
+ }
+#endif
+ return 0;
+
+}
+
+__initcall(latency_hist_init);
+
Index: linux-2.6.22-rc2/kernel/latency_trace.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/kernel/latency_trace.c 2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,2763 @@
+/*
+ * kernel/latency_trace.c
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/rtc.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/version.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/proc_fs.h>
+#include <linux/latency_hist.h>
+#include <linux/utsrelease.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/rtc.h>
+#include <asm/asm-offsets.h>
+#include <linux/stacktrace.h>
+
+#ifndef DEFINE_RAW_SPINLOCK
+# define DEFINE_RAW_SPINLOCK DEFINE_SPINLOCK
+#endif
+
+#ifndef RAW_SPIN_LOCK_UNLOCKED
+# define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
+#endif
+
+int trace_use_raw_cycles = 0;
+
+#define __raw_spinlock_t raw_spinlock_t
+#define need_resched_delayed() 0
+
+#ifdef CONFIG_EVENT_TRACE
+/*
+ * Convert raw cycles to usecs.
+ * Note: this is not the 'clocksource cycles' value, it's the raw
+ * cycle counter cycles. We use GTOD to timestamp latency start/end
+ * points, but the trace entries inbetween are timestamped with
+ * get_cycles().
+ */
+static unsigned long notrace cycles_to_us(cycle_t delta)
+{
+ if (!trace_use_raw_cycles)
+ return cycles_to_usecs(delta);
+#ifdef CONFIG_X86
+ do_div(delta, cpu_khz/1000+1);
+#elif defined(CONFIG_PPC)
+ delta = mulhwu(tb_to_us, delta);
+#elif defined(CONFIG_ARM)
+ delta = mach_cycles_to_usecs(delta);
+#else
+ #error Implement cycles_to_usecs.
+#endif
+
+ return (unsigned long) delta;
+}
+#endif
+
+static notrace inline cycle_t now(void)
+{
+ if (trace_use_raw_cycles)
+ return get_cycles();
+ return get_monotonic_cycles();
+}
+
+#ifndef irqs_off
+# define irqs_off() 0
+#endif
+
+#ifndef DEBUG_WARN_ON
+static inline int DEBUG_WARN_ON(int cond)
+{
+ WARN_ON(cond);
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+# ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+# define irqs_off_preempt_count() preempt_count()
+# else
+# define irqs_off_preempt_count() 0
+# endif
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+struct sch_struct {
+ __raw_spinlock_t trace_lock;
+ struct task_struct *task;
+ int cpu;
+ struct cpu_trace *tr;
+} ____cacheline_aligned_in_smp;
+
+static __cacheline_aligned_in_smp struct sch_struct sch =
+ { trace_lock: __RAW_SPIN_LOCK_UNLOCKED };
+
+int wakeup_timing = 1;
+#endif
+
+/*
+ * Track maximum latencies and save the trace:
+ */
+
+/*
+ * trace_stop_sched_switched must not be called with runqueue locks held!
+ */
+static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex);
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp unsigned long max_sequence;
+
+enum trace_type
+{
+ __TRACE_FIRST_TYPE = 0,
+
+ TRACE_FN,
+ TRACE_SPECIAL,
+ TRACE_SPECIAL_PID,
+ TRACE_SPECIAL_U64,
+ TRACE_SPECIAL_SYM,
+ TRACE_CMDLINE,
+ TRACE_SYSCALL,
+ TRACE_SYSRET,
+
+ __TRACE_LAST_TYPE
+};
+
+enum trace_flag_type
+{
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_NEED_RESCHED = 0x02,
+ TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_IRQS_HARD_OFF = 0x20,
+};
+
+/*
+ * Maximum preemption latency measured. Initialize to maximum,
+ * we clear it after bootup.
+ */
+#ifdef CONFIG_LATENCY_HIST
+unsigned long preempt_max_latency = (cycle_t)0UL;
+#else
+unsigned long preempt_max_latency = (cycle_t)ULONG_MAX;
+#endif
+
+unsigned long preempt_thresh;
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+ if (latency_hist_flag && !trace_user_triggered)
+ return 1;
+
+ if (preempt_thresh) {
+ if (delta < preempt_thresh)
+ return 0;
+ } else {
+ if (delta <= preempt_max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_EVENT_TRACE
+
+/*
+ * Number of per-CPU trace entries:
+ */
+#define MAX_TRACE (65536UL*16UL)
+
+#define CMDLINE_BYTES 16
+
+/*
+ * 32 bytes on 32-bit platforms:
+ */
+struct trace_entry {
+ char type;
+ char cpu;
+ char flags;
+ char preempt_count; // assumes PREEMPT_MASK is 8 bits or less
+ int pid;
+ cycle_t timestamp;
+ union {
+ struct {
+ unsigned long eip;
+ unsigned long parent_eip;
+ } fn;
+ struct {
+ unsigned long eip;
+ unsigned long v1, v2, v3;
+ } special;
+ struct {
+ unsigned char str[CMDLINE_BYTES];
+ } cmdline;
+ struct {
+ unsigned long nr; // highest bit: compat call
+ unsigned long p1, p2, p3;
+ } syscall;
+ struct {
+ unsigned long ret;
+ } sysret;
+ struct {
+ unsigned long __pad3[4];
+ } pad;
+ } u;
+} __attribute__((packed));
+
+#endif
+
+struct cpu_trace {
+ atomic_t disabled;
+ unsigned long trace_idx;
+ cycle_t preempt_timestamp;
+ unsigned long critical_start, critical_end;
+ unsigned long critical_sequence;
+ atomic_t underrun;
+ atomic_t overrun;
+ int early_warning;
+ int latency_type;
+ int cpu;
+
+#ifdef CONFIG_EVENT_TRACE
+ struct trace_entry *trace;
+ char comm[CMDLINE_BYTES];
+ pid_t pid;
+ unsigned long uid;
+ unsigned long nice;
+ unsigned long policy;
+ unsigned long rt_priority;
+ unsigned long saved_latency;
+#endif
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ unsigned long stack_check;
+#endif
+} ____cacheline_aligned_in_smp;
+
+static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp =
+{ [0 ... NR_CPUS-1] = {
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ .stack_check = 1
+#endif
+ } };
+
+#ifdef CONFIG_EVENT_TRACE
+
+int trace_enabled = 0;
+int syscall_tracing = 1;
+int stackframe_tracing = 0;
+int mcount_enabled = 0;
+int trace_freerunning = 0;
+int trace_print_on_crash = 0;
+int trace_verbose = 0;
+int trace_all_cpus = 0;
+int print_functions = 0;
+int trace_all_runnable = 0;
+
+/*
+ * user-triggered via gettimeofday(0,1)/gettimeofday(0,0)
+ */
+int trace_user_triggered = 0;
+int trace_user_trigger_irq = -1;
+
+void trace_start_ht_debug(void)
+{
+ trace_all_cpus = 0;
+ trace_freerunning = 1;
+ trace_user_triggered = 1;
+ mcount_enabled = 1;
+ trace_enabled = 1;
+ user_trace_start();
+}
+
+struct saved_trace_struct {
+ int cpu;
+ cycle_t first_timestamp, last_timestamp;
+ struct cpu_trace traces[NR_CPUS];
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The current worst-case trace:
+ */
+static struct saved_trace_struct max_tr;
+
+/*
+ * /proc/latency_trace atomicity:
+ */
+static DECLARE_MUTEX(out_mutex);
+
+static struct saved_trace_struct out_tr;
+
+static void notrace printk_name(unsigned long eip)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ printk("%s+%#lx/%#lx", sym_name, offset, size);
+ else
+ printk("<%08lx>", eip);
+}
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+#ifndef STACK_WARN
+# define STACK_WARN (THREAD_SIZE/8)
+#endif
+
+#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN)
+#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info))
+
+#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER)
+# define PRINT_EXACT_STACKFRAME
+#endif
+
+#ifdef PRINT_EXACT_STACKFRAME
+static unsigned long *worst_stack_bp;
+#endif
+static DEFINE_RAW_SPINLOCK(worst_stack_lock);
+unsigned long worst_stack_left = THREAD_SIZE;
+static unsigned long worst_stack_printed = THREAD_SIZE;
+static char worst_stack_comm[TASK_COMM_LEN+1];
+static int worst_stack_pid;
+static unsigned long worst_stack_sp;
+static char worst_stack[THREAD_SIZE];
+
+static notrace void fill_worst_stack(unsigned long stack_left)
+{
+ unsigned long flags;
+
+ /*
+ * On x64, we must not read the PDA during early bootup:
+ */
+#ifdef CONFIG_X86_64
+ if (system_state == SYSTEM_BOOTING)
+ return;
+#endif
+ spin_lock_irqsave(&worst_stack_lock, flags);
+ if (likely(stack_left < worst_stack_left)) {
+ worst_stack_left = stack_left;
+ memcpy(worst_stack, current_thread_info(), THREAD_SIZE);
+ worst_stack_sp = (unsigned long)&stack_left;
+ memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN);
+ worst_stack_pid = current->pid;
+#ifdef PRINT_EXACT_STACKFRAME
+# ifdef __i386__
+ asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp));
+# elif defined(__x86_64__)
+ asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp));
+# else
+# error Poke the author of above asm code lines !
+# endif
+#endif
+ }
+ spin_unlock_irqrestore(&worst_stack_lock, flags);
+}
+
+#ifdef PRINT_EXACT_STACKFRAME
+
+/*
+ * This takes a BP offset to point the BP back into the saved stack,
+ * the original stack might be long gone (but the stackframe within
+ * the saved copy still contains references to it).
+ */
+#define CONVERT_TO_SAVED_STACK(bp) \
+ ((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1)))
+
+static void show_stackframe(void)
+{
+ unsigned long addr, frame_size, *bp, *prev_bp, sum = 0;
+
+ bp = CONVERT_TO_SAVED_STACK(worst_stack_bp);
+
+ while (bp[0]) {
+ addr = bp[1];
+ if (!kernel_text_address(addr))
+ break;
+
+ prev_bp = bp;
+ bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]);
+
+ frame_size = (bp - prev_bp) * sizeof(long);
+
+ if (frame_size < THREAD_SIZE) {
+ printk("{ %4ld} ", frame_size);
+ sum += frame_size;
+ } else
+ printk("{=%4ld} ", sum);
+
+ printk("[<%08lx>] ", addr);
+ printk_name(addr);
+ printk("\n");
+ }
+}
+
+#else
+
+static inline int valid_stack_ptr(void *p)
+{
+ return p > (void *)worst_stack &&
+ p < (void *)worst_stack + THREAD_SIZE - 3;
+}
+
+static void show_stackframe(void)
+{
+ unsigned long prev_frame, addr;
+ unsigned long *stack;
+
+ prev_frame = (unsigned long)(worst_stack +
+ (worst_stack_sp & (THREAD_SIZE-1)));
+ stack = (unsigned long *)prev_frame;
+
+ while (valid_stack_ptr(stack)) {
+ addr = *stack++;
+ if (__kernel_text_address(addr)) {
+ printk("(%4ld) ", (unsigned long)stack - prev_frame);
+ printk("[<%08lx>] ", addr);
+ print_symbol("%s\n", addr);
+ prev_frame = (unsigned long)stack;
+ }
+ if ((char *)stack >= worst_stack + THREAD_SIZE)
+ break;
+ }
+}
+
+#endif
+
+static notrace void __print_worst_stack(void)
+{
+ unsigned long fill_ratio;
+ printk("----------------------------->\n");
+ printk("| new stack fill maximum: %s/%d, %ld bytes (out of %ld bytes).\n",
+ worst_stack_comm, worst_stack_pid,
+ MAX_STACK-worst_stack_left, (long)MAX_STACK);
+ fill_ratio = (MAX_STACK-worst_stack_left)*100/(long)MAX_STACK;
+ printk("| Stack fill ratio: %02ld%%", fill_ratio);
+ if (fill_ratio >= 90)
+ printk(" - BUG: that's quite high, please report this!\n");
+ else
+ printk(" - that's still OK, no need to report this.\n");
+ printk("------------|\n");
+
+ show_stackframe();
+ printk("<---------------------------\n\n");
+}
+
+static notrace void print_worst_stack(void)
+{
+ unsigned long flags;
+
+ if (irqs_disabled() || preempt_count())
+ return;
+
+ spin_lock_irqsave(&worst_stack_lock, flags);
+ if (worst_stack_printed == worst_stack_left) {
+ spin_unlock_irqrestore(&worst_stack_lock, flags);
+ return;
+ }
+ worst_stack_printed = worst_stack_left;
+ spin_unlock_irqrestore(&worst_stack_lock, flags);
+
+ __print_worst_stack();
+}
+
+static notrace void debug_stackoverflow(struct cpu_trace *tr)
+{
+ long stack_left;
+
+ if (unlikely(tr->stack_check <= 0))
+ return;
+ atomic_inc(&tr->disabled);
+
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+#ifdef __i386__
+ __asm__ __volatile__("and %%esp,%0" :
+ "=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#elif defined(__x86_64__)
+ __asm__ __volatile__("and %%rsp,%0" :
+ "=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#else
+# error Poke the author of above asm code lines !
+#endif
+ if (unlikely(stack_left < MIN_STACK_NEEDED)) {
+ tr->stack_check = 0;
+ printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n",
+ stack_left - sizeof(struct thread_info),
+ (long)&stack_left,
+ (long)current_thread_info(),
+ (long)current_thread_info() + THREAD_SIZE);
+ fill_worst_stack(stack_left);
+ __print_worst_stack();
+ goto out;
+ }
+ if (unlikely(stack_left < worst_stack_left)) {
+ tr->stack_check--;
+ fill_worst_stack(stack_left);
+ print_worst_stack();
+ tr->stack_check++;
+ } else
+ if (worst_stack_printed != worst_stack_left) {
+ tr->stack_check--;
+ print_worst_stack();
+ tr->stack_check++;
+ }
+out:
+ atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+static void notrace early_printk_name(unsigned long eip)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ early_printk("%s <%08lx>", sym_name, eip);
+ else
+ early_printk("<%08lx>", eip);
+}
+
+static __raw_spinlock_t early_print_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
+static void notrace early_print_entry(struct trace_entry *entry)
+{
+ int hardirq, softirq;
+
+ __raw_spin_lock(&early_print_lock);
+ early_printk("%-5d ", entry->pid);
+
+ early_printk("%d%c%c",
+ entry->cpu,
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ if (hardirq && softirq)
+ early_printk("H");
+ else {
+ if (hardirq)
+ early_printk("h");
+ else {
+ if (softirq)
+ early_printk("s");
+ else
+ early_printk(".");
+ }
+ }
+
+ early_printk(":%d: ", entry->preempt_count);
+
+ if (entry->type == TRACE_FN) {
+ early_printk_name(entry->u.fn.eip);
+ early_printk(" <= (");
+ early_printk_name(entry->u.fn.parent_eip);
+ early_printk(")\n");
+ } else {
+ /* special entries: */
+ early_printk_name(entry->u.special.eip);
+ early_printk(": <%08lx> <%08lx> <%08lx>\n",
+ entry->u.special.v1,
+ entry->u.special.v2,
+ entry->u.special.v3);
+ }
+ __raw_spin_unlock(&early_print_lock);
+}
+#else
+# define early_print_entry(x) do { } while(0)
+#endif
+
+static void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+ unsigned long eip, unsigned long parent_eip,
+ unsigned long v1, unsigned long v2, unsigned long v3,
+ unsigned long flags)
+{
+ struct trace_entry *entry;
+ unsigned long idx, idx_next;
+ cycle_t timestamp;
+ u32 pc;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+// WARN_ON(!atomic_read(&tr->disabled));
+#endif
+ if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus &&
+ !trace_print_on_crash && !print_functions)
+ goto out;
+ /*
+ * Allocate the next index. Make sure an NMI (or interrupt)
+ * has not taken it away. Potentially redo the timestamp as
+ * well to make sure the trace timestamps are in chronologic
+ * order.
+ */
+again:
+ idx = tr->trace_idx;
+ idx_next = idx + 1;
+ timestamp = now();
+
+ if (unlikely((trace_freerunning || print_functions || atomic_read(&tr->underrun)) &&
+ (idx_next >= MAX_TRACE) && !atomic_read(&tr->overrun))) {
+ atomic_inc(&tr->underrun);
+ idx_next = 0;
+ }
+ if (unlikely(idx >= MAX_TRACE)) {
+ atomic_inc(&tr->overrun);
+ goto out;
+ }
+#ifdef __HAVE_ARCH_CMPXCHG
+ if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) {
+ if (idx_next == 0)
+ atomic_dec(&tr->underrun);
+ goto again;
+ }
+#else
+# ifdef CONFIG_SMP
+# error CMPXCHG missing
+# else
+ /* No worry, we are protected by the atomic_incr(&tr->disabled)
+ * in __trace further down
+ */
+ tr->trace_idx = idx_next;
+# endif
+#endif
+ if (unlikely(idx_next != 0 && atomic_read(&tr->underrun)))
+ atomic_inc(&tr->underrun);
+
+ pc = preempt_count();
+
+ if (unlikely(!tr->trace))
+ goto out;
+ entry = tr->trace + idx;
+ entry->type = type;
+#ifdef CONFIG_SMP
+ entry->cpu = cpu;
+#endif
+ entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) |
+ (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)|
+ ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+ ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (need_resched_delayed() ? TRACE_FLAG_NEED_RESCHED_DELAYED : 0);
+ entry->preempt_count = pc & 0xff;
+ entry->pid = current->pid;
+ entry->timestamp = timestamp;
+
+ switch (type) {
+ case TRACE_FN:
+ entry->u.fn.eip = eip;
+ entry->u.fn.parent_eip = parent_eip;
+ if (unlikely(print_functions && !in_interrupt()))
+ early_print_entry(entry);
+ break;
+ case TRACE_SPECIAL:
+ case TRACE_SPECIAL_PID:
+ case TRACE_SPECIAL_U64:
+ case TRACE_SPECIAL_SYM:
+ entry->u.special.eip = eip;
+ entry->u.special.v1 = v1;
+ entry->u.special.v2 = v2;
+ entry->u.special.v3 = v3;
+ if (unlikely(print_functions && !in_interrupt()))
+ early_print_entry(entry);
+ break;
+ case TRACE_SYSCALL:
+ entry->u.syscall.nr = eip;
+ entry->u.syscall.p1 = v1;
+ entry->u.syscall.p2 = v2;
+ entry->u.syscall.p3 = v3;
+ break;
+ case TRACE_SYSRET:
+ entry->u.sysret.ret = eip;
+ break;
+ case TRACE_CMDLINE:
+ memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES);
+ break;
+ default:
+ break;
+ }
+out:
+ ;
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+ unsigned long v1, unsigned long v2,
+ unsigned long v3)
+{
+ struct cpu_trace *tr;
+ unsigned long flags;
+ int cpu;
+
+ if (unlikely(trace_enabled <= 0))
+ return;
+
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86)
+ debug_stackoverflow(cpu_traces + raw_smp_processor_id());
+#endif
+
+ raw_local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ /*
+ * Trace on the CPU where the current highest-prio task
+ * is waiting to become runnable:
+ */
+#ifdef CONFIG_WAKEUP_TIMING
+ if (wakeup_timing && !trace_all_cpus && !trace_print_on_crash &&
+ !print_functions) {
+ if (!sch.tr || cpu != sch.cpu)
+ goto out;
+ tr = sch.tr;
+ } else
+ tr = cpu_traces + cpu;
+#else
+ tr = cpu_traces + cpu;
+#endif
+ atomic_inc(&tr->disabled);
+ if (likely(atomic_read(&tr->disabled) == 1)) {
+//#define DEBUG_STACK_POISON
+#ifdef DEBUG_STACK_POISON
+ char stack;
+
+ memset(&stack - 128, 0x34, 128);
+#endif
+ ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags);
+ }
+ atomic_dec(&tr->disabled);
+#ifdef CONFIG_WAKEUP_TIMING
+out:
+#endif
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Special, ad-hoc tracepoints:
+ */
+void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3)
+{
+ ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3);
+}
+
+EXPORT_SYMBOL(trace_special);
+
+void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2)
+{
+ ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2);
+}
+
+EXPORT_SYMBOL(trace_special_pid);
+
+void notrace trace_special_u64(unsigned long long v1, unsigned long v2)
+{
+ ___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0,
+ (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF),
+ v2);
+}
+
+EXPORT_SYMBOL(trace_special_u64);
+
+void notrace trace_special_sym(void)
+{
+#define STACK_ENTRIES 8
+ unsigned long entries[STACK_ENTRIES];
+ struct stack_trace trace;
+
+ if (!trace_enabled)
+ return;
+
+ if (!stackframe_tracing)
+ return ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, CALLER_ADDR1, 0, 0);
+
+ trace.entries = entries;
+ trace.skip = 3;
+#if 0
+ trace.all_contexts = 1;
+#endif
+ trace.max_entries = STACK_ENTRIES;
+ trace.nr_entries = 0;
+
+#if 0
+ save_stack_trace(&trace, NULL);
+#else
+ save_stack_trace(&trace);
+#endif
+ /*
+ * clear out the rest:
+ */
+ while (trace.nr_entries < trace.max_entries)
+ entries[trace.nr_entries++] = 0;
+
+ ___trace(TRACE_SPECIAL_SYM, entries[0], 0,
+ entries[1], entries[2], entries[3]);
+ ___trace(TRACE_SPECIAL_SYM, entries[4], 0,
+ entries[5], entries[6], entries[7]);
+}
+
+EXPORT_SYMBOL(trace_special_sym);
+
+/*
+ * Non-inlined function:
+ */
+void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+ ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0);
+}
+
+#ifdef CONFIG_MCOUNT
+
+extern void mcount(void);
+
+EXPORT_SYMBOL(mcount);
+
+void notrace __mcount(void)
+{
+ ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0);
+}
+
+#endif
+
+void notrace
+sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3)
+{
+ if (syscall_tracing)
+ ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3);
+}
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+
+void notrace
+sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2,
+ unsigned long p3)
+{
+ if (syscall_tracing)
+ ___trace(TRACE_SYSCALL, nr | 0x80000000, 0, p1, p2, p3);
+}
+
+#endif
+
+void notrace sys_ret(unsigned long ret)
+{
+ if (syscall_tracing)
+ ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0);
+}
+
+static void notrace print_name(struct seq_file *m, unsigned long eip)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ /*
+ * Special trace values:
+ */
+ if (((long)eip < 10000L) && ((long)eip > -10000L)) {
+ seq_printf(m, "(%5ld)", eip);
+ return;
+ }
+ sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ seq_puts(m, sym_name);
+ else
+ seq_printf(m, "<%08lx>", eip);
+}
+
+static void notrace print_name_offset(struct seq_file *m, unsigned long eip)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ seq_printf(m, "%s+%#lx/%#lx <%08lx>",
+ sym_name, offset, size, eip);
+ else
+ seq_printf(m, "<%08lx>", eip);
+}
+
+static unsigned long out_sequence = -1;
+
+static int pid_to_cmdline_array[PID_MAX_DEFAULT+1];
+
+static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+ ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags);
+}
+
+void notrace trace_cmdline(void)
+{
+ ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0);
+}
+
+static void construct_pid_to_cmdline(struct cpu_trace *tr)
+{
+ unsigned int i, j, entries, pid;
+
+ if (tr->critical_sequence == out_sequence)
+ return;
+ out_sequence = tr->critical_sequence;
+
+ memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1));
+
+ if (!tr->trace)
+ return;
+
+ entries = min(tr->trace_idx, MAX_TRACE);
+
+ for (i = 0; i < entries; i++) {
+ struct trace_entry *entry = tr->trace + i;
+
+ if (entry->type != TRACE_CMDLINE)
+ continue;
+ pid = entry->pid;
+ if (pid < PID_MAX_DEFAULT) {
+ pid_to_cmdline_array[pid] = i;
+ /*
+ * Replace space with underline - makes it easier
+ * to process for tools:
+ */
+ for (j = 0; j < CMDLINE_BYTES; j++)
+ if (entry->u.cmdline.str[j] == ' ')
+ entry->u.cmdline.str[j] = '_';
+ }
+ }
+}
+
+char *pid_to_cmdline(unsigned long pid)
+{
+ struct cpu_trace *tr = out_tr.traces + 0;
+ char *cmdline = "<...>";
+ int idx;
+
+ pid = min(pid, (unsigned long)PID_MAX_DEFAULT);
+ if (!pid)
+ return "<idle>";
+
+ if (pid_to_cmdline_array[pid] != -1) {
+ idx = pid_to_cmdline_array[pid];
+ if (tr->trace[idx].type == TRACE_CMDLINE)
+ cmdline = tr->trace[idx].u.cmdline.str;
+ }
+ return cmdline;
+}
+
+static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr, int reorder)
+{
+ if (!save->trace || !tr->trace)
+ return;
+ /* free-running needs reordering */
+ if (reorder && atomic_read(&tr->underrun)) {
+ int i, idx, idx0 = tr->trace_idx;
+
+ for (i = 0; i < MAX_TRACE; i++) {
+ idx = (idx0 + i) % MAX_TRACE;
+ save->trace[i] = tr->trace[idx];
+ }
+ save->trace_idx = MAX_TRACE;
+ } else {
+ save->trace_idx = tr->trace_idx;
+
+ memcpy(save->trace, tr->trace,
+ min(save->trace_idx, MAX_TRACE) *
+ sizeof(struct trace_entry));
+ }
+ save->underrun = tr->underrun;
+ save->overrun = tr->overrun;
+}
+
+
+struct block_idx {
+ int idx[NR_CPUS];
+};
+
+/*
+ * return the trace entry (position) of the smallest-timestamp
+ * one (that is still in the valid idx range):
+ */
+static int min_idx(struct block_idx *bidx)
+{
+ cycle_t min_stamp = (cycle_t) -1;
+ struct trace_entry *entry;
+ int cpu, min_cpu = -1, idx;
+
+ for_each_online_cpu(cpu) {
+ idx = bidx->idx[cpu];
+ if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE))
+ continue;
+ if (idx > MAX_TRACE*NR_CPUS) {
+ printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE,
+ NR_CPUS);
+ WARN_ON(1);
+ break;
+ }
+ entry = max_tr.traces[cpu].trace + bidx->idx[cpu];
+ if (entry->timestamp < min_stamp) {
+ min_cpu = cpu;
+ min_stamp = entry->timestamp;
+ }
+ }
+
+ return min_cpu;
+}
+
+/*
+ * This code is called to construct an output trace from
+ * the maximum trace. Having separate traces serves both
+ * atomicity (a new max might be saved while we are busy
+ * accessing /proc/latency_trace) and it is also used to
+ * delay the (expensive) sorting of the output trace by
+ * timestamps, in the trace_all_cpus case.
+ */
+static void update_out_trace(void)
+{
+ struct trace_entry *out_entry, *entry, *tmp;
+ cycle_t stamp, first_stamp, last_stamp;
+ struct block_idx bidx = { { 0, }, };
+ struct cpu_trace *tmp_max, *tmp_out;
+ int cpu, sum, entries, underrun_sum, overrun_sum;
+
+ /*
+ * For out_tr we only have the first array's trace entries
+ * allocated - and they have are larger on SMP to make room
+ * for all trace entries from all CPUs.
+ */
+ tmp_out = out_tr.traces + 0;
+ tmp_max = max_tr.traces + max_tr.cpu;
+ /*
+ * Easier to copy this way. Note: the trace buffer is private
+ * to the output buffer, so preserve it:
+ */
+ copy_trace(tmp_out, tmp_max, 0);
+ tmp = tmp_out->trace;
+ *tmp_out = *tmp_max;
+ tmp_out->trace = tmp;
+
+ out_tr.cpu = max_tr.cpu;
+
+ if (!tmp_out->trace)
+ return;
+
+ out_entry = tmp_out->trace + 0;
+
+ if (!trace_all_cpus) {
+ entries = min(tmp_out->trace_idx, MAX_TRACE);
+ if (!entries)
+ return;
+ out_tr.first_timestamp = tmp_out->trace[0].timestamp;
+ out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp;
+ return;
+ }
+ /*
+ * Find the range of timestamps that are fully traced in
+ * all CPU traces. (since CPU traces can cover a variable
+ * range of time, we have to find the best range.)
+ */
+ first_stamp = 0;
+ for_each_online_cpu(cpu) {
+ tmp_max = max_tr.traces + cpu;
+ stamp = tmp_max->trace[0].timestamp;
+ if (stamp > first_stamp)
+ first_stamp = stamp;
+ }
+ /*
+ * Save the timestamp range:
+ */
+ tmp_max = max_tr.traces + max_tr.cpu;
+ entries = min(tmp_max->trace_idx, MAX_TRACE);
+ /*
+ * No saved trace yet?
+ */
+ if (!entries) {
+ out_tr.traces[0].trace_idx = 0;
+ return;
+ }
+
+ last_stamp = tmp_max->trace[entries-1].timestamp;
+
+ if (last_stamp < first_stamp) {
+ WARN_ON(1);
+
+ for_each_online_cpu(cpu) {
+ tmp_max = max_tr.traces + cpu;
+ entries = min(tmp_max->trace_idx, MAX_TRACE);
+ printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n",
+ cpu,
+ tmp_max->trace[0].timestamp,
+ tmp_max->trace[1].timestamp,
+ entries,
+ tmp_max->trace[entries-2].timestamp,
+ tmp_max->trace[entries-1].timestamp);
+ }
+ tmp_max = max_tr.traces + max_tr.cpu;
+ entries = min(tmp_max->trace_idx, MAX_TRACE);
+
+ printk("CPU%d entries: %d\n", max_tr.cpu, entries);
+ printk("first stamp: %016Lx\n", first_stamp);
+ printk(" last stamp: %016Lx\n", first_stamp);
+ }
+
+#if 0
+ printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp);
+ printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp);
+ printk(" +1 stamp: %Ld [%016Lx]\n",
+ tmp_max->trace[entries].timestamp,
+ tmp_max->trace[entries].timestamp);
+ printk(" +2 stamp: %Ld [%016Lx]\n",
+ tmp_max->trace[entries+1].timestamp,
+ tmp_max->trace[entries+1].timestamp);
+ printk(" delta: %Ld\n", last_stamp-first_stamp);
+ printk(" entries: %d\n", entries);
+#endif
+
+ out_tr.first_timestamp = first_stamp;
+ out_tr.last_timestamp = last_stamp;
+
+ /*
+ * Fetch trace entries one by one, in increasing timestamp
+ * order. Start at first_stamp, stop at last_stamp:
+ */
+ sum = 0;
+ for (;;) {
+ cpu = min_idx(&bidx);
+ if (cpu == -1)
+ break;
+ entry = max_tr.traces[cpu].trace + bidx.idx[cpu];
+ if (entry->timestamp > last_stamp)
+ break;
+
+ bidx.idx[cpu]++;
+ if (entry->timestamp < first_stamp)
+ continue;
+ *out_entry = *entry;
+ out_entry++;
+ sum++;
+ if (sum > MAX_TRACE*NR_CPUS) {
+ printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE,
+ NR_CPUS);
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ sum = 0;
+ underrun_sum = 0;
+ overrun_sum = 0;
+ for_each_online_cpu(cpu) {
+ sum += max_tr.traces[cpu].trace_idx;
+ underrun_sum += atomic_read(&max_tr.traces[cpu].underrun);
+ overrun_sum += atomic_read(&max_tr.traces[cpu].overrun);
+ }
+ tmp_out->trace_idx = sum;
+ atomic_set(&tmp_out->underrun, underrun_sum);
+ atomic_set(&tmp_out->overrun, overrun_sum);
+}
+
+static void notrace print_help_header(struct seq_file *m)
+{
+ seq_puts(m, " _------=> CPU# \n");
+ seq_puts(m, " / _-----=> irqs-off \n");
+ seq_puts(m, " | / _----=> need-resched \n");
+ seq_puts(m, " || / _---=> hardirq/softirq \n");
+ seq_puts(m, " ||| / _--=> preempt-depth \n");
+ seq_puts(m, " |||| / \n");
+ seq_puts(m, " ||||| delay \n");
+ seq_puts(m, " cmd pid ||||| time | caller \n");
+ seq_puts(m, " \\ / ||||| \\ | / \n");
+}
+
+static void * notrace l_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+ unsigned long entries;
+ struct cpu_trace *tr = out_tr.traces + 0;
+
+ down(&out_mutex);
+ /*
+ * if the file is being read newly, update the output trace:
+ */
+ if (!n) {
+ // TODO: use the sequence counter here to optimize
+ down(&max_mutex);
+ update_out_trace();
+ up(&max_mutex);
+#if 0
+ if (!tr->trace_idx) {
+ up(&out_mutex);
+ return NULL;
+ }
+#endif
+ construct_pid_to_cmdline(tr);
+ }
+ entries = min(tr->trace_idx, MAX_TRACE);
+
+ if (!n) {
+ seq_printf(m, "preemption latency trace v1.1.5 on %s\n",
+ UTS_RELEASE);
+ seq_puts(m, "--------------------------------------------------------------------\n");
+ seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+ cycles_to_usecs(tr->saved_latency),
+ entries,
+ (entries + atomic_read(&tr->underrun) +
+ atomic_read(&tr->overrun)),
+ out_tr.cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+ "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+ "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+ "preempt",
+#else
+ "rt",
+#endif
+ 0, 0,
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ softirq_preemption
+#else
+ 0
+#endif
+ ,
+#ifdef CONFIG_PREEMPT_HARDIRQS
+ hardirq_preemption
+#else
+ 0
+#endif
+ );
+#ifdef CONFIG_SMP
+ seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+ seq_puts(m, ")\n");
+#endif
+ seq_puts(m, " -----------------\n");
+ seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n",
+ tr->comm, tr->pid, tr->uid, tr->nice,
+ tr->policy, tr->rt_priority);
+ seq_puts(m, " -----------------\n");
+ if (trace_user_triggered) {
+ seq_puts(m, " => started at: ");
+ print_name_offset(m, tr->critical_start);
+ seq_puts(m, "\n => ended at: ");
+ print_name_offset(m, tr->critical_end);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "\n");
+
+ if (!trace_verbose)
+ print_help_header(m);
+ }
+ if (n >= entries || !tr->trace)
+ return NULL;
+
+ return tr->trace + n;
+}
+
+static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct cpu_trace *tr = out_tr.traces;
+ unsigned long entries = min(tr->trace_idx, MAX_TRACE);
+
+ WARN_ON(!tr->trace);
+
+ if (++*pos >= entries) {
+ if (*pos == entries)
+ seq_puts(m, "\n\nvim:ft=help\n");
+ return NULL;
+ }
+ return tr->trace + *pos;
+}
+
+static void notrace l_stop(struct seq_file *m, void *p)
+{
+ up(&out_mutex);
+}
+
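+/*
+ * Annotate the "delay" column of the trace output: '!' marks an entry
+ * followed by a gap of more than 100 usecs until the next entry,
+ * '+' marks a gap of more than 1 usec:
+ */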
+static void print_timestamp(struct seq_file *m, unsigned long abs_usecs,
+ unsigned long rel_usecs)
+{
+ seq_printf(m, " %4ldus", abs_usecs);
+ if (rel_usecs > 100)
+ seq_puts(m, "!: ");
+ else if (rel_usecs > 1)
+ seq_puts(m, "+: ");
+ else
+ seq_puts(m, " : ");
+}
+
+static void
+print_timestamp_short(struct seq_file *m, unsigned long abs_usecs,
+ unsigned long rel_usecs)
+{
+ seq_printf(m, " %4ldus", abs_usecs);
+ if (rel_usecs > 100)
+ seq_putc(m, '!');
+ else if (rel_usecs > 1)
+ seq_putc(m, '+');
+ else
+ seq_putc(m, ' ');
+}
+
+static void
+print_generic(struct seq_file *m, struct trace_entry *entry)
+{
+ int hardirq, softirq;
+
+ seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid);
+ seq_printf(m, "%d", entry->cpu);
+ seq_printf(m, "%c%c",
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ if (hardirq && softirq)
+ seq_putc(m, 'H');
+ else {
+ if (hardirq)
+ seq_putc(m, 'h');
+ else {
+ if (softirq)
+ seq_putc(m, 's');
+ else
+ seq_putc(m, '.');
+ }
+ }
+
+ if (entry->preempt_count)
+ seq_printf(m, "%x", entry->preempt_count);
+ else
+ seq_puts(m, ".");
+}
+
+
+static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry)
+{
+ unsigned long abs_usecs, rel_usecs;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ if (trace_verbose) {
+ seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ",
+ pid_to_cmdline(entry->pid),
+ entry->pid, entry->cpu, entry->flags,
+ entry->preempt_count, trace_idx,
+ entry->timestamp, abs_usecs/1000,
+ abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+ print_name_offset(m, entry->u.fn.eip);
+ seq_puts(m, " (");
+ print_name_offset(m, entry->u.fn.parent_eip);
+ seq_puts(m, ")\n");
+ } else {
+ print_generic(m, entry);
+ print_timestamp(m, abs_usecs, rel_usecs);
+ print_name(m, entry->u.fn.eip);
+ seq_puts(m, " (");
+ print_name(m, entry->u.fn.parent_eip);
+ seq_puts(m, ")\n");
+ }
+ return 0;
+}
+
+static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry, int mode64)
+{
+ unsigned long abs_usecs, rel_usecs;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ print_generic(m, entry);
+ print_timestamp(m, abs_usecs, rel_usecs);
+ if (trace_verbose)
+ print_name_offset(m, entry->u.special.eip);
+ else
+ print_name(m, entry->u.special.eip);
+
+ if (!mode64) {
+ /*
+ * For convenience, print small numbers in decimal:
+ */
+ if (abs((int)entry->u.special.v1) < 10000)
+ seq_printf(m, " (%5ld ", entry->u.special.v1);
+ else
+ seq_printf(m, " (%lx ", entry->u.special.v1);
+ if (abs((int)entry->u.special.v2) < 10000)
+ seq_printf(m, "%5ld ", entry->u.special.v2);
+ else
+ seq_printf(m, "%lx ", entry->u.special.v2);
+ if (abs((int)entry->u.special.v3) < 10000)
+ seq_printf(m, "%5ld)\n", entry->u.special.v3);
+ else
+ seq_printf(m, "%lx)\n", entry->u.special.v3);
+ } else {
+ seq_printf(m, " (%13Ld %ld)\n",
+ ((u64)entry->u.special.v1 << 32)
+ + (u64)entry->u.special.v2, entry->u.special.v3);
+ }
+ return 0;
+}
+
+static int notrace
+l_show_special_pid(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry)
+{
+ unsigned long abs_usecs, rel_usecs;
+ unsigned int pid;
+
+ pid = entry->u.special.v1;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ print_generic(m, entry);
+ print_timestamp(m, abs_usecs, rel_usecs);
+ if (trace_verbose)
+ print_name_offset(m, entry->u.special.eip);
+ else
+ print_name(m, entry->u.special.eip);
+ seq_printf(m, " <%.8s-%d> (%ld %ld)\n",
+ pid_to_cmdline(pid), pid,
+ entry->u.special.v2, entry->u.special.v3);
+
+ return 0;
+}
+
+static int notrace
+l_show_special_sym(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry, int mode64)
+{
+ unsigned long abs_usecs, rel_usecs;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ print_generic(m, entry);
+ print_timestamp(m, abs_usecs, rel_usecs);
+ if (trace_verbose)
+ print_name_offset(m, entry->u.special.eip);
+ else
+ print_name(m, entry->u.special.eip);
+
+ seq_puts(m, "()<-");
+ print_name(m, entry->u.special.v1);
+ seq_puts(m, "()<-");
+ print_name(m, entry->u.special.v2);
+ seq_puts(m, "()<-");
+ print_name(m, entry->u.special.v3);
+ seq_puts(m, "()\n");
+
+ return 0;
+}
+
+
+static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry)
+{
+ unsigned long abs_usecs, rel_usecs;
+
+ if (!trace_verbose)
+ return 0;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ seq_printf(m,
+ "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n",
+ entry->u.cmdline.str,
+ abs_usecs/1000, abs_usecs % 1000,
+ rel_usecs/1000, rel_usecs % 1000);
+
+ return 0;
+}
+
+extern unsigned long sys_call_table[NR_syscalls];
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+extern unsigned long ia32_sys_call_table[], ia32_syscall_end[];
+#define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table)
+#endif
+
+static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry)
+{
+ unsigned long abs_usecs, rel_usecs;
+ unsigned long nr;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ print_generic(m, entry);
+ print_timestamp_short(m, abs_usecs, rel_usecs);
+
+ seq_puts(m, "> ");
+ nr = entry->u.syscall.nr;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+ if (nr & 0x80000000) {
+ nr &= ~0x80000000;
+ if (nr < IA32_NR_syscalls)
+ print_name(m, ia32_sys_call_table[nr]);
+ else
+ seq_printf(m, "<badsys(%lu)>", nr);
+ } else
+#endif
+ if (nr < NR_syscalls)
+ print_name(m, sys_call_table[nr]);
+ else
+ seq_printf(m, "<badsys(%lu)>", nr);
+
+#ifdef CONFIG_64BIT
+ seq_printf(m, " (%016lx %016lx %016lx)\n",
+ entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+#else
+ seq_printf(m, " (%08lx %08lx %08lx)\n",
+ entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+#endif
+
+ return 0;
+}
+
+static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx,
+ struct trace_entry *entry, struct trace_entry *entry0,
+ struct trace_entry *next_entry)
+{
+ unsigned long abs_usecs, rel_usecs;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+ rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+ print_generic(m, entry);
+ print_timestamp_short(m, abs_usecs, rel_usecs);
+
+ seq_printf(m, "< (%ld)\n", entry->u.sysret.ret);
+
+ return 0;
+}
+
+
+static int notrace l_show(struct seq_file *m, void *p)
+{
+ struct cpu_trace *tr = out_tr.traces;
+ struct trace_entry *entry, *entry0, *next_entry;
+ unsigned long trace_idx;
+
+ cond_resched();
+ entry = p;
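+	/*
+	 * Skip entries that fall outside the common timestamp window
+	 * computed by update_out_trace():
+	 */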
+ if (entry->timestamp < out_tr.first_timestamp)
+ return 0;
+ if (entry->timestamp > out_tr.last_timestamp)
+ return 0;
+
+ entry0 = tr->trace;
+ trace_idx = entry - entry0;
+
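+	/*
+	 * The last entry has no successor; reuse the entry itself so
+	 * its relative delta prints as zero:
+	 */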
+ if (trace_idx + 1 < tr->trace_idx)
+ next_entry = entry + 1;
+ else
+ next_entry = entry;
+
+ if (trace_verbose)
+ seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx);
+
+ switch (entry->type) {
+ case TRACE_FN:
+ l_show_fn(m, trace_idx, entry, entry0, next_entry);
+ break;
+ case TRACE_SPECIAL:
+ l_show_special(m, trace_idx, entry, entry0, next_entry, 0);
+ break;
+ case TRACE_SPECIAL_PID:
+ l_show_special_pid(m, trace_idx, entry, entry0, next_entry);
+ break;
+ case TRACE_SPECIAL_U64:
+ l_show_special(m, trace_idx, entry, entry0, next_entry, 1);
+ break;
+ case TRACE_SPECIAL_SYM:
+ l_show_special_sym(m, trace_idx, entry, entry0,
+ next_entry, 1);
+ break;
+ case TRACE_CMDLINE:
+ l_show_cmdline(m, trace_idx, entry, entry0, next_entry);
+ break;
+ case TRACE_SYSCALL:
+ l_show_syscall(m, trace_idx, entry, entry0, next_entry);
+ break;
+ case TRACE_SYSRET:
+ l_show_sysret(m, trace_idx, entry, entry0, next_entry);
+ break;
+ default:
+ seq_printf(m, "unknown trace type %d\n", entry->type);
+ }
+ return 0;
+}
+
+struct seq_operations latency_trace_op = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show
+};
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /proc/latency_trace)
+ */
+static void update_max_tr(struct cpu_trace *tr)
+{
+ struct cpu_trace *save;
+ int cpu, all_cpus = 0;
+
+#ifdef CONFIG_PREEMPT
+ WARN_ON(!preempt_count() && !irqs_disabled());
+#endif
+
+ max_tr.cpu = tr->cpu;
+ save = max_tr.traces + tr->cpu;
+
+ if ((wakeup_timing || trace_user_triggered || trace_print_on_crash ||
+ print_functions) && trace_all_cpus) {
+ all_cpus = 1;
+ for_each_online_cpu(cpu)
+ atomic_inc(&cpu_traces[cpu].disabled);
+ }
+
+ save->saved_latency = preempt_max_latency;
+ save->preempt_timestamp = tr->preempt_timestamp;
+ save->critical_start = tr->critical_start;
+ save->critical_end = tr->critical_end;
+ save->critical_sequence = tr->critical_sequence;
+
+ memcpy(save->comm, current->comm, CMDLINE_BYTES);
+ save->pid = current->pid;
+ save->uid = current->uid;
+ save->nice = current->static_prio - 20 - MAX_RT_PRIO;
+ save->policy = current->policy;
+ save->rt_priority = current->rt_priority;
+
+ if (all_cpus) {
+ for_each_online_cpu(cpu) {
+ copy_trace(max_tr.traces + cpu, cpu_traces + cpu, 1);
+ atomic_dec(&cpu_traces[cpu].disabled);
+ }
+ } else
+ copy_trace(save, tr, 1);
+}
+
+#else /* !EVENT_TRACE */
+
+static inline void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+ unsigned long eip, unsigned long parent_eip,
+ unsigned long v1, unsigned long v2, unsigned long v3,
+ unsigned long flags)
+{
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+ unsigned long v1, unsigned long v2,
+ unsigned long v3)
+{
+}
+
+static inline void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+}
+
+static inline void update_max_tr(struct cpu_trace *tr)
+{
+}
+
+static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{
+}
+
+#endif
+
+static int setup_preempt_thresh(char *s)
+{
+ int thresh;
+
+ get_option(&s, &thresh);
+ if (thresh > 0) {
+ preempt_thresh = usecs_to_cycles(thresh);
+ printk("Preemption threshold = %u us\n", thresh);
+ }
+ return 1;
+}
+__setup("preempt_thresh=", setup_preempt_thresh);
+
+static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr)
+{
+ if (trace_all_cpus)
+ for_each_online_cpu(cpu) {
+ tr = cpu_traces + cpu;
+ tr->trace_idx = 0;
+ atomic_set(&tr->underrun, 0);
+ atomic_set(&tr->overrun, 0);
+ }
+	else {
+ tr->trace_idx = 0;
+ atomic_set(&tr->underrun, 0);
+ atomic_set(&tr->overrun, 0);
+ }
+}
+
+#ifdef CONFIG_CRITICAL_TIMING
+
+static void notrace
+check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip)
+{
+ unsigned long latency, t0, t1;
+ cycle_t T0, T1, T2, delta;
+ unsigned long flags;
+
+ if (trace_user_triggered)
+ return;
+ /*
+ * usecs conversion is slow so we try to delay the conversion
+ * as long as possible:
+ */
+ T0 = tr->preempt_timestamp;
+ T1 = get_monotonic_cycles();
+ delta = T1-T0;
+
+ local_save_flags(flags);
+
+ if (!report_latency(delta))
+ goto out;
+
+ ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+ /*
+ * Update the timestamp, because the trace entry above
+	 * might change it (it can only get larger, so the reported
+	 * latency is still fair):
+ */
+ T2 = get_monotonic_cycles();
+
+ delta = T2-T0;
+
+ latency = cycles_to_usecs(delta);
+ latency_hist(tr->latency_type, cpu, latency);
+
+ if (latency_hist_flag) {
+ if (preempt_max_latency >= delta)
+ goto out;
+ }
+
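+	/*
+	 * Only save this section if it is still the global maximum and
+	 * the max-trace mutex can be taken without blocking:
+	 */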
+ if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+ goto out;
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+ if (!preempt_thresh && preempt_max_latency > delta) {
+ printk("bug: updating %016Lx > %016Lx?\n",
+ preempt_max_latency, delta);
+ printk(" [%016Lx %016Lx %016Lx]\n", T0, T1, T2);
+ }
+#endif
+
+ preempt_max_latency = delta;
+ t0 = cycles_to_usecs(T0);
+ t1 = cycles_to_usecs(T1);
+
+ tr->critical_end = parent_eip;
+
+ update_max_tr(tr);
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+ if (preempt_thresh)
+ printk("(%16s-%-5d|#%d): %lu us critical section "
+ "violates %lu us threshold.\n"
+ " => started at timestamp %lu: ",
+ current->comm, current->pid,
+ raw_smp_processor_id(),
+ latency, cycles_to_usecs(preempt_thresh), t0);
+ else
+ printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+ "critical section.\n => started at timestamp %lu: ",
+ current->comm, current->pid,
+ raw_smp_processor_id(),
+ latency, t0);
+
+ print_symbol("<%s>\n", tr->critical_start);
+ printk(" => ended at timestamp %lu: ", t1);
+ print_symbol("<%s>\n", tr->critical_end);
+ dump_stack();
+ t1 = cycles_to_usecs(get_monotonic_cycles());
+ printk(" => dump-end timestamp %lu\n\n", t1);
+#endif
+
+ max_sequence++;
+
+ up(&max_mutex);
+
+out:
+ tr->critical_sequence = max_sequence;
+ tr->preempt_timestamp = get_monotonic_cycles();
+ tr->early_warning = 0;
+ reset_trace_idx(cpu, tr);
+ _trace_cmdline(cpu, tr);
+ ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+}
+
+void notrace touch_critical_timing(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct cpu_trace *tr = cpu_traces + cpu;
+
+ if (!tr->critical_start || atomic_read(&tr->disabled) ||
+ trace_user_triggered || wakeup_timing)
+ return;
+
+ if (preempt_count() > 0 && tr->critical_start) {
+ atomic_inc(&tr->disabled);
+ check_critical_timing(cpu, tr, CALLER_ADDR0);
+ tr->critical_start = CALLER_ADDR0;
+ tr->critical_sequence = max_sequence;
+ atomic_dec(&tr->disabled);
+ }
+}
+EXPORT_SYMBOL(touch_critical_timing);
+
+void notrace stop_critical_timing(void)
+{
+ struct cpu_trace *tr = cpu_traces + raw_smp_processor_id();
+
+ tr->critical_start = 0;
+}
+EXPORT_SYMBOL(stop_critical_timing);
+
+static inline void notrace
+__start_critical_timing(unsigned long eip, unsigned long parent_eip,
+ int latency_type)
+{
+ int cpu = raw_smp_processor_id();
+ struct cpu_trace *tr = cpu_traces + cpu;
+ unsigned long flags;
+
+ if (tr->critical_start || atomic_read(&tr->disabled) ||
+ trace_user_triggered || wakeup_timing)
+ return;
+
+ atomic_inc(&tr->disabled);
+
+ tr->critical_sequence = max_sequence;
+ tr->preempt_timestamp = get_monotonic_cycles();
+ tr->critical_start = eip;
+ reset_trace_idx(cpu, tr);
+ tr->latency_type = latency_type;
+ _trace_cmdline(cpu, tr);
+
+ local_save_flags(flags);
+ ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+
+ atomic_dec(&tr->disabled);
+}
+
+static inline void notrace
+__stop_critical_timing(unsigned long eip, unsigned long parent_eip)
+{
+ int cpu = raw_smp_processor_id();
+ struct cpu_trace *tr = cpu_traces + cpu;
+ unsigned long flags;
+
+ if (!tr->critical_start || atomic_read(&tr->disabled) ||
+ trace_user_triggered || wakeup_timing)
+ return;
+
+ atomic_inc(&tr->disabled);
+ local_save_flags(flags);
+ ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+ check_critical_timing(cpu, tr, eip);
+ tr->critical_start = 0;
+ atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+
+#ifdef CONFIG_LOCKDEP
+
+void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+ __stop_critical_timing(a0, a1);
+}
+
+void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+ __start_critical_timing(a0, a1, INTERRUPT_LATENCY);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+/*
+ * Dummy:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void notrace trace_hardirqs_on(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+ __stop_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void notrace trace_hardirqs_off(void)
+{
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+ __start_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */,
+ INTERRUPT_LATENCY);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+#endif /* !CONFIG_LOCKDEP */
+
+#endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */
+
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+
+static inline unsigned long get_parent_eip(void)
+{
+ unsigned long parent_eip = CALLER_ADDR1;
+
+ if (in_lock_functions(parent_eip)) {
+ parent_eip = CALLER_ADDR2;
+ if (in_lock_functions(parent_eip))
+ parent_eip = CALLER_ADDR3;
+ }
+
+ return parent_eip;
+}
+
+void notrace add_preempt_count(unsigned int val)
+{
+ unsigned long eip = CALLER_ADDR0;
+ unsigned long parent_eip = get_parent_eip();
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_WARN_ON(((int)preempt_count() < 0)))
+ return;
+ /*
+ * Spinlock count overflowing soon?
+ */
+ if (DEBUG_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10))
+ return;
+#endif
+
+ preempt_count() += val;
+#ifdef CONFIG_PREEMPT_TRACE
+ if (val <= 10) {
+ unsigned int idx = preempt_count() & PREEMPT_MASK;
+ if (idx < MAX_PREEMPT_TRACE) {
+ current->preempt_trace_eip[idx] = eip;
+ current->preempt_trace_parent_eip[idx] = parent_eip;
+ }
+ }
+#endif
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+ {
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_disabled_flags(flags))
+#endif
+ if (preempt_count() == val)
+ __start_critical_timing(eip, parent_eip,
+ PREEMPT_LATENCY);
+ }
+#endif
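+	/*
+	 * Avoid 'unused variable' warnings when the tracing options
+	 * above are compiled out:
+	 */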
+ (void)eip, (void)parent_eip;
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void notrace sub_preempt_count(unsigned int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_WARN_ON(unlikely(val > preempt_count())))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+ {
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_disabled_flags(flags))
+#endif
+ if (preempt_count() == val)
+ __stop_critical_timing(CALLER_ADDR0,
+ CALLER_ADDR1);
+ }
+#endif
+ preempt_count() -= val;
+}
+
+EXPORT_SYMBOL(sub_preempt_count);
+
+void notrace mask_preempt_count(unsigned int mask)
+{
+ unsigned long eip = CALLER_ADDR0;
+ unsigned long parent_eip = get_parent_eip();
+
+ preempt_count() |= mask;
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+ {
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_disabled_flags(flags))
+#endif
+ if (preempt_count() == mask)
+ __start_critical_timing(eip, parent_eip,
+ PREEMPT_LATENCY);
+ }
+#endif
+ (void) eip, (void) parent_eip;
+}
+EXPORT_SYMBOL(mask_preempt_count);
+
+void notrace unmask_preempt_count(unsigned int mask)
+{
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+ {
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ unsigned long flags;
+
+ local_save_flags(flags);
+
+ if (!irqs_disabled_flags(flags))
+#endif
+ if (preempt_count() == mask)
+ __stop_critical_timing(CALLER_ADDR0,
+ CALLER_ADDR1);
+ }
+#endif
+ preempt_count() &= ~mask;
+}
+EXPORT_SYMBOL(unmask_preempt_count);
+
+
+#endif
+
+/*
+ * Wakeup latency timing/tracing. We get upcalls from the scheduler
+ * when a task is being woken up and we time/trace it until it gets
+ * to a CPU - or until an even-higher-prio task supersedes it. (In that
+ * case we throw away the currently traced task - we don't try to
+ * handle nesting, which simplifies things significantly.)
+ */
+#ifdef CONFIG_WAKEUP_TIMING
+
+static void notrace
+check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip,
+ unsigned long *flags)
+{
+ int cpu = raw_smp_processor_id();
+ unsigned long latency, t0, t1;
+ cycle_t T0, T1, delta;
+
+ if (trace_user_triggered)
+ return;
+
+ atomic_inc(&tr->disabled);
+ if (atomic_read(&tr->disabled) != 1)
+ goto out;
+
+ T0 = tr->preempt_timestamp;
+ T1 = get_monotonic_cycles();
+ /*
+ * Any wraparound or time warp and we are out:
+ */
+ if (T0 > T1)
+ goto out;
+ delta = T1-T0;
+
+ if (!report_latency(delta))
+ goto out;
+
+ ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip,
+ 0, 0, 0, *flags);
+
+ latency = cycles_to_usecs(delta);
+ latency_hist(tr->latency_type, cpu, latency);
+
+ if (latency_hist_flag) {
+ if (preempt_max_latency >= delta)
+ goto out;
+ }
+
+ if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+ goto out;
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+ if (!preempt_thresh && preempt_max_latency > delta) {
+ printk("bug2: updating %016lx > %016Lx?\n",
+ preempt_max_latency, delta);
+ printk(" [%016Lx %016Lx]\n", T0, T1);
+ }
+#endif
+
+ preempt_max_latency = delta;
+ t0 = cycles_to_usecs(T0);
+ t1 = cycles_to_usecs(T1);
+ tr->critical_end = parent_eip;
+
+ update_max_tr(tr);
+
+ atomic_dec(&tr->disabled);
+ __raw_spin_unlock(&sch.trace_lock);
+ local_irq_restore(*flags);
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+ if (preempt_thresh)
+ printk("(%16s-%-5d|#%d): %lu us wakeup latency "
+ "violates %lu us threshold.\n",
+ current->comm, current->pid,
+ raw_smp_processor_id(), latency,
+ cycles_to_usecs(preempt_thresh));
+ else
+ printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+ "wakeup.\n", current->comm, current->pid,
+ raw_smp_processor_id(), latency);
+#endif
+
+ max_sequence++;
+
+ up(&max_mutex);
+
+ return;
+
+out:
+ atomic_dec(&tr->disabled);
+ __raw_spin_unlock(&sch.trace_lock);
+ local_irq_restore(*flags);
+}
+
+/*
+ * Start wakeup latency tracing - called with the runqueue held
+ * and interrupts disabled:
+ */
+void __trace_start_sched_wakeup(struct task_struct *p)
+{
+ struct cpu_trace *tr;
+ int cpu;
+
+ if (trace_user_triggered || !wakeup_timing) {
+ trace_special_pid(p->pid, p->prio, -1);
+ return;
+ }
+
+ __raw_spin_lock(&sch.trace_lock);
+ if (sch.task && (sch.task->prio <= p->prio))
+ goto out_unlock;
+
+ /*
+ * New highest-prio task just woke up - start tracing:
+ */
+ sch.task = p;
+ cpu = task_cpu(p);
+ sch.cpu = cpu;
+ /*
+ * We keep using this CPU's trace buffer even if the task
+ * gets migrated to another CPU. Tracing only happens on
+ * the CPU that 'owns' the highest-prio task so it's
+ * fundamentally single-threaded.
+ */
+ sch.tr = tr = cpu_traces + cpu;
+ reset_trace_idx(cpu, tr);
+
+// if (!atomic_read(&tr->disabled)) {
+ atomic_inc(&tr->disabled);
+ tr->critical_sequence = max_sequence;
+ tr->preempt_timestamp = get_monotonic_cycles();
+ tr->latency_type = WAKEUP_LATENCY;
+ tr->critical_start = CALLER_ADDR0;
+ _trace_cmdline(raw_smp_processor_id(), tr);
+ atomic_dec(&tr->disabled);
+// }
+
+ mcount();
+ trace_special_pid(p->pid, p->prio, cpu);
+ trace_special_sym();
+out_unlock:
+ __raw_spin_unlock(&sch.trace_lock);
+}
+
+void trace_stop_sched_switched(struct task_struct *p)
+{
+ struct cpu_trace *tr;
+ unsigned long flags;
+
+ if (trace_user_triggered || !wakeup_timing)
+ return;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&sch.trace_lock);
+ if (p == sch.task) {
+ trace_special_pid(p->pid, p->prio, task_cpu(p));
+
+ sch.task = NULL;
+ tr = sch.tr;
+ sch.tr = NULL;
+ WARN_ON(!tr);
+ /* auto-unlocks the spinlock: */
+ check_wakeup_timing(tr, CALLER_ADDR0, &flags);
+ } else {
+ if (sch.task)
+ trace_special_pid(sch.task->pid, sch.task->prio,
+ p->prio);
+ if (sch.task && (sch.task->prio >= p->prio))
+ sch.task = NULL;
+ __raw_spin_unlock(&sch.trace_lock);
+ }
+ local_irq_restore(flags);
+}
+
+void trace_change_sched_cpu(struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+
+ if (!wakeup_timing)
+ return;
+
+ trace_special_pid(p->pid, task_cpu(p), new_cpu);
+ trace_special_sym();
+ local_irq_save(flags);
+ __raw_spin_lock(&sch.trace_lock);
+ if (p == sch.task && task_cpu(p) != new_cpu) {
+ sch.cpu = new_cpu;
+ trace_special(task_cpu(p), new_cpu, 0);
+ }
+ __raw_spin_unlock(&sch.trace_lock);
+ local_irq_restore(flags);
+}
+
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+
+long user_trace_start(void)
+{
+ struct cpu_trace *tr;
+ unsigned long flags;
+ int cpu;
+
+ if (!trace_user_triggered || trace_print_on_crash || print_functions)
+ return -EINVAL;
+
+ /*
+ * If the user has not yet reset the max latency after
+	 * bootup, we assume that this was the intention
+	 * (we won't get any tracing done otherwise):
+ */
+ if (preempt_max_latency == (cycle_t)ULONG_MAX)
+ preempt_max_latency = 0;
+
+ /*
+ * user_trace_start() might be called from hardirq
+ * context, if trace_user_triggered_irq is set, so
+ * be careful about locking:
+ */
+ if (preempt_count() || irqs_disabled()) {
+ if (down_trylock(&max_mutex))
+ return -EAGAIN;
+ } else
+ down(&max_mutex);
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ tr = cpu_traces + cpu;
+
+#ifdef CONFIG_WAKEUP_TIMING
+ if (wakeup_timing) {
+ __raw_spin_lock(&sch.trace_lock);
+ sch.task = current;
+ sch.cpu = cpu;
+ sch.tr = tr;
+ __raw_spin_unlock(&sch.trace_lock);
+ }
+#endif
+ reset_trace_idx(cpu, tr);
+
+ tr->critical_sequence = max_sequence;
+ tr->preempt_timestamp = get_monotonic_cycles();
+ tr->critical_start = CALLER_ADDR0;
+ _trace_cmdline(cpu, tr);
+ mcount();
+
+ WARN_ON(!irqs_disabled());
+ local_irq_restore(flags);
+
+ up(&max_mutex);
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(user_trace_start);
+
+long user_trace_stop(void)
+{
+ unsigned long latency = 0, flags;
+ struct cpu_trace *tr;
+ cycle_t delta;
+
+ if (!trace_user_triggered || trace_print_on_crash || print_functions)
+ return -EINVAL;
+
+ local_irq_save(flags);
+ mcount();
+
+#ifdef CONFIG_WAKEUP_TIMING
+ if (wakeup_timing) {
+ struct task_struct *t;
+
+ __raw_spin_lock(&sch.trace_lock);
+#if 0
+ t = sch.task;
+ if (current != t) {
+ __raw_spin_unlock(&sch.trace_lock);
+ local_irq_restore(flags);
+ printk("wrong stop: curr: %s/%d[%d] => %p\n",
+ current->comm, current->pid,
+ task_thread_info(current)->cpu, t);
+ if (t)
+ printk("wrong stop: curr: %s/%d[%d]\n",
+ t->comm, t->pid, task_thread_info(t)->cpu);
+ return -EINVAL;
+ }
+#endif
+ sch.task = NULL;
+ tr = sch.tr;
+ sch.tr = NULL;
+ __raw_spin_unlock(&sch.trace_lock);
+ } else
+#endif
+ tr = cpu_traces + smp_processor_id();
+
+ atomic_inc(&tr->disabled);
+ if (tr->preempt_timestamp) {
+ cycle_t T0, T1;
+ unsigned long long tmp0;
+
+ T0 = tr->preempt_timestamp;
+ T1 = get_monotonic_cycles();
+ tmp0 = preempt_max_latency;
+ if (T1 < T0)
+ T0 = T1;
+ delta = T1 - T0;
+ if (!report_latency(delta))
+ goto out;
+ if (tr->critical_sequence != max_sequence ||
+ down_trylock(&max_mutex))
+ goto out;
+
+ WARN_ON(!preempt_thresh && preempt_max_latency > delta);
+
+ preempt_max_latency = delta;
+ update_max_tr(tr);
+
+ latency = cycles_to_usecs(delta);
+
+ max_sequence++;
+ up(&max_mutex);
+out:
+ tr->preempt_timestamp = 0;
+ }
+ atomic_dec(&tr->disabled);
+ local_irq_restore(flags);
+
+ if (latency) {
+ if (preempt_thresh)
+ printk("(%16s-%-5d|#%d): %lu us user-latency "
+ "violates %lu us threshold.\n",
+ current->comm, current->pid,
+ raw_smp_processor_id(), latency,
+ cycles_to_usecs(preempt_thresh));
+ else
+ printk("(%16s-%-5d|#%d): new %lu us user-latency.\n",
+ current->comm, current->pid,
+ raw_smp_processor_id(), latency);
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL(user_trace_stop);
+
+static int trace_print_cpu = -1;
+
+void notrace stop_trace(void)
+{
+ if (trace_print_on_crash && trace_print_cpu == -1) {
+ trace_enabled = -1;
+ trace_print_cpu = raw_smp_processor_id();
+ }
+}
+
+EXPORT_SYMBOL(stop_trace);
+
+static void print_entry(struct trace_entry *entry, struct trace_entry *entry0)
+{
+ unsigned long abs_usecs;
+ int hardirq, softirq;
+
+ abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+
+ printk("%-5d ", entry->pid);
+
+ printk("%d%c%c",
+ entry->cpu,
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ if (hardirq && softirq)
+ printk("H");
+ else {
+ if (hardirq)
+ printk("h");
+ else {
+ if (softirq)
+ printk("s");
+ else
+ printk(".");
+ }
+ }
+
+ if (entry->preempt_count)
+ printk(":%x ", entry->preempt_count);
+ else
+ printk(":. ");
+
+ printk("%ld.%03ldms: ", abs_usecs/1000, abs_usecs % 1000);
+
+ switch (entry->type) {
+ case TRACE_FN:
+ printk_name(entry->u.fn.eip);
+ printk(" <= (");
+ printk_name(entry->u.fn.parent_eip);
+ printk(")\n");
+ break;
+ case TRACE_SPECIAL:
+ printk(" special: %lx %lx %lx\n",
+ entry->u.special.v1, entry->u.special.v2,
+ entry->u.special.v3);
+ break;
+ case TRACE_SPECIAL_U64:
+ printk(" spec64: %lx%08lx %lx\n",
+ entry->u.special.v1, entry->u.special.v2,
+ entry->u.special.v3);
+ break;
+ }
+}
+
+/*
+ * Print the current trace at crash time.
+ *
+ * We print it backwards, so that the newest (most interesting) entries
+ * are printed first.
+ */
+void print_last_trace(void)
+{
+ unsigned int idx0, idx, i, cpu;
+ struct cpu_trace *tr;
+ struct trace_entry *entry0, *entry;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ if (trace_enabled != -1 || trace_print_cpu != cpu ||
+ !trace_print_on_crash) {
+ if (trace_print_on_crash)
+ printk("skipping trace printing on CPU#%d != %d\n",
+ cpu, trace_print_cpu);
+ preempt_enable();
+ return;
+ }
+
+ trace_print_on_crash = 0;
+
+ tr = cpu_traces + cpu;
+ if (!tr->trace)
+ goto out;
+
+ printk("Last %ld trace entries:\n", MAX_TRACE);
+ idx0 = tr->trace_idx;
+ printk("curr idx: %d\n", idx0);
+ if (idx0 >= MAX_TRACE)
+ idx0 = 0;
+ idx = idx0;
+ entry0 = tr->trace + idx0;
+
+ for (i = 0; i < MAX_TRACE; i++) {
+ if (idx == 0)
+ idx = MAX_TRACE-1;
+ else
+ idx--;
+ entry = tr->trace + idx;
+ switch (entry->type) {
+ case TRACE_FN:
+ case TRACE_SPECIAL:
+ case TRACE_SPECIAL_U64:
+ print_entry(entry, entry0);
+ break;
+ }
+ }
+ printk("printed %ld entries\n", MAX_TRACE);
+out:
+ preempt_enable();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * On SMP, try to 'peek' at other CPUs' traces and record them
+ * in this CPU's trace. This way we get a rough idea about what's
+ * going on there, without the overhead of global tracing.
+ *
+ * (no need to make this PER_CPU, we bounce it around anyway.)
+ */
+unsigned long nmi_eips[NR_CPUS];
+unsigned long nmi_flags[NR_CPUS];
+
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+ unsigned long flags)
+{
+ int cpu, this_cpu = smp_processor_id();
+
+ __trace(eip, parent_eip);
+
+ nmi_eips[this_cpu] = parent_eip;
+ nmi_flags[this_cpu] = flags;
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_online(cpu) && cpu != this_cpu) {
+ __trace(eip, nmi_eips[cpu]);
+ __trace(eip, nmi_flags[cpu]);
+ }
+}
+#else
+/*
+ * On UP, NMI tracing is quite simple:
+ */
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+ unsigned long flags)
+{
+ __trace(eip, parent_eip);
+}
+#endif
+
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACE
+
+static void print_preempt_trace(struct task_struct *task)
+{
+ unsigned int count = task_thread_info(task)->preempt_count;
+ unsigned int i, lim = count & PREEMPT_MASK;
+ if (lim >= MAX_PREEMPT_TRACE)
+ lim = MAX_PREEMPT_TRACE-1;
+ printk("---------------------------\n");
+ printk("| preempt count: %08x ]\n", count);
+ printk("| %d-level deep critical section nesting:\n", lim);
+ printk("----------------------------------------\n");
+ for (i = 1; i <= lim; i++) {
+ printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]);
+ print_symbol("%s\n", task->preempt_trace_eip[i]);
+ printk(".....[<%08lx>] .. ( <= ",
+ task->preempt_trace_parent_eip[i]);
+ print_symbol("%s)\n", task->preempt_trace_parent_eip[i]);
+ }
+ printk("\n");
+}
+
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE)
+void print_traces(struct task_struct *task)
+{
+ if (!task)
+ task = current;
+
+#ifdef CONFIG_PREEMPT_TRACE
+ print_preempt_trace(task);
+#endif
+#ifdef CONFIG_EVENT_TRACE
+ print_last_trace();
+#endif
+}
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+/*
+ * Allocate all the per-CPU trace buffers and the
+ * save-maximum/save-output staging buffers:
+ */
+void __init init_tracer(void)
+{
+ unsigned long size, total_size = 0;
+ struct trace_entry *array;
+ struct cpu_trace *tr;
+ int cpu;
+
+ printk("num_possible_cpus(): %d\n", num_possible_cpus());
+
+ size = sizeof(struct trace_entry)*MAX_TRACE;
+
+ for_each_possible_cpu(cpu) {
+ tr = cpu_traces + cpu;
+ array = alloc_bootmem(size);
+ if (!array) {
+ printk(KERN_ERR
+ "CPU#%d: failed to allocate %ld bytes trace buffer!\n",
+ cpu, size);
+ } else {
+ printk(KERN_INFO
+ "CPU#%d: allocated %ld bytes trace buffer.\n",
+ cpu, size);
+ total_size += size;
+ }
+ tr->cpu = cpu;
+ tr->trace = array;
+
+ array = alloc_bootmem(size);
+ if (!array) {
+ printk(KERN_ERR
+ "CPU#%d: failed to allocate %ld bytes max-trace buffer!\n",
+ cpu, size);
+ } else {
+ printk(KERN_INFO
+ "CPU#%d: allocated %ld bytes max-trace buffer.\n",
+ cpu, size);
+ total_size += size;
+ }
+ max_tr.traces[cpu].trace = array;
+ }
+
+ /*
+ * The output trace buffer is a special one that only has
+ * trace entries for the first cpu-trace structure:
+ */
+ size = sizeof(struct trace_entry)*MAX_TRACE*num_possible_cpus();
+ array = alloc_bootmem(size);
+ if (!array) {
+ printk(KERN_ERR
+ "failed to allocate %ld bytes out-trace buffer!\n",
+ size);
+ } else {
+ printk(KERN_INFO "allocated %ld bytes out-trace buffer.\n",
+ size);
+ total_size += size;
+ }
+ out_tr.traces[0].trace = array;
+ printk(KERN_INFO
+ "tracer: a total of %ld bytes allocated.\n",
+ total_size);
+}
+#endif
Index: linux-2.6.22-rc2/kernel/lockdep.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/lockdep.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/lockdep.c 2007-05-24 15:57:42.000000000 +0200
@@ -166,14 +166,14 @@ static struct list_head chainhash_table[
((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
(key2))

-void lockdep_off(void)
+void notrace lockdep_off(void)
{
current->lockdep_recursion++;
}

EXPORT_SYMBOL(lockdep_off);

-void lockdep_on(void)
+void notrace lockdep_on(void)
{
current->lockdep_recursion--;
}
@@ -706,7 +706,7 @@ find_usage_forwards(struct lock_class *s
* Return 1 otherwise and keep <backwards_match> unchanged.
* Return 0 on error.
*/
-static noinline int
+static noinline notrace int
find_usage_backwards(struct lock_class *source, unsigned int depth)
{
struct lock_list *entry;
@@ -1386,7 +1386,7 @@ cache_hit:
* We are building curr_chain_key incrementally, so double-check
* it from scratch, to make sure that it's done correctly:
*/
-static void check_chain_key(struct task_struct *curr)
+static void notrace check_chain_key(struct task_struct *curr)
{
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
@@ -1573,8 +1573,8 @@ valid_state(struct task_struct *curr, st
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
+static int notrace mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
{
unsigned int new_mask = 1 << new_bit, ret = 1;

@@ -1781,6 +1781,7 @@ static int mark_lock(struct task_struct
* We must printk outside of the graph_lock:
*/
if (ret == 2) {
+ user_trace_stop();
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
print_lock(this);
print_irqtrace_events(curr);
@@ -1794,7 +1795,7 @@ static int mark_lock(struct task_struct
/*
* Mark all held locks with a usage bit:
*/
-static int
+static int notrace
mark_held_locks(struct task_struct *curr, int hardirq)
{
enum lock_usage_bit usage_bit;
@@ -1841,7 +1842,7 @@ void early_boot_irqs_on(void)
/*
* Hardirqs will be enabled:
*/
-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on(void)
{
struct task_struct *curr = current;
unsigned long ip;
@@ -1882,6 +1883,9 @@ void trace_hardirqs_on(void)
curr->hardirq_enable_ip = ip;
curr->hardirq_enable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_on_events);
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ time_hardirqs_on(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+#endif
}

EXPORT_SYMBOL(trace_hardirqs_on);
@@ -1889,7 +1893,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off(void)
{
struct task_struct *curr = current;

@@ -1907,6 +1911,9 @@ void trace_hardirqs_off(void)
curr->hardirq_disable_ip = _RET_IP_;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_off_events);
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+ time_hardirqs_off(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+#endif
} else
debug_atomic_inc(&redundant_hardirqs_off);
}
@@ -2404,7 +2411,7 @@ __lock_release(struct lockdep_map *lock,
/*
* Check whether we follow the irq-flags state precisely:
*/
-static void check_flags(unsigned long flags)
+static notrace void check_flags(unsigned long flags)
{
#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
if (!debug_locks)
@@ -2436,8 +2443,9 @@ static void check_flags(unsigned long fl
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
-void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
- int trylock, int read, int check, unsigned long ip)
+void notrace
+lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, unsigned long ip)
{
unsigned long flags;

@@ -2445,9 +2453,9 @@ void lock_acquire(struct lockdep_map *lo
return;

raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
check_flags(flags);

- current->lockdep_recursion = 1;
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), ip);
current->lockdep_recursion = 0;
@@ -2456,7 +2464,8 @@ void lock_acquire(struct lockdep_map *lo

EXPORT_SYMBOL_GPL(lock_acquire);

-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void notrace
+lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
unsigned long flags;

@@ -2464,8 +2473,8 @@ void lock_release(struct lockdep_map *lo
return;

raw_local_irq_save(flags);
- check_flags(flags);
current->lockdep_recursion = 1;
+ check_flags(flags);
__lock_release(lock, nested, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
Index: linux-2.6.22-rc2/kernel/panic.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/panic.c 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/kernel/panic.c 2007-05-24 15:57:42.000000000 +0200
@@ -66,6 +66,8 @@ NORET_TYPE void panic(const char * fmt,
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif

+ stop_trace();
+
/*
* It's possible to come here directly from a panic-assertion and not
* have preempt disabled. Some functions called from here want
Index: linux-2.6.22-rc2/kernel/printk.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/printk.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/printk.c 2007-05-24 15:57:42.000000000 +0200
@@ -324,12 +324,29 @@ static void __call_console_drivers(unsig
{
struct console *con;

+ touch_critical_timing();
for (con = console_drivers; con; con = con->next) {
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
- (con->flags & CON_ANYTIME)))
+ (con->flags & CON_ANYTIME))) {
+ /*
+ * Disable tracing of printk details - it just
+ * clobbers the trace output with lots of
+ * repetitive lines (especially if console is
+ * on a serial line):
+ */
+#ifdef CONFIG_EVENT_TRACE
+ int trace_save = trace_enabled;
+
+ trace_enabled = 0;
+ con->write(con, &LOG_BUF(start), end - start);
+ trace_enabled = trace_save;
+#else
con->write(con, &LOG_BUF(start), end - start);
+#endif
+ }
}
+ touch_critical_timing();
}

static int __read_mostly ignore_loglevel;
Index: linux-2.6.22-rc2/kernel/sched.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sched.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sched.c 2007-05-24 15:57:42.000000000 +0200
@@ -76,6 +76,10 @@ unsigned long long __attribute__((weak))
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

+#define __PRIO(prio) \
+ ((prio) <= 99 ? 199 - (prio) : (prio) - 120)
+
+#define PRIO(p) __PRIO((p)->prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
@@ -1043,6 +1047,12 @@ static void deactivate_task(struct task_
p->array = NULL;
}

+static inline void trace_start_sched_wakeup(struct task_struct *p, struct rq *rq)
+{
+ if (p != rq->curr)
+ __trace_start_sched_wakeup(p);
+}
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -1060,6 +1070,8 @@ static void resched_task(struct task_str
{
int cpu;

+ trace_start_sched_wakeup(p, task_rq(p));
+
assert_spin_locked(&task_rq(p)->lock);

if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
@@ -1090,6 +1102,8 @@ static void resched_cpu(int cpu)
#else
static inline void resched_task(struct task_struct *p)
{
+ trace_start_sched_wakeup(p, task_rq(p));
+
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
@@ -1615,14 +1629,19 @@ out:

int fastcall wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+ int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+ mcount();
+ return ret;
}
EXPORT_SYMBOL(wake_up_process);

int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ int ret = try_to_wake_up(p, state, 0);
+
+ mcount();
+ return ret;
}

static void task_running_tick(struct rq *rq, struct task_struct *p);
@@ -1860,6 +1879,7 @@ static inline void finish_task_switch(st
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
+ trace_stop_sched_switched(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
@@ -1930,6 +1950,8 @@ context_switch(struct rq *rq, struct tas
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

+ trace_cmdline();
+
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);

@@ -3461,41 +3483,39 @@ void scheduler_tick(void)
#endif
}

-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_EVENT_TRACE) && defined(CONFIG_DEBUG_RT_MUTEXES)

-void fastcall add_preempt_count(int val)
+static void trace_array(struct prio_array *array)
{
- /*
- * Underflow?
- */
- if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
- return;
- preempt_count() += val;
- /*
- * Spinlock count overflowing soon?
- */
- DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
- PREEMPT_MASK - 10);
+ int i;
+ struct task_struct *p;
+ struct list_head *head, *tmp;
+
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ head = array->queue + i;
+ if (list_empty(head)) {
+ WARN_ON(test_bit(i, array->bitmap));
+ continue;
+ }
+ WARN_ON(!test_bit(i, array->bitmap));
+ list_for_each(tmp, head) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ trace_special_pid(p->pid, p->prio, PRIO(p));
+ }
+ }
}
-EXPORT_SYMBOL(add_preempt_count);

-void fastcall sub_preempt_count(int val)
+static inline void trace_all_runnable_tasks(struct rq *rq)
{
- /*
- * Underflow?
- */
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
- return;
- /*
- * Is the spinlock portion underflowing?
- */
- if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
- !(preempt_count() & PREEMPT_MASK)))
- return;
+ if (trace_enabled)
+ trace_array(rq->active);
+}
+
+#else

- preempt_count() -= val;
+static inline void trace_all_runnable_tasks(struct rq *rq)
+{
}
-EXPORT_SYMBOL(sub_preempt_count);

#endif

@@ -3641,6 +3661,8 @@ switch_tasks:
prev->sleep_avg = 0;
prev->timestamp = prev->last_ran = now;

+ trace_all_runnable_tasks(rq);
+
sched_info_switch(prev, next);
if (likely(prev != next)) {
next->timestamp = next->last_ran = now;
@@ -3651,14 +3673,17 @@ switch_tasks:
prepare_task_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
+ trace_special_pid(prev->pid, PRIO(prev), PRIO(current));
/*
* this_rq must be evaluated again because prev may have moved
* CPUs since it called schedule(), thus the 'rq' on its stack
* frame will be invalid.
*/
finish_task_switch(this_rq(), prev);
- } else
+ } else {
spin_unlock_irq(&rq->lock);
+ trace_stop_sched_switched(next);
+ }

prev = current;
if (unlikely(reacquire_kernel_lock(prev) < 0))
@@ -4108,6 +4133,7 @@ void rt_mutex_setprio(struct task_struct
} else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
+
task_rq_unlock(rq, &flags);
}

@@ -7055,6 +7081,7 @@ void __might_sleep(char *file, int line)
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
+ stop_trace();
printk(KERN_ERR "BUG: sleeping function called from invalid"
" context at %s:%d\n", file, line);
printk("in_atomic():%d, irqs_disabled():%d\n",
Index: linux-2.6.22-rc2/kernel/softlockup.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/softlockup.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/softlockup.c 2007-05-24 15:57:42.000000000 +0200
@@ -100,6 +100,7 @@ void softlockup_tick(void)
if (now > (touch_timestamp + 10)) {
per_cpu(print_timestamp, this_cpu) = touch_timestamp;

+ stop_trace();
spin_lock(&print_lock);
printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
this_cpu);
Index: linux-2.6.22-rc2/kernel/sysctl.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sysctl.c 2007-05-22 16:59:43.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sysctl.c 2007-05-24 15:57:42.000000000 +0200
@@ -29,6 +29,7 @@
#include <linux/utsname.h>
#include <linux/capability.h>
#include <linux/smp_lock.h>
+#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
@@ -43,6 +44,7 @@
#include <linux/limits.h>
#include <linux/dcache.h>
#include <linux/syscalls.h>
+#include <linux/profile.h>
#include <linux/nfs_fs.h>
#include <linux/acpi.h>

@@ -215,6 +217,132 @@ static ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifdef CONFIG_WAKEUP_TIMING
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "wakeup_timing",
+ .data = &wakeup_timing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_WAKEUP_TIMING) || defined(CONFIG_EVENT_TRACE)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "preempt_max_latency",
+ .data = &preempt_max_latency,
+ .maxlen = sizeof(preempt_max_latency),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "preempt_thresh",
+ .data = &preempt_thresh,
+ .maxlen = sizeof(preempt_thresh),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif
+#ifdef CONFIG_EVENT_TRACE
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_enabled",
+ .data = &trace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "syscall_tracing",
+ .data = &syscall_tracing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stackframe_tracing",
+ .data = &stackframe_tracing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "mcount_enabled",
+ .data = &mcount_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_user_triggered",
+ .data = &trace_user_triggered,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_user_trigger_irq",
+ .data = &trace_user_trigger_irq,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_freerunning",
+ .data = &trace_freerunning,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_print_on_crash",
+ .data = &trace_print_on_crash,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_verbose",
+ .data = &trace_verbose,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_all_cpus",
+ .data = &trace_all_cpus,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_use_raw_cycles",
+ .data = &trace_use_raw_cycles,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "trace_all_runnable",
+ .data = &trace_all_runnable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{
.ctl_name = KERN_CORE_USES_PID,
.procname = "core_uses_pid",
Index: linux-2.6.22-rc2/lib/Kconfig.debug
===================================================================
--- linux-2.6.22-rc2.orig/lib/Kconfig.debug 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/lib/Kconfig.debug 2007-05-24 15:57:43.000000000 +0200
@@ -296,6 +296,192 @@ config STACKTRACE
depends on DEBUG_KERNEL
depends on STACKTRACE_SUPPORT

+config PREEMPT_TRACE
+ bool
+ default y
+ depends on DEBUG_PREEMPT
+
+config EVENT_TRACE
+ bool "Kernel event tracing"
+ default n
+ depends on GENERIC_TIME
+ select FRAME_POINTER
+ select STACKTRACE
+ help
+ This option enables a kernel tracing mechanism that will track
+ certain kernel events such as system call entry and return,
+ IRQ entry, context-switching, etc.
+
+ Run the scripts/trace-it utility on a kernel with this option
+ enabled for sample output.
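+
+	  A minimal session might look like this (a sketch, using the
+	  sysctl and the trace-it helper added by this patch):
+
+	  echo 1 > /proc/sys/kernel/trace_enabled
+	  ./scripts/trace-it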
+
+config FUNCTION_TRACE
+ bool "Kernel function call tracing"
+ default n
+ depends on !REORDER
+ select EVENT_TRACE
+ help
+ This option enables a kernel tracing mechanism that will track
+	  kernel execution at precise function-call granularity. Sample
+ output:
+
+ pcscd-1772 0D..2 6867us : deactivate_task <pcscd-1772> (-2 1)
+ pcscd-1772 0D..2 6867us : dequeue_task (deactivate_task)
+ <idle>-0 0D..2 6870us : __switch_to (__schedule)
+ <idle>-0 0D..2 6871us : __schedule <pcscd-1772> (-2 20)
+ <idle>-0 0D..2 6871us : __lock_acquire (lock_acquire)
+ <idle>-0 0D..2 6872us : __spin_unlock_irq (__schedule)
+
+ Run the scripts/trace-it sample utility on a kernel with this
+ option enabled to capture 1 second worth of events.
+
+	  (Note that kernel size and overhead increase noticeably
+ with this option enabled.)
+
+config WAKEUP_TIMING
+ bool "Wakeup latency timing"
+ depends on GENERIC_TIME
+ help
+	  This option measures the time from a high-priority thread being
+	  woken up until it gets scheduled on a CPU, with microsecond
+ accuracy.
+
+ The default measurement method is a maximum search, which is
+	  disabled by default and can be (re)started at runtime via:
+
+ echo 0 > /proc/sys/kernel/preempt_max_latency
+
+config LATENCY_TRACE
+ bool "Latency tracing"
+ default n
+ depends on LATENCY_TIMING && !REORDER && GENERIC_TIME
+ select FRAME_POINTER
+ select FUNCTION_TRACE
+ help
+	  When this option is enabled, the full trace of the last
+	  maximum-latency timing event can be found in /proc/latency_trace,
+	  in a human-readable (or rather, as some would say,
+	  kernel-developer-readable) form.
+
+	  (Note that kernel size and overhead increase noticeably
+ with this option enabled.)
+
+config CRITICAL_PREEMPT_TIMING
+ bool "Non-preemptible critical section latency timing"
+ default n
+ depends on PREEMPT
+ depends on GENERIC_TIME
+ help
+ This option measures the time spent in preempt-off critical
+ sections, with microsecond accuracy.
+
+ The default measurement method is a maximum search, which is
+	  disabled by default and can be (re)started at runtime via:
+
+ echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+ enabled. This option and the irqs-off timing option can be
+ used together or separately.)
+
+config CRITICAL_IRQSOFF_TIMING
+ bool "Interrupts-off critical section latency timing"
+ default n
+ depends on GENERIC_TIME
+ select TRACE_IRQFLAGS
+ help
+ This option measures the time spent in irqs-off critical
+ sections, with microsecond accuracy.
+
+ The default measurement method is a maximum search, which is
+	  disabled by default and can be (re)started at runtime via:
+
+ echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+ enabled. This option and the preempt-off timing option can be
+ used together or separately.)
+
+config WAKEUP_LATENCY_HIST
+ bool "wakeup latency histogram"
+ default n
+ depends on WAKEUP_TIMING
+ help
+	  This option logs all wakeup latency timings into a histogram.
+	  It also suppresses the printk output that wakeup latency
+	  timing would otherwise produce.
+
+ The wakeup latency timing histogram can be viewed via:
+
+ cat /proc/latency_hist/wakeup_latency/CPU*
+
+ (Note: * stands for the CPU ID.)
+
+config PREEMPT_OFF_HIST
+ bool "non-preemptible critical section latency histogram"
+ default n
+ depends on CRITICAL_PREEMPT_TIMING
+ help
+ This option logs all non-preemptible critical section latency
+ timings into a histogram; at the same time it suppresses the
+ printk output that non-preemptible critical section latency
+ timing would otherwise produce.
+
+ The non-preemptible critical section latency timing histogram can
+ be viewed via:
+
+ cat /proc/latency_hist/preempt_off_latency/CPU*
+
+ (Note: * stands for the CPU ID.)
+
+config INTERRUPT_OFF_HIST
+ bool "interrupts-off critical section latency histogram"
+ default n
+ depends on CRITICAL_IRQSOFF_TIMING
+ help
+ This option logs all interrupts-off critical section latency
+ timings into a histogram; at the same time it suppresses the
+ printk output that interrupts-off critical section latency
+ timing would otherwise produce.
+
+ The interrupts-off critical section latency timing histogram can
+ be viewed via:
+
+ cat /proc/latency_hist/interrupt_off_latency/CPU*
+
+ (Note: * stands for the CPU ID.)
+
+config CRITICAL_TIMING
+ bool
+ default y
+ depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING
+
+config DEBUG_TRACE_IRQFLAGS
+ bool
+ default y
+ depends on CRITICAL_IRQSOFF_TIMING
+
+config LATENCY_TIMING
+ bool
+ default y
+ depends on WAKEUP_TIMING || CRITICAL_TIMING
+ select SYSCTL
+
+config CRITICAL_LATENCY_HIST
+ bool
+ default y
+ depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST
+
+config LATENCY_HIST
+ bool
+ default y
+ depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST
+
+config MCOUNT
+ bool
+ depends on FUNCTION_TRACE
+ default y
+
config DEBUG_KOBJECT
bool "kobject debugging"
depends on DEBUG_KERNEL
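
(Not part of the patch, just a minimal userspace sketch of the sysctl
interface described in the help texts above: it restarts the maximum
search by writing 0 to /proc/sys/kernel/preempt_max_latency, then reads
back the recorded worst case and, if LATENCY_TRACE is enabled, the
corresponding trace. The /proc paths come from the help texts; everything
else is illustrative.)

/* reset-max-latency.c - illustrative only, not part of tracer.diff */
#include <stdio.h>
#include <unistd.h>

static void dump_file(const char *path)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
}

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/preempt_max_latency", "w");

	if (!f) {
		perror("preempt_max_latency (latency timing not configured?)");
		return 1;
	}
	fprintf(f, "0\n");	/* restart the maximum search */
	fclose(f);

	sleep(10);		/* let some latencies be recorded */

	dump_file("/proc/sys/kernel/preempt_max_latency");
	dump_file("/proc/latency_trace");
	return 0;
}
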
Index: linux-2.6.22-rc2/lib/debug_locks.c
===================================================================
--- linux-2.6.22-rc2.orig/lib/debug_locks.c 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/lib/debug_locks.c 2007-05-24 15:57:43.000000000 +0200
@@ -36,7 +36,16 @@ int debug_locks_silent;
int debug_locks_off(void)
{
if (xchg(&debug_locks, 0)) {
+#if 0
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ if (spin_is_locked(&current->pi_lock))
+ spin_unlock(&current->pi_lock);
+#endif
+#endif
if (!debug_locks_silent) {
+ stop_trace();
+ user_trace_stop();
+ printk("stopped custom tracer.\n");
console_verbose();
return 1;
}
Index: linux-2.6.22-rc2/scripts/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/scripts/Makefile 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/scripts/Makefile 2007-05-24 15:57:43.000000000 +0200
@@ -7,6 +7,7 @@
# conmakehash: Create chartable
# conmakehash: Create arrays for initializing the kernel console tables

+hostprogs-$(CONFIG_EVENT_TRACE) += trace-it
hostprogs-$(CONFIG_KALLSYMS) += kallsyms
hostprogs-$(CONFIG_LOGO) += pnmtologo
hostprogs-$(CONFIG_VT) += conmakehash
Index: linux-2.6.22-rc2/scripts/trace-it.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/scripts/trace-it.c 2007-05-24 15:57:43.000000000 +0200
@@ -0,0 +1,79 @@
+
+/*
+ * Copyright (C) 2005, Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ * user-triggered tracing.
+ *
+ * The -rt kernel has a built-in kernel tracer, which will trace
+ * all kernel function calls (and a couple of special events as well),
+ * by using a build-time gcc feature that instruments all kernel
+ * functions.
+ *
+ * The tracer is highly automated for a number of latency tracing purposes,
+ * but it can also be switched into 'user-triggered' mode, which is a
+ * half-automatic tracing mode where userspace apps start and stop the
+ * tracer. This file shows a dumb example of how to turn user-triggered
+ * tracing on, and how to start/stop tracing. Note that if you do
+ * multiple start/stop sequences, the kernel will do a maximum search
+ * over their latencies, and will keep the trace of the largest latency
+ * in /proc/latency_trace. The maximums are also reported to the kernel
+ * log (and can also be read from /proc/sys/kernel/preempt_max_latency).
+ *
+ * For the tracer to be activated, turn on CONFIG_EVENT_TRACE
+ * in the .config, rebuild the kernel and boot into it. The trace will
+ * get _a lot_ more verbose if you also turn on CONFIG_FUNCTION_TRACE:
+ * every kernel function call will be put into the trace. Note that
+ * CONFIG_FUNCTION_TRACE has significant runtime overhead, so you don't
+ * want to use it for performance testing :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <linux/unistd.h>
+
+int main (int argc, char **argv)
+{
+ int ret;
+
+ if (getuid() != 0) {
+ fprintf(stderr, "needs to run as root.\n");
+ exit(1);
+ }
+ ret = system("cat /proc/sys/kernel/mcount_enabled >/dev/null 2>/dev/null");
+ if (ret) {
+ fprintf(stderr, "CONFIG_LATENCY_TRACING not enabled?\n");
+ exit(1);
+ }
+ system("echo 1 > /proc/sys/kernel/trace_user_triggered");
+ system("[ -e /proc/sys/kernel/wakeup_timing ] && echo 0 > /proc/sys/kernel/wakeup_timing");
+ system("echo 1 > /proc/sys/kernel/trace_enabled");
+ system("echo 1 > /proc/sys/kernel/mcount_enabled");
+ system("echo 0 > /proc/sys/kernel/trace_freerunning");
+ system("echo 0 > /proc/sys/kernel/trace_print_on_crash");
+ system("echo 0 > /proc/sys/kernel/trace_verbose");
+ system("echo 0 > /proc/sys/kernel/preempt_thresh 2>/dev/null");
+ system("echo 0 > /proc/sys/kernel/preempt_max_latency 2>/dev/null");
+
+ // start tracing, let it run for 10 seconds, then stop it again
+ if (prctl(0, 1)) {
+ fprintf(stderr, "trace-it: couldn't start tracing!\n");
+ return 1;
+ }
+ usleep(10000000);
+ if (prctl(0, 0)) {
+ fprintf(stderr, "trace-it: couldn't stop tracing!\n");
+ return 1;
+ }
+
+ system("echo 0 > /proc/sys/kernel/trace_user_triggered");
+ system("echo 0 > /proc/sys/kernel/trace_enabled");
+ system("cat /proc/latency_trace");
+
+ return 0;
+}
+
+
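
(Again not from the patch: a variant of trace-it.c that brackets an
arbitrary command with the start/stop prctl instead of sleeping for a
fixed interval. It assumes the sysctl setup that trace-it.c performs has
already been done. prctl option 0 is the value trace-it.c uses; the
include/linux/prctl.h hunk further down in this patch names it
PR_SET_TRACING.)

/* trace-cmd-sketch.c - illustrative only, not part of tracer.diff */
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/prctl.h>

#ifndef PR_SET_TRACING
#define PR_SET_TRACING 0	/* from the include/linux/prctl.h hunk below */
#endif

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc < 2) {
		fprintf(stderr, "usage: %s command [args...]\n", argv[0]);
		return 1;
	}
	if (prctl(PR_SET_TRACING, 1)) {
		fprintf(stderr, "couldn't start tracing (tracer not configured?)\n");
		return 1;
	}
	pid = fork();
	if (pid == 0) {
		execvp(argv[1], argv + 1);
		perror("execvp");
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	if (prctl(PR_SET_TRACING, 0))
		fprintf(stderr, "couldn't stop tracing\n");

	/* the resulting trace can now be read from /proc/latency_trace */
	return 0;
}
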
Index: linux-2.6.22-rc2/arch/i386/boot/compressed/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/boot/compressed/Makefile 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/boot/compressed/Makefile 2007-05-24 15:57:42.000000000 +0200
@@ -9,6 +9,7 @@ targets := vmlinux vmlinux.bin vmlinux.
EXTRA_AFLAGS := -traditional

LDFLAGS_vmlinux := -T
+CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing
CFLAGS_misc.o += -fPIC
hostprogs-y := relocs

Index: linux-2.6.22-rc2/arch/i386/kernel/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/Makefile 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/Makefile 2007-05-24 15:57:42.000000000 +0200
@@ -21,6 +21,7 @@ obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o
obj-$(CONFIG_SMP) += smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
+obj-$(CONFIG_MCOUNT) += mcount-wrapper.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
Index: linux-2.6.22-rc2/arch/i386/kernel/apic.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/apic.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/apic.c 2007-05-24 15:57:43.000000000 +0200
@@ -577,6 +577,8 @@ void fastcall smp_apic_timer_interrupt(s
{
struct pt_regs *old_regs = set_irq_regs(regs);

+ trace_special(regs->eip, 1, 0);
+
/*
* NOTE! We'd better ACK the irq immediately,
* because timer handling can be slow.
Index: linux-2.6.22-rc2/arch/i386/kernel/entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/entry.S 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/entry.S 2007-05-24 15:57:42.000000000 +0200
@@ -329,6 +329,11 @@ sysenter_past_esp:
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+ pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+ call sys_call
+ popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
GET_THREAD_INFO(%ebp)

/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -343,6 +348,11 @@ sysenter_past_esp:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
jne syscall_exit_work
+#ifdef CONFIG_EVENT_TRACE
+ pushl %eax
+ call sys_ret
+ popl %eax
+#endif
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
@@ -366,6 +376,11 @@ ENTRY(system_call)
pushl %eax # save orig_eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+ pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+ call sys_call
+ popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
GET_THREAD_INFO(%ebp)
testl $TF_MASK,PT_EFLAGS(%esp)
jz no_singlestep
Index: linux-2.6.22-rc2/arch/i386/kernel/hpet.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/hpet.c 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/hpet.c 2007-05-24 15:57:42.000000000 +0200
@@ -205,7 +205,7 @@ static int hpet_next_event(unsigned long
/*
* Clock source related code
*/
-static cycle_t read_hpet(void)
+static cycle_t notrace read_hpet(void)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
Index: linux-2.6.22-rc2/arch/i386/kernel/irq.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/irq.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/irq.c 2007-05-24 15:57:42.000000000 +0200
@@ -68,7 +68,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+fastcall notrace unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs;
/* high bit used in ret_from_ code */
@@ -87,6 +87,11 @@ fastcall unsigned int do_IRQ(struct pt_r

old_regs = set_irq_regs(regs);
irq_enter();
+#ifdef CONFIG_EVENT_TRACE
+ if (irq == trace_user_trigger_irq)
+ user_trace_start();
+#endif
+ trace_special(regs->eip, irq, 0);
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
{
Index: linux-2.6.22-rc2/arch/i386/kernel/mcount-wrapper.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/arch/i386/kernel/mcount-wrapper.S 2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,27 @@
+/*
+ * linux/arch/i386/kernel/mcount-wrapper.S
+ *
+ * Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+mcount:
+
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %ebp
+ mov %esp, %ebp
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ call __mcount
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ popl %ebp
+out:
+ ret
+
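
(An aside, not from the patch: the wrapper above hooks the mcount() call
that gcc's profiling instrumentation, presumably the -pg switch, inserts
near the top of every compiled function. A quick way to see that
instrumentation on a trivial file:)

/*
 * mcount-demo.c - illustrative only, not part of tracer.diff.
 *
 * Compile with "gcc -pg -S mcount-demo.c" and look at mcount-demo.s:
 * each function gets a call to mcount right after its prologue, which
 * is the call the wrapper above intercepts in the kernel build.
 */
int add(int a, int b)
{
	return a + b;
}

int main(void)
{
	return add(1, 2) == 3 ? 0 : 1;
}
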
Index: linux-2.6.22-rc2/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/traps.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/traps.c 2007-05-24 15:57:42.000000000 +0200
@@ -136,7 +136,7 @@ static inline unsigned long print_contex

#define MSG(msg) ops->warning(data, msg)

-void dump_trace(struct task_struct *task, struct pt_regs *regs,
+void notrace dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack,
struct stacktrace_ops *ops, void *data)
{
@@ -222,6 +222,7 @@ show_trace_log_lvl(struct task_struct *t
{
dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
+ print_traces(task);
}

void show_trace(struct task_struct *task, struct pt_regs *regs,
Index: linux-2.6.22-rc2/arch/i386/kernel/tsc.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/tsc.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/tsc.c 2007-05-24 15:57:42.000000000 +0200
@@ -255,7 +255,7 @@ core_initcall(cpufreq_tsc);

static unsigned long current_tsc_khz = 0;

-static cycle_t read_tsc(void)
+static notrace cycle_t read_tsc(void)
{
cycle_t ret;

Index: linux-2.6.22-rc2/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/mm/fault.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/mm/fault.c 2007-05-24 15:57:42.000000000 +0200
@@ -483,6 +483,7 @@ bad_area_nosemaphore:
nr = (address - idt_descr.address) >> 3;

if (nr == 6) {
+ stop_trace();
do_invalid_op(regs, 0);
return;
}
Index: linux-2.6.22-rc2/arch/i386/mm/init.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/mm/init.c 2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/mm/init.c 2007-05-24 15:57:42.000000000 +0200
@@ -193,7 +193,7 @@ static inline int page_kills_ppro(unsign
return 0;
}

-int page_is_ram(unsigned long pagenr)
+int notrace page_is_ram(unsigned long pagenr)
{
int i;
unsigned long addr, end;
Index: linux-2.6.22-rc2/include/asm-i386/processor.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-i386/processor.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-i386/processor.h 2007-05-24 15:57:43.000000000 +0200
@@ -602,7 +602,9 @@ static inline void load_esp0(struct tss_
* clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
* resulting in stale register contents being returned.
*/
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+static inline void
+cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = 0;
@@ -610,8 +612,9 @@ static inline void cpuid(unsigned int op
}

/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
- int *edx)
+static inline void
+cpuid_count(int op, int count, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = count;
Index: linux-2.6.22-rc2/arch/x86_64/ia32/ia32entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/ia32/ia32entry.S 2007-05-22 16:25:09.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/ia32/ia32entry.S 2007-05-24 15:57:42.000000000 +0200
@@ -120,7 +120,9 @@ sysenter_do_call:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
IA32_ARG_FIXUP 1
+ TRACE_SYS_IA32_CALL
call *ia32_sys_call_table(,%rax,8)
+ TRACE_SYS_RET
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
cli
@@ -229,7 +231,9 @@ cstar_do_call:
cmpl $IA32_NR_syscalls-1,%eax
ja ia32_badsys
IA32_ARG_FIXUP 1
+ TRACE_SYS_IA32_CALL
call *ia32_sys_call_table(,%rax,8)
+ TRACE_SYS_RET
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
cli
@@ -323,8 +327,10 @@ ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
IA32_ARG_FIXUP
+ TRACE_SYS_IA32_CALL
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
+ TRACE_SYS_RET
movq %rax,RAX-ARGOFFSET(%rsp)
jmp int_ret_from_sys_call

@@ -394,7 +400,7 @@ END(ia32_ptregs_common)

.section .rodata,"a"
.align 8
-ia32_sys_call_table:
+ENTRY(ia32_sys_call_table)
.quad sys_restart_syscall
.quad sys_exit
.quad stub32_fork
@@ -719,4 +725,7 @@ ia32_sys_call_table:
.quad compat_sys_signalfd
.quad compat_sys_timerfd
.quad sys_eventfd
+#ifdef CONFIG_EVENT_TRACE
+ .globl ia32_syscall_end
+#endif
ia32_syscall_end:
Index: linux-2.6.22-rc2/arch/x86_64/kernel/entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/entry.S 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/entry.S 2007-05-24 15:57:42.000000000 +0200
@@ -53,6 +53,47 @@

.code64

+#ifdef CONFIG_EVENT_TRACE
+
+ENTRY(mcount)
+ cmpl $0, mcount_enabled
+ jz out
+
+ push %rbp
+ mov %rsp,%rbp
+
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rdx
+ push %rcx
+ push %rax
+
+ mov 0x0(%rbp),%rax
+ mov 0x8(%rbp),%rdi
+ mov 0x8(%rax),%rsi
+
+ call __trace
+
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+
+ pop %rbp
+out:
+ ret
+
+#endif
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -234,7 +275,9 @@ ENTRY(system_call)
cmpq $__NR_syscall_max,%rax
ja badsys
movq %r10,%rcx
+ TRACE_SYS_CALL
call *sys_call_table(,%rax,8) # XXX: rip relative
+ TRACE_SYS_RET
movq %rax,RAX-ARGOFFSET(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
@@ -316,7 +359,9 @@ tracesys:
cmova %rcx,%rax
ja 1f
movq %r10,%rcx /* fixup for C */
+ TRACE_SYS_CALL
call *sys_call_table(,%rax,8)
+ TRACE_SYS_RET
1: movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */

Index: linux-2.6.22-rc2/arch/x86_64/kernel/head64.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/head64.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/head64.c 2007-05-24 15:57:42.000000000 +0200
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/percpu.h>
+#include <linux/sched.h>

#include <asm/processor.h>
#include <asm/proto.h>
@@ -58,7 +59,7 @@ static void __init copy_bootdata(char *r
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}

-void __init x86_64_start_kernel(char * real_mode_data)
+void __init notrace x86_64_start_kernel(char * real_mode_data)
{
int i;

Index: linux-2.6.22-rc2/arch/x86_64/kernel/irq.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/irq.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/irq.c 2007-05-24 15:57:42.000000000 +0200
@@ -115,6 +115,12 @@ asmlinkage unsigned int do_IRQ(struct pt
irq_enter();
irq = __get_cpu_var(vector_irq)[vector];

+#ifdef CONFIG_EVENT_TRACE
+ if (irq == trace_user_trigger_irq)
+ user_trace_start();
+#endif
+ trace_special(regs->rip, irq, 0);
+
#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
#endif
Index: linux-2.6.22-rc2/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/setup64.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/setup64.c 2007-05-24 15:57:42.000000000 +0200
@@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void)
}
}

-void pda_init(int cpu)
+void notrace pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);

@@ -188,7 +188,7 @@ unsigned long kernel_eflags;
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __cpuinit cpu_init (void)
+void __cpuinit notrace cpu_init (void)
{
int cpu = stack_smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
Index: linux-2.6.22-rc2/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/smpboot.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/smpboot.c 2007-05-24 15:57:42.000000000 +0200
@@ -318,7 +318,7 @@ static inline void set_cpu_sibling_map(i
/*
* Setup code on secondary processor (after comming out of the trampoline)
*/
-void __cpuinit start_secondary(void)
+void __cpuinit notrace start_secondary(void)
{
/*
* Dont put anything before smp_callin(), SMP
Index: linux-2.6.22-rc2/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/traps.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/traps.c 2007-05-24 15:57:42.000000000 +0200
@@ -346,6 +346,7 @@ show_trace(struct task_struct *tsk, stru
printk("\nCall Trace:\n");
dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
printk("\n");
+ print_traces(tsk);
}

static void
Index: linux-2.6.22-rc2/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/vsyscall.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/vsyscall.c 2007-05-24 15:57:42.000000000 +0200
@@ -43,7 +43,7 @@
#include <asm/desc.h>
#include <asm/topology.h>

-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","rcx","memory"
#define __pa_vsymbol(x) \
({unsigned long v; \
Index: linux-2.6.22-rc2/include/asm-x86_64/calling.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-x86_64/calling.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-x86_64/calling.h 2007-05-24 15:57:43.000000000 +0200
@@ -160,3 +160,53 @@
.macro icebp
.byte 0xf1
.endm
+
+/*
+ * latency-tracing helpers:
+ */
+
+ .macro TRACE_SYS_CALL
+
+#ifdef CONFIG_EVENT_TRACE
+ SAVE_ARGS
+
+ mov %rdx, %rcx
+ mov %rsi, %rdx
+ mov %rdi, %rsi
+ mov %rax, %rdi
+
+ call sys_call
+
+ RESTORE_ARGS
+#endif
+ .endm
+
+
+ .macro TRACE_SYS_IA32_CALL
+
+#ifdef CONFIG_EVENT_TRACE
+ SAVE_ARGS
+
+ mov %rdx, %rcx
+ mov %rsi, %rdx
+ mov %rdi, %rsi
+ mov %rax, %rdi
+
+ call sys_ia32_call
+
+ RESTORE_ARGS
+#endif
+ .endm
+
+ .macro TRACE_SYS_RET
+
+#ifdef CONFIG_EVENT_TRACE
+ SAVE_ARGS
+
+ mov %rax, %rdi
+
+ call sys_ret
+
+ RESTORE_ARGS
+#endif
+ .endm
Index: linux-2.6.22-rc2/include/asm-x86_64/unistd.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-x86_64/unistd.h 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-x86_64/unistd.h 2007-05-24 15:57:43.000000000 +0200
@@ -11,6 +11,8 @@
* Note: holes are not allowed.
*/

+#define NR_syscalls (__NR_syscall_max+1)
+
/* at least 8 syscall per cacheline */
#define __NR_read 0
__SYSCALL(__NR_read, sys_read)
Index: linux-2.6.22-rc2/include/linux/prctl.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/prctl.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/prctl.h 2007-05-24 15:57:43.000000000 +0200
@@ -3,6 +3,7 @@

/* Values to pass as first argument to prctl() */

+#define PR_SET_TRACING 0 /* Second arg is tracing on/off */
#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */
#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */

Index: linux-2.6.22-rc2/kernel/sys.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sys.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sys.c 2007-05-24 15:57:42.000000000 +0200
@@ -2149,6 +2149,14 @@ asmlinkage long sys_prctl(int option, un
{
long error;

+#ifdef CONFIG_EVENT_TRACE
+ if (option == PR_SET_TRACING) {
+ if (arg2)
+ return user_trace_start();
+ return user_trace_stop();
+ }
+#endif
+
error = security_task_prctl(option, arg2, arg3, arg4, arg5);
if (error)
return error;
Index: linux-2.6.22-rc2/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/time/timekeeping.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/time/timekeeping.c 2007-05-24 15:57:42.000000000 +0200
@@ -195,6 +195,34 @@ static void change_clocksource(void)
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
}
+
+cycle_t notrace get_monotonic_cycles(void)
+{
+ cycle_t cycle_now, cycle_delta;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ return clock->cycle_last + cycle_delta;
+}
+
+unsigned long notrace cycles_to_usecs(cycle_t cycles)
+{
+ u64 ret = cyc2ns(clock, cycles);
+
+ do_div(ret, 1000);
+
+ return ret;
+}
+
+cycle_t notrace usecs_to_cycles(unsigned long usecs)
+{
+ return ns2cyc(clock, (u64)usecs * 1000);
+}
+
#else
static inline void change_clocksource(void) { }
#endif
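
(Illustration only, not from the patch: cycles_to_usecs() above is a thin
wrapper around the usual clocksource scaling, nanoseconds = (cycles * mult)
>> shift, divided by 1000 as the do_div() shows. A self-contained sketch of
the same arithmetic, with made-up mult/shift values for a hypothetical
1 MHz clocksource:)

/* cyc2us-demo.c - illustrative only, values below are made up */
#include <stdio.h>
#include <stdint.h>

/* same scaling a clocksource uses: ns = (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	/* hypothetical 1 MHz clocksource: one cycle is 1000 ns */
	uint32_t mult = 1000 << 10;
	uint32_t shift = 10;
	uint64_t cycles = 2500;

	printf("%llu cycles ~ %llu usecs\n",
	       (unsigned long long)cycles,
	       (unsigned long long)(cyc2ns(cycles, mult, shift) / 1000));
	return 0;
}
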
Index: linux-2.6.22-rc2/arch/x86_64/kernel/stacktrace.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/stacktrace.c 2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/stacktrace.c 2007-05-24 15:57:42.000000000 +0200
@@ -24,7 +24,7 @@ static int save_stack_stack(void *data,
return -1;
}

-static void save_stack_address(void *data, unsigned long addr)
+static void notrace save_stack_address(void *data, unsigned long addr)
{
struct stack_trace *trace = (struct stack_trace *)data;
if (trace->skip > 0) {
Index: linux-2.6.22-rc2/kernel/hrtimer.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/hrtimer.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/hrtimer.c 2007-05-24 15:57:42.000000000 +0200
@@ -553,6 +553,8 @@ static inline int hrtimer_enqueue_reprog
return 0;
}

+void trace_start_ht_debug(void);
+
/*
* Switch to high resolution mode
*/
@@ -576,6 +578,9 @@ static int hrtimer_switch_to_hres(void)

tick_setup_sched_timer();

+ if (!cpu)
+ trace_start_ht_debug();
+
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
Index: linux-2.6.22-rc2/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/time/tick-sched.c 2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/time/tick-sched.c 2007-05-24 15:57:42.000000000 +0200
@@ -167,9 +167,21 @@ void tick_nohz_stop_sched_tick(void)
goto end;

cpu = smp_processor_id();
- if (unlikely(local_softirq_pending()))
- printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- local_softirq_pending());
+ if (unlikely(local_softirq_pending())) {
+ static int ratelimit;
+
+ if (ratelimit < 10) {
+ if (!cpu) {
+ trace_special(0, 0, 0);
+ user_trace_stop();
+ ratelimit = 10;
+ }
+ printk(KERN_ERR
+ "NOHZ: local_softirq_pending %02x on CPU %d\n",
+ local_softirq_pending(), cpu);
+ ratelimit++;
+ }
+ }

now = ktime_get();
/*