[PATCH v4 1/4] ftrace: Implement fs notification for tracing_max_latency

From: Viktor Rosendahl
Date: Fri May 17 2019 - 16:36:50 EST


This patch implements the feature that the tracing_max_latency file,
e.g. /sys/kernel/debug/tracing/tracing_max_latency will receive
notifications through the fsnotify framework when a new latency is
available.

One particularly interesting use of this facility is when enabling
threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh,
together with the preempt/irqsoff tracers. This makes it possible to
implement a user space program that can, with equal probability,
obtain traces of latencies that occur immediately after each other in
spite of the fact that the preempt/irqsoff tracers operate in overwrite
mode.

This facility works with the hwlat, preempt/irqsoff, and wakeup
tracers.

This patch also adds four new tracepoints, for entering/exiting
__schedule() and do_idle(). Those tracepoints are needed to disable
and enable the notification facility because we cannot wake up the
workqueue from these critical sections without risking a deadlock.
Similar problems would also arise if we try to schedule a tasklet,
raise a softirq, or wake up a kernel thread.

If a notification event would happen in the forbidden sections, we
schedule the fsnotify work as soon as we have exited
do_idle()/__schedule().

Signed-off-by: Viktor Rosendahl <viktor.rosendahl@xxxxxxxxx>
---
include/trace/events/power.h | 40 +++++++++
include/trace/events/sched.h | 40 +++++++++
kernel/sched/core.c | 2 +
kernel/sched/idle.c | 2 +
kernel/trace/trace.c | 168 ++++++++++++++++++++++++++++++++++-
kernel/trace/trace.h | 12 +++
kernel/trace/trace_hwlat.c | 4 +-
7 files changed, 265 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index f7aece721aed..40ab747274bd 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -40,6 +40,46 @@ DEFINE_EVENT(cpu, cpu_idle,
TP_ARGS(state, cpu_id)
);

+/*
+ * Tracepoint for entering do_idle():
+ */
+TRACE_EVENT(do_idle_enter,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
+
+/*
+ * Tracepoint for exiting do_idle():
+ */
+TRACE_EVENT(do_idle_exit,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
+
TRACE_EVENT(powernv_throttle,

TP_PROTO(int chip_id, const char *reason, int pmax),
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c8c7c7efb487..a1a1befea1c1 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -183,6 +183,46 @@ TRACE_EVENT(sched_switch,
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

+/*
+ * Tracepoint for entering __schedule():
+ */
+TRACE_EVENT(sched_schedule_enter,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
+
+/*
+ * Tracepoint for exiting __schedule():
+ */
+TRACE_EVENT(sched_schedule_exit,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
+
/*
* Tracepoint for a task being migrated:
*/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 102dfcf0a29a..c9d86fcc48f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3374,6 +3374,7 @@ static void __sched notrace __schedule(bool preempt)
int cpu;

cpu = smp_processor_id();
+ trace_sched_schedule_enter(cpu);
rq = cpu_rq(cpu);
prev = rq->curr;

@@ -3448,6 +3449,7 @@ static void __sched notrace __schedule(bool preempt)
}

balance_callback(rq);
+ trace_sched_schedule_exit(cpu);
}

void __noreturn do_task_dead(void)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..e328e045c6e8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -224,6 +224,7 @@ static void cpuidle_idle_call(void)
static void do_idle(void)
{
int cpu = smp_processor_id();
+ trace_do_idle_enter(cpu);
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -287,6 +288,7 @@ static void do_idle(void)

if (unlikely(klp_patch_pending(current)))
klp_update_patch_state(current);
+ trace_do_idle_exit(cpu);
}

bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c92b3d9ea30..72ac53a47b1e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,10 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/workqueue.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>

#include "trace.h"
#include "trace_output.h"
@@ -1481,6 +1485,166 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)

unsigned long __read_mostly tracing_thresh;

+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+static const struct file_operations tracing_max_lat_fops;
+static struct workqueue_struct *fsnotify_wq;
+static DEFINE_PER_CPU(unsigned int, notify_disabled) = 1;
+static DEFINE_PER_CPU(struct llist_head, notify_list);
+
+static void trace_fsnotify_workfn(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array,
+ fsnotify_work);
+ fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
+ tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+static void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ INIT_WORK(&tr->fsnotify_work, trace_fsnotify_workfn);
+ atomic_set(&tr->notify_pending, 0);
+ tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+ d_tracer, &tr->max_latency,
+ &tracing_max_lat_fops);
+}
+
+/*
+ * Disable fsnotify while in scheduler and idle code. Trying to wake anything
+ * up from there, such as calling queue_work(), is prone to deadlock.
+ */
+static inline void notrace trace_disable_fsnotify(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ per_cpu(notify_disabled, cpu) = 1;
+ raw_local_irq_restore(flags);
+}
+
+static inline void notrace trace_enable_fsnotify(void)
+{
+ int cpu;
+ struct trace_array *tr;
+ struct llist_head *list;
+ struct llist_node *node;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ per_cpu(notify_disabled, cpu) = 0;
+ raw_local_irq_restore(flags);
+
+ if (!per_cpu(notify_disabled, cpu)) {
+ list = &per_cpu(notify_list, cpu);
+ for (node = llist_del_first(list); node != NULL;
+ node = llist_del_first(list)) {
+ tr = llist_entry(node, struct trace_array, notify_ll);
+ atomic_set(&tr->notify_pending, 0);
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+ }
+ }
+}
+
+static void notrace probe_sched_enter(void *nihil, int cpu)
+{
+ trace_disable_fsnotify();
+}
+
+static void notrace probe_sched_exit(void *nihil, int cpu)
+{
+ trace_enable_fsnotify();
+}
+
+__init static int trace_maxlat_fsnotify_init(void)
+{
+ int ret;
+
+ ret = register_trace_sched_schedule_enter(probe_sched_enter, NULL);
+ if (ret) {
+ pr_info("tracing_max_latency: Could not activate tracepoint probe to sched_schedule_enter\n");
+ return ret;
+ }
+
+ ret = register_trace_sched_schedule_exit(probe_sched_exit, NULL);
+ if (ret) {
+ pr_info("tracing_max_latency: Could not activate tracepoint probe to sched_schedule_exit\n");
+ goto fail_deprobe_sched_enter;
+ }
+
+ ret = register_trace_do_idle_enter(probe_sched_enter, NULL);
+ if (ret) {
+ pr_info("tracing_max_latency: Could not activate tracepoint probe to do_idle_enter\n");
+ goto fail_deprobe_sched_exit;
+ }
+
+ ret = register_trace_do_idle_exit(probe_sched_exit, NULL);
+ if (ret) {
+ pr_info("tracing_max_latency: Could not activate tracepoint probe to do_idle_exit\n");
+ goto fail_deprobe_idle_enter;
+ }
+
+ fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (fsnotify_wq)
+ return 0;
+
+ ret = -ENOMEM;
+
+ pr_err("Unable to allocate tr_max_lat_wq\n");
+
+ unregister_trace_do_idle_exit(probe_sched_exit, NULL);
+
+fail_deprobe_idle_enter:
+ unregister_trace_do_idle_enter(probe_sched_enter, NULL);
+
+fail_deprobe_sched_exit:
+ unregister_trace_sched_schedule_exit(probe_sched_exit, NULL);
+
+fail_deprobe_sched_enter:
+ unregister_trace_sched_schedule_enter(probe_sched_enter, NULL);
+ return ret;
+}
+
+late_initcall_sync(trace_maxlat_fsnotify_init);
+
+void trace_maxlat_fsnotify(struct trace_array *tr)
+{
+ int cpu;
+
+ if (!fsnotify_wq)
+ return;
+
+ cpu = smp_processor_id();
+ if (!per_cpu(notify_disabled, cpu))
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+ else {
+ /*
+ * notify_pending prevents us from adding the same entry to
+ * more than one notify_list. It will get queued in
+ * trace_enable_fsnotify()
+ */
+ if (!atomic_xchg(&tr->notify_pending, 1))
+ llist_add(&tr->notify_ll, &per_cpu(notify_list, cpu));
+ }
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+#define trace_create_maxlat_file(tr, d_tracer) \
+ trace_create_file("tracing_max_latency", 0644, d_tracer, \
+ &tr->max_latency, &tracing_max_lat_fops)
+
+#endif
+
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Copy the new maximum trace into the separate maximum-trace
@@ -1519,6 +1683,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)

/* record this tasks comm */
tracing_record_cmdline(tsk);
+ trace_maxlat_fsnotify(tr);
}

/**
@@ -8533,8 +8698,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);

#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tr->max_latency, &tracing_max_lat_fops);
+ trace_create_maxlat_file(tr, d_tracer);
#endif

if (ftrace_create_function_files(tr, d_tracer))
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1974ce818ddb..f95027813296 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -17,6 +17,7 @@
#include <linux/compiler.h>
#include <linux/trace_seq.h>
#include <linux/glob.h>
+#include <linux/workqueue.h>

#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -265,6 +266,12 @@ struct trace_array {
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
+#ifdef CONFIG_FSNOTIFY
+ struct dentry *d_max_latency;
+ struct work_struct fsnotify_work;
+ atomic_t notify_pending;
+ struct llist_node notify_ll;
+#endif
#endif
struct trace_pid_list __rcu *filtered_pids;
/*
@@ -786,6 +793,11 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */

+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+void trace_maxlat_fsnotify(struct trace_array *tr);
+#endif
+
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..44a47f81d949 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -254,8 +254,10 @@ static int get_sample(void)
trace_hwlat_sample(&s);

/* Keep a running maximum ever recorded hardware latency */
- if (sample > tr->max_latency)
+ if (sample > tr->max_latency) {
tr->max_latency = sample;
+ trace_maxlat_fsnotify(tr);
+ }
}

out:
--
2.17.1