[PATCH v5 1/4] ftrace: Implement fs notification for tracing_max_latency

From: Viktor Rosendahl
Date: Tue Sep 03 2019 - 09:26:31 EST


This patch implements a new feature: the tracing_max_latency file,
e.g. /sys/kernel/debug/tracing/tracing_max_latency, will receive
notifications through the fsnotify framework when a new maximum
latency is available.

One particularly interesting use of this facility is to enable
threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh,
together with the preempt/irqsoff tracers. This makes it possible to
implement a user space program that can, with equal probability,
obtain traces of latencies that occur immediately after each other,
in spite of the fact that the preempt/irqsoff tracers operate in
overwrite mode.

This facility works with the hwlat, preempt/irqsoff, and wakeup
tracers.
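
For illustration, a minimal sketch of such a user space consumer could
look like the one below. It assumes that the FS_MODIFY events generated
by this patch are visible through the regular inotify interface; the
hard-coded path and the missing trace-saving logic are placeholders:

/*
 * Hypothetical watcher (not part of this patch): block on inotify
 * until the kernel signals that a new max latency is available.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd,
	    "/sys/kernel/debug/tracing/tracing_max_latency",
	    IN_MODIFY) < 0) {
		perror("inotify");
		return 1;
	}
	for (;;) {
		/* Returns when fsnotify reports FS_MODIFY on the file. */
		if (read(fd, buf, sizeof(buf)) < 0) {
			perror("read");
			return 1;
		}
		/* Save the trace here, then write 0 to
		 * tracing_max_latency to re-arm for the next one. */
		puts("new max latency available");
	}
}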

This patch also adds some unfortunate calls from __schedule() and
do_idle(). These calls to latency_fsnotify_disable/enable() are needed
because we cannot wake up the workqueue from those critical sections
without risking a deadlock. Similar problems would arise if we tried
to schedule a tasklet, raise a softirq, or wake up a kernel thread.
If a notification event occurs inside one of the forbidden sections,
the fsnotify work is scheduled as soon as we have exited them, as the
sketch below illustrates.
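
In miniature, the deferral works like this (a sketch of the call flow,
not the exact code added below):

	latency_fsnotify_disable();	/* enter a forbidden section */
	...
	latency_fsnotify(tr);		/* must not queue_work() here; tr is
					   stashed on a per-CPU llist instead */
	...
	latency_fsnotify_enable();	/* leave the section; drains the llist
					   and queues the deferred work */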

There was a suggestion to remove this latency_fsnotify_enable/disable()
gunk, or at least to combine it with start_critical_timings() and
stop_critical_timings(). However, I have not been able to come up with
a way to do it.

It seems like it would be possible to simply replace the calls to
latency_fsnotify_enable/disable() with calls to
start/stop_critical_timings(). However, the main problem is that this
would not work for the wakeup tracer. The wakeup tracer needs a
facility that postpones the notifications, not one that prevents the
measurements, because all of its measurements take place in the middle
of __schedule(). On the other hand, in some places, like in idle and
the console code, we need start/stop functions that prevent the
measurements from being made.

Signed-off-by: Viktor Rosendahl <viktor.rosendahl@xxxxxxxxx>
---
include/linux/ftrace.h | 31 +++++++++
kernel/sched/core.c | 3 +
kernel/sched/idle.c | 3 +
kernel/sched/sched.h | 1 +
kernel/trace/trace.c | 112 +++++++++++++++++++++++++++++-
kernel/trace/trace.h | 22 ++++++
kernel/trace/trace_hwlat.c | 4 +-
kernel/trace/trace_irqsoff.c | 4 ++
kernel/trace/trace_sched_wakeup.c | 4 ++
9 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8a8cb3c401b2..b4d9700ef917 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -907,4 +907,35 @@ unsigned long arch_syscall_addr(int nr);

#endif /* CONFIG_FTRACE_SYSCALLS */

+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+DECLARE_PER_CPU(int, latency_notify_disable);
+DECLARE_STATIC_KEY_FALSE(latency_notify_key);
+
+void latency_fsnotify_process(void);
+
+/*
+ * Disable/enable fsnotify while in scheduler and idle code. Trying to wake
+ * anything up from there, such as calling queue_work(), is prone to deadlock.
+ */
+static inline void latency_fsnotify_disable(void)
+{
+ this_cpu_inc(latency_notify_disable);
+}
+
+static inline void latency_fsnotify_enable(void)
+{
+ this_cpu_dec(latency_notify_disable);
+ if (static_branch_unlikely(&latency_notify_key))
+ latency_fsnotify_process();
+}
+
+#else
+
+#define latency_fsnotify_disable() do { } while (0)
+#define latency_fsnotify_enable() do { } while (0)
+
+#endif
+
#endif /* _LINUX_FTRACE_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..e3c1dc801073 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3198,6 +3198,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/

rq = finish_task_switch(prev);
+ latency_fsnotify_enable();
balance_callback(rq);
preempt_enable();

@@ -3820,6 +3821,7 @@ static void __sched notrace __schedule(bool preempt)

local_irq_disable();
rcu_note_context_switch(preempt);
+ latency_fsnotify_disable();

/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -3883,6 +3885,7 @@ static void __sched notrace __schedule(bool preempt)
rq_unlock_irq(rq, &rf);
}

+ latency_fsnotify_enable();
balance_callback(rq);
}

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..5fc87d99a407 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -236,6 +236,7 @@ static void do_idle(void)

__current_set_polling();
tick_nohz_idle_enter();
+ latency_fsnotify_disable();

while (!need_resched()) {
check_pgt_cache();
@@ -265,6 +266,8 @@ static void do_idle(void)
arch_cpu_idle_exit();
}

+ latency_fsnotify_enable();
+
/*
* Since we fell out of the loop above, we know TIF_NEED_RESCHED must
* be set, propagate it into PREEMPT_NEED_RESCHED.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..467d6ad03f16 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -46,6 +46,7 @@
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/energy_model.h>
+#include <linux/ftrace.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 563e80f9006a..a622263a69e4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,10 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/workqueue.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>

#include "trace.h"
#include "trace_output.h"
@@ -1480,6 +1484,110 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)

unsigned long __read_mostly tracing_thresh;

+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+static const struct file_operations tracing_max_lat_fops;
+static struct workqueue_struct *fsnotify_wq;
+static DEFINE_PER_CPU(struct llist_head, notify_list);
+
+DEFINE_PER_CPU(int, latency_notify_disable);
+DEFINE_STATIC_KEY_FALSE(latency_notify_key);
+
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array,
+ fsnotify_work);
+ fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
+ tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+static void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+ atomic_set(&tr->notify_pending, 0);
+ tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+ d_tracer, &tr->max_latency,
+ &tracing_max_lat_fops);
+}
+
+void latency_fsnotify_stop(void)
+{
+ /* Make sure all CPUs see caller's previous actions to stop tracer */
+ smp_wmb();
+ static_branch_disable(&latency_notify_key);
+ latency_fsnotify_process();
+}
+
+void latency_fsnotify_start(void)
+{
+ static_branch_enable(&latency_notify_key);
+ /* Make sure all CPUs see the key value before the caller continues */
+ smp_wmb();
+}
+
+void latency_fsnotify_process(void)
+{
+ struct trace_array *tr;
+ struct llist_head *list;
+ struct llist_node *node;
+
+ if (this_cpu_read(latency_notify_disable))
+ return;
+
+ list = this_cpu_ptr(&notify_list);
+ for (node = llist_del_first(list); node != NULL;
+ node = llist_del_first(list)) {
+ tr = llist_entry(node, struct trace_array, notify_ll);
+ atomic_set(&tr->notify_pending, 0);
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+ }
+}
+
+__init static int latency_fsnotify_init(void)
+{
+ fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!fsnotify_wq) {
+ pr_err("Unable to allocate tr_max_lat_wq\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+void latency_fsnotify(struct trace_array *tr)
+{
+ if (!fsnotify_wq)
+ return;
+
+ if (!this_cpu_read(latency_notify_disable))
+ queue_work(fsnotify_wq, &tr->fsnotify_work);
+ else {
+ /*
+ * notify_pending prevents us from adding the same entry to
+ * more than one notify_list. It will get queued in
+ * latency_fsnotify_enable()
+ */
+ if (!atomic_xchg(&tr->notify_pending, 1))
+ llist_add(&tr->notify_ll, this_cpu_ptr(&notify_list));
+ }
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+#define trace_create_maxlat_file(tr, d_tracer) \
+ trace_create_file("tracing_max_latency", 0644, d_tracer, \
+ &tr->max_latency, &tracing_max_lat_fops)
+
+#endif
+
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Copy the new maximum trace into the separate maximum-trace
@@ -1518,6 +1626,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)

/* record this tasks comm */
tracing_record_cmdline(tsk);
+ latency_fsnotify(tr);
}

/**
@@ -8550,8 +8659,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);

#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tr->max_latency, &tracing_max_lat_fops);
+ trace_create_maxlat_file(tr, d_tracer);
#endif

if (ftrace_create_function_files(tr, d_tracer))
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..d9f83b2aaa71 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -16,6 +16,7 @@
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/glob.h>
+#include <linux/workqueue.h>

#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -264,6 +265,12 @@ struct trace_array {
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
+#ifdef CONFIG_FSNOTIFY
+ struct dentry *d_max_latency;
+ struct work_struct fsnotify_work;
+ atomic_t notify_pending;
+ struct llist_node notify_ll;
+#endif
#endif
struct trace_pid_list __rcu *filtered_pids;
/*
@@ -785,6 +792,21 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */

+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+void latency_fsnotify(struct trace_array *tr);
+void latency_fsnotify_start(void);
+void latency_fsnotify_stop(void);
+
+#else
+
+#define latency_fsnotify(tr) do { } while (0)
+#define latency_fsnotify_start() do { } while (0)
+#define latency_fsnotify_stop() do { } while (0)
+
+#endif
+
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index fa95139445b2..9c379261ee89 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -254,8 +254,10 @@ static int get_sample(void)
trace_hwlat_sample(&s);

/* Keep a running maximum ever recorded hardware latency */
- if (sample > tr->max_latency)
+ if (sample > tr->max_latency) {
tr->max_latency = sample;
+ latency_fsnotify(tr);
+ }
}

out:
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a745b0cee5d3..29403a83a5f0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -557,6 +557,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
if (irqsoff_busy)
return -EBUSY;

+ latency_fsnotify_start();
save_flags = tr->trace_flags;

/* non overwrite screws up the latency tracers */
@@ -591,16 +592,19 @@ static void __irqsoff_tracer_reset(struct trace_array *tr)
ftrace_reset_array_ops(tr);

irqsoff_busy = false;
+ latency_fsnotify_stop();
}

static void irqsoff_tracer_start(struct trace_array *tr)
{
+ latency_fsnotify_start();
tracer_enabled = 1;
}

static void irqsoff_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
+ latency_fsnotify_stop();
}

#ifdef CONFIG_IRQSOFF_TRACER
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..3dc90d9f605b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -669,6 +669,7 @@ static bool wakeup_busy;

static int __wakeup_tracer_init(struct trace_array *tr)
{
+ latency_fsnotify_start();
save_flags = tr->trace_flags;

/* non overwrite screws up the latency tracers */
@@ -727,10 +728,12 @@ static void wakeup_tracer_reset(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
ftrace_reset_array_ops(tr);
wakeup_busy = false;
+ latency_fsnotify_stop();
}

static void wakeup_tracer_start(struct trace_array *tr)
{
+ latency_fsnotify_start();
wakeup_reset(tr);
tracer_enabled = 1;
}
@@ -738,6 +741,7 @@ static void wakeup_tracer_start(struct trace_array *tr)
static void wakeup_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
+ latency_fsnotify_stop();
}

static struct tracer wakeup_tracer __read_mostly =
--
2.17.1