[for-next][PATCH 5/8] rtla/timerlat: Add auto-analysis support to timerlat top

From: Steven Rostedt
Date: Sat Feb 18 2023 - 15:19:53 EST


From: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>

Currently, timerlat top displays the timerlat tracer latency results, saving
the intuitive timerlat trace for the developer to analyze.

This patch goes a step forward in the automaton of the scheduling latency
analysis by providing a summary of the root cause of a latency higher than
the passed "stop tracing" parameter if the trace stops.

The output is intuitive enough for non-expert users to have a general idea
of the root cause by looking at each factor's contribution percentage while
keeping the technical detail in the output for more expert users to start
an in dept debug or to correlate a root cause with an existing one.

The terminology is in line with recent industry and academic publications
to facilitate the understanding of both audiences.

Here is one example of tool output:
----------------------------------------- %< -----------------------------------------------------
# taskset -c 0 timerlat -a 40 -c 1-23 -q
Timer Latency
0 00:00:12 | IRQ Timer Latency (us) | Thread Timer Latency (us)
CPU COUNT | cur min avg max | cur min avg max
1 #12322 | 0 0 1 15 | 10 3 9 31
2 #12322 | 3 0 1 12 | 10 3 9 23
3 #12322 | 1 0 1 21 | 8 2 8 34
4 #12322 | 1 0 1 17 | 10 2 11 33
5 #12322 | 0 0 1 12 | 8 3 8 25
6 #12322 | 1 0 1 14 | 16 3 11 35
7 #12322 | 0 0 1 14 | 9 2 8 29
8 #12322 | 1 0 1 22 | 9 3 9 34
9 #12322 | 0 0 1 14 | 8 2 8 24
10 #12322 | 1 0 0 12 | 9 3 8 24
11 #12322 | 0 0 0 15 | 6 2 7 29
12 #12321 | 1 0 0 13 | 5 3 8 23
13 #12319 | 0 0 1 14 | 9 3 9 26
14 #12321 | 1 0 0 13 | 6 2 8 24
15 #12321 | 1 0 1 15 | 12 3 11 27
16 #12318 | 0 0 1 13 | 7 3 10 24
17 #12319 | 0 0 1 13 | 11 3 9 25
18 #12318 | 0 0 0 12 | 8 2 8 20
19 #12319 | 0 0 1 18 | 10 2 9 28
20 #12317 | 0 0 0 20 | 9 3 8 34
21 #12318 | 0 0 0 13 | 8 3 8 28
22 #12319 | 0 0 1 11 | 8 3 10 22
23 #12320 | 28 0 1 28 | 41 3 11 41
rtla timerlat hit stop tracing
## CPU 23 hit stop tracing, analyzing it ##
IRQ handler delay: 27.49 us (65.52 %)
IRQ latency: 28.13 us
Timerlat IRQ duration: 9.59 us (22.85 %)
Blocking thread: 3.79 us (9.03 %)
objtool:49256 3.79 us
Blocking thread stacktrace
-> timerlat_irq
-> __hrtimer_run_queues
-> hrtimer_interrupt
-> __sysvec_apic_timer_interrupt
-> sysvec_apic_timer_interrupt
-> asm_sysvec_apic_timer_interrupt
-> _raw_spin_unlock_irqrestore
-> cgroup_rstat_flush_locked
-> cgroup_rstat_flush_irqsafe
-> mem_cgroup_flush_stats
-> mem_cgroup_wb_stats
-> balance_dirty_pages
-> balance_dirty_pages_ratelimited_flags
-> btrfs_buffered_write
-> btrfs_do_write_iter
-> vfs_write
-> __x64_sys_pwrite64
-> do_syscall_64
-> entry_SYSCALL_64_after_hwframe
------------------------------------------------------------------------
Thread latency: 41.96 us (100%)

The system has exit from idle latency!
Max timerlat IRQ latency from idle: 17.48 us in cpu 4
Saving trace to timerlat_trace.txt
----------------------------------------- >% -----------------------------------------------------

In this case, the major factor was the delay suffered by the IRQ handler
that handles timerlat wakeup: 65.52 %. This can be caused by the
current thread masking interrupts, which can be seen in the blocking
thread stacktrace: the current thread (objtool:49256) disabled interrupts
via raw spin lock operations inside mem cgroup, while doing write
syscall in a btrfs file system.

A simple search for the function name on Google shows that this is
a legit case for disabling the interrupts:

cgroup: Use irqsave in cgroup_rstat_flush_locked()
lore.kernel.org/linux-mm/20220301122143.1521823-2-bigeasy@xxxxxxxxxxxxx/

The output also prints other reasons for the latency root cause, such as:

- an IRQ that happened before the IRQ handler that caused delays
- The interference from NMI, IRQ, Softirq, and Threads

The details about how these factors affect the scheduling latency
can be found here:

https://bristot.me/demystifying-the-real-time-linux-latency/

Link: https://lkml.kernel.org/r/3d45f40e630317f51ac6d678e2d96d310e495729.1675179318.git.bristot@xxxxxxxxxx

Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Jonathan Corbet <corbet@xxxxxxx>
Signed-off-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Signed-off-by: Steven Rostedt (Google) <rostedt@xxxxxxxxxxx>
---
tools/tracing/rtla/src/timerlat_top.c | 46 +++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 334271935222..eea5b3357e27 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -10,10 +10,12 @@
#include <unistd.h>
#include <stdio.h>
#include <time.h>
+#include <errno.h>

#include "utils.h"
#include "osnoise.h"
#include "timerlat.h"
+#include "timerlat_aa.h"

struct timerlat_top_params {
char *cpus;
@@ -30,6 +32,8 @@ struct timerlat_top_params {
int quiet;
int set_sched;
int dma_latency;
+ int no_aa;
+ int dump_tasks;
struct sched_attr sched_param;
struct trace_events *events;
};
@@ -130,17 +134,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record,
struct tep_event *event, void *context)
{
struct trace_instance *trace = context;
+ struct timerlat_top_params *params;
unsigned long long latency, thread;
struct osnoise_tool *top;
int cpu = record->cpu;

top = container_of(trace, struct osnoise_tool, trace);
+ params = top->params;

tep_get_field_val(s, event, "context", record, &thread, 1);
tep_get_field_val(s, event, "timer_latency", record, &latency, 1);

timerlat_top_update(top, cpu, thread, latency);

+ if (!params->no_aa)
+ timerlat_aa_handler(s, record, event, context);
+
return 0;
}

@@ -281,11 +290,13 @@ static void timerlat_top_usage(char *usage)
" -c/--cpus cpus: run the tracer only on the given cpus",
" -d/--duration time[m|h|d]: duration of the session in seconds",
" -D/--debug: print debug info",
+ " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
" -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]",
" -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
" --filter <command>: enable a trace event filter to the previous -e event",
" --trigger <command>: enable a trace event trigger to the previous -e event",
" -n/--nano: display data in nanoseconds",
+ " --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
" -q/--quiet print only a summary at the end",
" --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
" -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
@@ -349,13 +360,15 @@ static struct timerlat_top_params
{"trigger", required_argument, 0, '0'},
{"filter", required_argument, 0, '1'},
{"dma-latency", required_argument, 0, '2'},
+ {"no-aa", no_argument, 0, '3'},
+ {"dump-tasks", no_argument, 0, '4'},
{0, 0, 0, 0}
};

/* getopt_long stores the option index here. */
int option_index = 0;

- c = getopt_long(argc, argv, "a:c:d:De:hi:np:P:qs:t::T:0:1:2:",
+ c = getopt_long(argc, argv, "a:c:d:De:hi:np:P:qs:t::T:0:1:2:34",
long_options, &option_index);

/* detect the end of the options. */
@@ -368,13 +381,13 @@ static struct timerlat_top_params

/* set thread stop to auto_thresh */
params->stop_total_us = auto_thresh;
+ params->stop_us = auto_thresh;

/* get stack trace */
params->print_stack = auto_thresh;

/* set trace */
params->trace_output = "timerlat_trace.txt";
-
break;
case 'c':
retval = parse_cpu_list(optarg, &params->monitored_cpus);
@@ -437,6 +450,7 @@ static struct timerlat_top_params
params->trace_output = &optarg[1];
else
params->trace_output = "timerlat_trace.txt";
+
break;
case '0': /* trigger */
if (params->events) {
@@ -467,6 +481,12 @@ static struct timerlat_top_params
exit(EXIT_FAILURE);
}
break;
+ case '3': /* no-aa */
+ params->no_aa = 1;
+ break;
+ case '4':
+ params->dump_tasks = 1;
+ break;
default:
timerlat_top_usage("Invalid option");
}
@@ -477,6 +497,12 @@ static struct timerlat_top_params
exit(EXIT_FAILURE);
}

+ /*
+ * Auto analysis only happens if stop tracing, thus:
+ */
+ if (!params->stop_us && !params->stop_total_us)
+ params->no_aa = 1;
+
return params;
}

@@ -547,6 +573,7 @@ static struct osnoise_tool
{
struct osnoise_tool *top;
int nr_cpus;
+ int retval;

nr_cpus = sysconf(_SC_NPROCESSORS_CONF);

@@ -563,6 +590,16 @@ static struct osnoise_tool
tep_register_event_handler(top->trace.tep, -1, "ftrace", "timerlat",
timerlat_top_handler, top);

+ /*
+ * If no auto analysis, we are ready.
+ */
+ if (params->no_aa)
+ return top;
+
+ retval = timerlat_aa_init(top, nr_cpus, params->dump_tasks);
+ if (retval)
+ goto out_err;
+
return top;

out_err:
@@ -688,6 +725,10 @@ int timerlat_top_main(int argc, char *argv[])

if (trace_is_off(&top->trace, &record->trace)) {
printf("rtla timerlat hit stop tracing\n");
+
+ if (!params->no_aa)
+ timerlat_auto_analysis(params->stop_us, params->stop_total_us);
+
if (params->trace_output) {
printf(" Saving trace to %s\n", params->trace_output);
save_trace_to_file(record->trace.inst, params->trace_output);
@@ -701,6 +742,7 @@ int timerlat_top_main(int argc, char *argv[])
params->events = NULL;
out_free:
timerlat_free_top(top->data);
+ timerlat_aa_destroy();
osnoise_destroy_tool(record);
osnoise_destroy_tool(top);
free(params);
--
2.39.1