[PATCH] ring-buffer: More precise time stamps for nested writes

From: Suresh E. Warrier
Date: Tue Mar 24 2015 - 19:10:19 EST


When tracing the behavior of multiple fio jobs running in parallel,
our performance team observed that some scsi_dispatch_cmd_done events
appeared to occur earlier, often several microseconds earlier, than
their associated scsi_dispatch_cmd_start events in the trace records.
Other interrupt events were also observed to have earlier timestamps
than the events that caused them.

This incorrect chronological ordering of trace records occurs because
ALL nested writes get the same time stamp as the first writer, that
is, the writer at the bottom of the stack that was preempted. In
workloads where interrupts occur frequently, the first writer can stay
preempted across multiple interrupts, and nested trace events can
record time stamps that are many microseconds earlier than their
actual values, resulting in the wrong ordering.

For example, a scsi_dispatch_cmd_start on CPU A occurs, say at time
t1, and the corresponding scsi_dispatch_cmd_done occurs on another
CPU B, say at time t2. However, CPU B is in the middle of a nested
write with the first writer having recorded its event at time t0, where
t0 < t1 < t2. Eventually, we get out of all the nested writes and
the first writer commits its trace event with a time of t0, and the
nested scsi_dispatch_cmd_done gets assigned the same timestamp t0.
In the trace report, the scsi_dispatch_cmd_done thus appears to have
occurred before the scsi_dispatch_cmd_start.
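
A rough timeline of the example (cmd_start/cmd_done abbreviate the
scsi_dispatch_cmd_* events):

  CPU B:  first write @ t0 ---[irq: cmd_done occurs @ t2]--- commit @ t0
  CPU A:                 cmd_start @ t1

  real time:     t0 < t1 < t2
  trace stamps:  cmd_done = t0, cmd_start = t1
                 => cmd_done sorts before cmd_start in the report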

In some cases, on larger systems with multiple fio jobs running and
interrupts being sent to a single CPU, we have noticed more than
400 trace events all with the same timestamp.

The reason all nested writes have the same time stamp is that all
nested trace events are assigned a time delta of 0. A correct time
delta cannot be computed for them because the first interrupted write
has not been committed yet, so the commit timestamp in the CPU buffer
does not yet reflect the time stamp of the immediately preceding event.
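
Concretely, the zeroing happens in rb_update_event(); these are the
same lines the patch below makes conditional:

  /* Only a commit updates the timestamp */
  if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
      delta = 0;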

One way of fixing this is to keep a timestamp of the last event we
reserved space for in the ring buffer, so that we can compute the
correct time delta for each event. We also need a short critical
section during which we cannot be interrupted, in which we (see the
sketch below):
1. Read the current time
2. Compute the time delta for this event from the last time stamp
3. Update the value of the last time stamp
4. Reserve space in the ring buffer for the event.
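
In the patch this becomes the get_write_timestamp()/put_write_timestamp()
pair in rb_reserve_next_event(). Note that the implementation saves the
last time stamp (step 3) after reserving the space (step 4), still within
the irq-disabled window. Stripped of the error handling, the flow is
roughly this sketch:

  u64 ts, diff, delta;
  u64 *pstamp;
  unsigned long flags;

  /* disable interrupts and get the location of the last time stamp */
  pstamp = get_write_timestamp(cpu_buffer, &flags);

  ts = rb_time_stamp(cpu_buffer->buffer);  /* 1. read the current time */
  diff = ts - *pstamp;                     /* 2. delta from the last event */
  delta = diff;                            /* (when ts >= *pstamp) */

  event = __rb_reserve_next(cpu_buffer, length, ts,
                            delta, add_timestamp); /* 4. reserve space */

  /* 3. record ts as the last time stamp, then reenable interrupts */
  put_write_timestamp(pstamp, ts, &flags);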

Although the ring buffer design is coded to avoid disabling
interrupts, in this case there does not appear to be a practical way
to solve this problem without disabling interrupts for a short time.
To accommodate architectures where disabling interrupts is expensive,
we introduce a new kernel configuration option,
CONFIG_RING_BUFFER_NESTED_WRITE_TIMESTAMP_UPDATE. On architectures
where local_irq_save/restore is cheap, it should be set to Y to get
accurate timestamps. Other architectures can set it to N to avoid the
overhead of local_irq_save/restore, at the cost of inaccurate
timestamps for nested writes.
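
Enabling it is the usual config switch; for example, in a .config
(the option already defaults to y on PPC64 per the Kconfig hunk below):

  CONFIG_RING_BUFFER_NESTED_WRITE_TIMESTAMP_UPDATE=y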

Signed-off-by: Suresh Warrier <warrier@xxxxxxxxxxxxxxxxxx>
---
kernel/trace/Kconfig | 15 ++++++++++++
kernel/trace/ring_buffer.c | 59 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 015f85a..8a8e185 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -573,6 +573,21 @@ config RING_BUFFER_STARTUP_TEST

If unsure, say N

+config RING_BUFFER_NESTED_WRITE_TIMESTAMP_UPDATE
+ bool "Ring buffer update timestamps for nested writes"
+ depends on RING_BUFFER
+ default y if PPC64
+ help
+ This option allows nested writes to have updated time stamps
+ to correspond to the time that the event occurred instead of
+ having the same time stamp as the first write that was
+ interrupted.
+
+ The option requires a fast IRQ enable/disable mechanism
+ to work efficiently.
+
+ If unsure, say N
+
endif # FTRACE

endif # TRACING_SUPPORT
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3d9fee3..bb18d44 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -482,6 +482,7 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ u64 last_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
int nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -2008,9 +2009,11 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event, unsigned length,
int add_timestamp, u64 delta)
{
+#ifndef CONFIG_RING_BUFFER_NESTED_WRITE_TIMESTAMP_UPDATE
/* Only a commit updates the timestamp */
if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
delta = 0;
+#endif

/*
* If we need to add a timestamp, then we
@@ -2520,6 +2523,49 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
}
}

+#ifdef CONFIG_RING_BUFFER_NESTED_WRITE_TIMESTAMP_UPDATE
+/**
+ * get_write_timestamp:
+ * Must be called before we read the current time stamp
+ * It returns a pointer to the location of the last
+ * time stamp.
+ * put_write_timestamp:
+ * Must be called after we reserve space in the ring
+ * buffer. It saves the timestamp in the specified
+ * timestamp location (passed in as *pstamp).
+ */
+static u64 *get_write_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *flags)
+{
+ /*
+ * Ensure that we are not preempted until after we update
+ * the write timestamp.
+ */
+ local_irq_save(*flags);
+ return &cpu_buffer->last_stamp;
+}
+
+static void put_write_timestamp(u64 *pstamp, u64 ts, unsigned long *flags)
+{
+ /*
+ * Update time stamp to match timestamp of latest event on buffer
+ * before reenabling interrupts
+ */
+ *pstamp = ts;
+ local_irq_restore(*flags);
+}
+#else
+static u64 *get_write_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *flags)
+{
+ return &cpu_buffer->write_stamp;
+}
+
+static void put_write_timestamp(u64 *pstamp, u64 ts, unsigned long *flags)
+{
+}
+#endif
+
static struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
@@ -2530,6 +2576,8 @@ rb_reserve_next_event(struct ring_buffer *buffer,
int nr_loops = 0;
int add_timestamp;
u64 diff;
+ u64 *pstamp;
+ unsigned long flags;

rb_start_commit(cpu_buffer);

@@ -2565,14 +2613,16 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;

+ pstamp = get_write_timestamp(cpu_buffer, &flags);
+
ts = rb_time_stamp(cpu_buffer->buffer);
- diff = ts - cpu_buffer->write_stamp;
+ diff = ts - *pstamp;

/* make sure this diff is calculated here */
barrier();

/* Did the write stamp get updated already? */
- if (likely(ts >= cpu_buffer->write_stamp)) {
+ if (likely(ts >= *pstamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
int local_clock_stable = 1;
@@ -2583,7 +2633,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
(unsigned long long)delta,
(unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp,
+ (unsigned long long)*pstamp,
local_clock_stable ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
@@ -2594,6 +2644,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,

event = __rb_reserve_next(cpu_buffer, length, ts,
delta, add_timestamp);
+
+ put_write_timestamp(pstamp, ts, &flags);
+
if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;

--
1.8.3.4
