[PATCH] perf record: Add snapshot mode support for perf's regular events

From: Yunlong Song
Date: Tue Nov 24 2015 - 08:56:00 EST


For aux area tracing, there is already snapshot mode support for
intel-pt and intel-bts events. Similarly, this patch adds a snapshot
mode for perf's regular events. A user space ring buffer is allocated to
hold the tracing data read from the kernel space ring buffer, and the
tracing data is only dumped to perf.data when perf receives a SIGUSR2
signal.

Similar to '-S' for aux trace snapshot mode, '-M' enables snapshot mode
for perf's regular events by specifying the size (in bytes) of the user
space ring buffer.

Example 1:

$ perf record -a -M 10000000
/*
* Let perf record run for some time before it finally ends, and do not
* send any SIGUSR2 signal to perf while it is running.
*/

$ perf report

Error:
The perf.data file has no samples!
# To display the perf.data header info, please use --header/--header-only options.

As shown above, without any SIGUSR2 signal, perf record will dump no samples
to perf.data in the snapshot mode.

Example 2:

$ perf record -a -M 10000000
/*
* Let perf record run for some time before it finally ends, and send
* SIGUSR2 to perf several times while it is running.
*/

# kill -SIGUSR2 `pidof perf`
...
# kill -SIGUSR2 `pidof perf`

$ perf report
<SNIP>
# Total Lost Samples: 0
#
# Samples: 942 of event 'cycles:pp'
# Event count (approx.): 175168972
#
# Overhead Command Shared Object Symbol
# ........ ............... ....................... .........................................
#
8.20% kworker/2:0 [kernel.kallsyms] [k] default_send_IPI_mask_allbutself_phys
6.33% swapper [kernel.kallsyms] [k] intel_idle
2.64% pidof [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
2.56% pidof [kernel.kallsyms] [k] unmap_region
2.26% pidof [kernel.kallsyms] [k] memcpy
2.26% pidof libc-2.19.so [.] _IO_vfscanf
2.03% pidof [kernel.kallsyms] [k] lookup_fast
1.72% pidof [kernel.kallsyms] [k] filp_close
1.62% pidof [kernel.kallsyms] [k] apparmor_file_open
1.56% pidof [kernel.kallsyms] [k] process_measurement
1.50% pidof [kernel.kallsyms] [k] find_vma
<SNIP>

As shown above, perf record will dump samples to perf.data every time
it receives a SIGUSR2 signal in the snapshot mode.

Signed-off-by: Yunlong Song <yunlong.song@xxxxxxxxxx>
---
tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 170 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 199fc31..75606a6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,16 @@
#include <sched.h>
#include <sys/mman.h>

+static volatile int memory_enabled;
+static volatile int memory_signalled;
+/* The maximum size of one perf_event is 65536*/
+#define MEMORY_SIZE_MIN 65537
+
+struct memory {
+ void *start;
+ u64 head, tail;
+ u64 size;
+};

struct record {
struct perf_tool tool;
@@ -51,16 +61,134 @@ struct record {
bool no_buildid;
bool no_buildid_cache;
unsigned long long samples;
+ struct memory memory;
};

-static int record__write(struct record *rec, void *bf, size_t size)
+static int buf_to_file(struct record *rec, void *buf,
+ size_t size, u64 head, u64 tail)
{
- if (perf_data_file__write(rec->session->file, bf, size) < 0) {
- pr_err("failed to write perf data, error: %m\n");
+ size_t written = 0;
+
+ if (head < tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, tail - head) < 0)
+ goto out;
+ written += tail - head;
+ } else if (head > tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, size - head) < 0)
+ goto out;
+ written += size - head;
+
+ if (perf_data_file__write(rec->session->file, buf, tail) < 0)
+ goto out;
+ written += tail;
+ }
+
+ rec->bytes_written += written;
+ return 0;
+out:
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+}
+
+static int memory_to_file(struct record *rec)
+{
+ if (buf_to_file(rec, rec->memory.start, rec->memory.size,
+ rec->memory.head, rec->memory.tail) < 0)
return -1;
+ rec->memory.head = rec->memory.tail;
+
+ return 0;
+}
+
+static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
+{
+ void *buf_start = buf;
+ size_t left = size, written, delta, skip;
+ union perf_event *event;
+ struct perf_event_header hdr;
+ struct record *rec = container_of(memory, struct record, memory);
+
+ while (left) {
+ skip = 0;
+ written = min(left, memory->size - memory->tail);
+ if (memory->head > memory->tail)
+ delta = memory->head - memory->tail;
+ else
+ delta = memory->size - memory->tail + memory->head;
+ if (delta <= written) {
+ do {
+ if ((memory->head + skip) <= (memory->size -
+ sizeof(struct perf_event_header)))
+ event = (union perf_event *)(memory->start +
+ memory->head + skip);
+ else {
+ size_t hdr_left;
+
+ hdr_left = sizeof(struct perf_event_header) -
+ memory->size + memory->head + skip;
+ memcpy(&hdr, memory->start + memory->head + skip,
+ sizeof(struct perf_event_header) - hdr_left);
+
+ if (hdr_left <= memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, hdr_left);
+ else if (!memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ else {
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, memory->tail);
+ hdr_left -= memory->tail;
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ }
+
+ event = (union perf_event *)&hdr;
+ if (rec->session->header.needs_swap)
+ perf_event_header__bswap(&event->header);
+ }
+
+ if (event->header.type != PERF_RECORD_SAMPLE) {
+ if (buf_to_file(rec, memory->start, memory->size,
+ memory->head + skip, (memory->head + skip +
+ event->header.size) % memory->size) < 0)
+ return -1;
+ }
+
+ skip += event->header.size;
+ } while (skip <= written - delta);
+ }
+
+ memcpy(memory->start + memory->tail, buf, written);
+
+ memory->head = (memory->head + skip) % memory->size;
+ memory->tail = (memory->tail + written) % memory->size;
+
+ left -= written;
+ buf += written;
+ }
+
+ BUG_ON((size_t)(buf - buf_start) != size);
+ return size;
+}
+
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}

- rec->bytes_written += size;
return 0;
}

@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;

+ memory_enabled = 1;
+
rec->samples++;

size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
out:
+ memory_enabled = 0;
return rc;
}

@@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
* Mark the round finished in case we wrote
* at least one event.
*/
- if (bytes_written != rec->bytes_written)
+ if (bytes_written != rec->bytes_written) {
+ memory_enabled = 1;
rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
+ memory_enabled = 0;
+ }

out:
return rc;
@@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
- if (rec->opts.auxtrace_snapshot_mode)
+ if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
signal(SIGUSR2, snapshot_sig_handler);
else
signal(SIGUSR2, SIG_IGN);
@@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}

+ if (memory_signalled) {
+ memory_signalled = 0;
+ if (memory_to_file(rec) < 0) {
+ err = -1;
+ goto out_child;
+ }
+ }
+
if (hits == rec->samples) {
if (done || draining)
break;
@@ -1009,6 +1151,12 @@ static struct record record = {
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
},
+ .memory = {
+ .start = NULL,
+ .head = 0,
+ .tail = 0,
+ .size = 0,
+ },
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1119,6 +1267,7 @@ struct option __record_options[] = {
OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
"options passed to clang when compiling BPF scriptlets"),
#endif
+ OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
OPT_END()
};

@@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
goto out_symbol_exit;
}

+ if (rec->memory.size) {
+ if (rec->memory.size < MEMORY_SIZE_MIN)
+ rec->memory.size = MEMORY_SIZE_MIN;
+ rec->memory.start = malloc(rec->memory.size);
+ }
+
err = __cmd_record(&record, argc, argv);
out_symbol_exit:
perf_evlist__delete(rec->evlist);
symbol__exit();
auxtrace_record__free(rec->itr);
+ if (rec->memory.size)
+ free(rec->memory.start);
return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
- if (!auxtrace_snapshot_enabled)
- return;
- auxtrace_snapshot_enabled = 0;
- auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
- auxtrace_record__snapshot_started = 1;
+ if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
+ auxtrace_snapshot_enabled = 0;
+ auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+ auxtrace_record__snapshot_started = 1;
+ }
+ if (record.memory.size && !memory_signalled)
+ memory_signalled = 1;
}
--
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/