Re: [PATCH v7 07/20] perf record: Introduce data transferred and compressed stats
From: Bayduraev, Alexey V
Date: Wed Jun 30 2021 - 10:48:48 EST
On 30.06.2021 15:19, Bayduraev, Alexey V wrote:
>
> Hi,
>
> On 30.06.2021 11:05, Riccardo Mancini wrote:
>> Hi,
>>
>> On Tue, 2021-06-22 at 11:42 +0300, Alexey Bayduraev wrote:
>>> Introduce bytes_transferred and bytes_compressed stats so they
>>> would capture statistics for the related data buffer transfers.
>>>
>>> Acked-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
>>> Signed-off-by: Alexey Bayduraev <alexey.v.bayduraev@xxxxxxxxxxxxxxx>
>>> ---
>>> tools/perf/builtin-record.c | 64 +++++++++++++++++++++++++++++--------
>>> tools/perf/util/mmap.h | 3 ++
>>> 2 files changed, 54 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>> index 38bb5afbb359..c9fd31211600 100644
>>> --- a/tools/perf/builtin-record.c
>>> +++ b/tools/perf/builtin-record.c
>>> @@ -198,6 +198,11 @@ static int record__write(struct record *rec, struct mmap
>>> *map __maybe_unused,
>>> return -1;
>>> }
>>>
>>> + if (map && map->file) {
>>> + map->bytes_written += size;
>>> + return 0;
>>> + }
>>> +
>>> rec->bytes_written += size;
>>>
>>> if (record__output_max_size_exceeded(rec) && !done) {
>>
>> This breaks the --max-size option in parallel trace mode.
>> Maybe, we could change record__output_max_size_exceeded to check all
>> thread_data->maps and sum up bytes_written. Concurrency should not be an issue
>> since it's not required to stop at exactly the max-size (this is also the
>> current behaviour).
>> Otherwise, we could atomically increase an accumulator inside struct record
>> (maybe rec->bytes_written could be reused, but I'm not sure if it's used by the
>> main thread in the parallel mode), and check it for exceeded max size.
>
> Thanks, currently threaded mode doesn't use rec->bytes_written and atomic64_t
> framework is not available in kernel/tools.
> I think better is to calculate sum of map->bytes_written like for threads
> statistics.
Just a little clarification: threaded mode uses rec->bytes_written for user
records (perf.data/data), whereas maps->bytes_written for per-cpu records
(perf.data/data.CPU).
Thus, the total amount of data transferred is their sum.
Regards,
Alexey
>
> Regards,
> Alexey
>
>>
>> Thanks,
>> Riccardo
>>
>>
>>> @@ -215,8 +220,8 @@ static int record__write(struct record *rec, struct mmap
>>> *map __maybe_unused,
>>>
>>> static int record__aio_enabled(struct record *rec);
>>> static int record__comp_enabled(struct record *rec);
>>> -static size_t zstd_compress(struct perf_session *session, void *dst, size_t
>>> dst_size,
>>> - void *src, size_t src_size);
>>> +static size_t zstd_compress(struct zstd_data *data,
>>> + void *dst, size_t dst_size, void *src, size_t
>>> src_size);
>>>
>>> #ifdef HAVE_AIO_SUPPORT
>>> static int record__aio_write(struct aiocb *cblock, int trace_fd,
>>> @@ -350,9 +355,13 @@ static int record__aio_pushfn(struct mmap *map, void *to,
>>> void *buf, size_t size
>>> */
>>>
>>> if (record__comp_enabled(aio->rec)) {
>>> - size = zstd_compress(aio->rec->session, aio->data + aio->size,
>>> - mmap__mmap_len(map) - aio->size,
>>> + struct zstd_data *zstd_data = &aio->rec->session->zstd_data;
>>> +
>>> + aio->rec->session->bytes_transferred += size;
>>> + size = zstd_compress(zstd_data,
>>> + aio->data + aio->size,
>>> mmap__mmap_len(map) - aio->size,
>>> buf, size);
>>> + aio->rec->session->bytes_compressed += size;
>>> } else {
>>> memcpy(aio->data + aio->size, buf, size);
>>> }
>>> @@ -577,8 +586,22 @@ static int record__pushfn(struct mmap *map, void *to,
>>> void *bf, size_t size)
>>> struct record *rec = to;
>>>
>>> if (record__comp_enabled(rec)) {
>>> - size = zstd_compress(rec->session, map->data,
>>> mmap__mmap_len(map), bf, size);
>>> + struct zstd_data *zstd_data = &rec->session->zstd_data;
>>> +
>>> + if (map->file) {
>>> + zstd_data = &map->zstd_data;
>>> + map->bytes_transferred += size;
>>> + } else {
>>> + rec->session->bytes_transferred += size;
>>> + }
>>> +
>>> + size = zstd_compress(zstd_data, map->data,
>>> mmap__mmap_len(map), bf, size);
>>> bf = map->data;
>>> +
>>> + if (map->file)
>>> + map->bytes_compressed += size;
>>> + else
>>> + rec->session->bytes_compressed += size;
>>> }
>>>
>>> thread->samples++;
>>> @@ -1311,18 +1334,15 @@ static size_t process_comp_header(void *record, size_t
>>> increment)
>>> return size;
>>> }
>>>
>>> -static size_t zstd_compress(struct perf_session *session, void *dst, size_t
>>> dst_size,
>>> +static size_t zstd_compress(struct zstd_data *zstd_data, void *dst, size_t
>>> dst_size,
>>> void *src, size_t src_size)
>>> {
>>> size_t compressed;
>>> size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct
>>> perf_record_compressed) - 1;
>>>
>>> - compressed = zstd_compress_stream_to_records(&session->zstd_data, dst,
>>> dst_size, src, src_size,
>>> + compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size,
>>> src, src_size,
>>> max_record_size,
>>> process_comp_header);
>>>
>>> - session->bytes_transferred += src_size;
>>> - session->bytes_compressed += compressed;
>>> -
>>> return compressed;
>>> }
>>>
>>> @@ -2006,8 +2026,10 @@ static int record__start_threads(struct record *rec)
>>>
>>> static int record__stop_threads(struct record *rec, unsigned long *waking)
>>> {
>>> - int t;
>>> + int t, tm;
>>> + struct mmap *map, *overwrite_map;
>>> struct thread_data *thread_data = rec->thread_data;
>>> + u64 bytes_written = 0, bytes_transferred = 0, bytes_compressed = 0;
>>>
>>> for (t = 1; t < rec->nr_threads; t++)
>>> record__terminate_thread(&thread_data[t]);
>>> @@ -2015,9 +2037,25 @@ static int record__stop_threads(struct record *rec,
>>> unsigned long *waking)
>>> for (t = 0; t < rec->nr_threads; t++) {
>>> rec->samples += thread_data[t].samples;
>>> *waking += thread_data[t].waking;
>>> - pr_debug("threads[%d]: samples=%lld, wakes=%ld,
>>> trasferred=%ld, compressed=%ld\n",
>>> + for (tm = 0; tm < thread_data[t].nr_mmaps; tm++) {
>>> + if (thread_data[t].maps) {
>>> + map = thread_data[t].maps[tm];
>>> + bytes_transferred += map->bytes_transferred;
>>> + bytes_compressed += map->bytes_compressed;
>>> + bytes_written += map->bytes_written;
>>> + }
>>> + if (thread_data[t].overwrite_maps) {
>>> + overwrite_map =
>>> thread_data[t].overwrite_maps[tm];
>>> + bytes_transferred += overwrite_map-
>>>> bytes_transferred;
>>> + bytes_compressed += overwrite_map-
>>>> bytes_compressed;
>>> + bytes_written += overwrite_map->bytes_written;
>>> + }
>>> + }
>>> + rec->session->bytes_transferred += bytes_transferred;
>>> + rec->session->bytes_compressed += bytes_compressed;
>>> + pr_debug("threads[%d]: samples=%lld, wakes=%ld,
>>> trasferred=%ld, compressed=%ld, written=%ld\n",
>>> thread_data[t].tid, thread_data[t].samples,
>>> thread_data[t].waking,
>>> - rec->session->bytes_transferred, rec->session-
>>>> bytes_compressed);
>>> + bytes_transferred, bytes_compressed, bytes_written);
>>> }
>>>
>>> return 0;
>>> diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
>>> index c4aed6e89549..c04ca4b5adf5 100644
>>> --- a/tools/perf/util/mmap.h
>>> +++ b/tools/perf/util/mmap.h
>>> @@ -46,6 +46,9 @@ struct mmap {
>>> int comp_level;
>>> struct perf_data_file *file;
>>> struct zstd_data zstd_data;
>>> + u64 bytes_transferred;
>>> + u64 bytes_compressed;
>>> + u64 bytes_written;
>>> };
>>>
>>> struct mmap_params {
>>
>>