Re: [PATCH 2/5] perf sched stats: Add record and rawdump support

From: Sapkal, Swapnil
Date: Fri Sep 27 2024 - 07:05:28 EST


Hello Namhyung,

Thank you for reviewing.

On 9/26/2024 11:42 AM, Namhyung Kim wrote:
On Mon, Sep 16, 2024 at 04:47:19PM +0000, Ravi Bangoria wrote:
From: Swapnil Sapkal <swapnil.sapkal@xxxxxxx>

Define new, perf tool only, sample types and their layouts. Add logic
to parse /proc/schedstat, convert it to perf sample format and save
samples to perf.data file with `perf sched stats record` command. Also
add logic to read perf.data file, interpret schedstat samples and
print rawdump of samples with `perf script -D`.

Note that, /proc/schedstat file output is standardized with version
number. The patch supports v15 but older or newer version can be added
easily.

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@xxxxxxx>
Co-developed-by: Ravi Bangoria <ravi.bangoria@xxxxxxx>
Signed-off-by: Ravi Bangoria <ravi.bangoria@xxxxxxx>
---
tools/lib/perf/Documentation/libperf.txt | 2 +
tools/lib/perf/Makefile | 2 +-
tools/lib/perf/include/perf/event.h | 42 +++
.../lib/perf/include/perf/schedstat-cpu-v15.h | 13 +
.../perf/include/perf/schedstat-domain-v15.h | 40 +++
tools/perf/builtin-inject.c | 2 +
tools/perf/builtin-sched.c | 222 +++++++++++++++-
tools/perf/util/event.c | 98 +++++++
tools/perf/util/event.h | 2 +
tools/perf/util/session.c | 20 ++
tools/perf/util/synthetic-events.c | 249 ++++++++++++++++++
tools/perf/util/synthetic-events.h | 3 +
tools/perf/util/tool.c | 20 ++
tools/perf/util/tool.h | 4 +-
14 files changed, 716 insertions(+), 3 deletions(-)
create mode 100644 tools/lib/perf/include/perf/schedstat-cpu-v15.h
create mode 100644 tools/lib/perf/include/perf/schedstat-domain-v15.h

diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index fcfb9499ef9c..39c78682ad2e 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -211,6 +211,8 @@ SYNOPSIS
struct perf_record_time_conv;
struct perf_record_header_feature;
struct perf_record_compressed;
+ struct perf_record_schedstat_cpu;
+ struct perf_record_schedstat_domain;
--
DESCRIPTION
diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 3a9b2140aa04..ebbfea891a6a 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -187,7 +187,7 @@ install_lib: libs
$(call do_install_mkdir,$(libdir_SQ)); \
cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
-HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
+HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h schedstat-cpu-v15.h schedstat-domain-v15.h
INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 37bb7771d914..35be296d68d5 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -457,6 +457,44 @@ struct perf_record_compressed {
char data[];
};
+struct perf_record_schedstat_cpu_v15 {
+#define CPU_FIELD(_type, _name, _ver) _type _name;
+#include "schedstat-cpu-v15.h"
+#undef CPU_FIELD
+};
+
+struct perf_record_schedstat_cpu {
+ struct perf_event_header header;
+ __u16 version;
+ __u64 timestamp;
+ __u32 cpu;

Can you change the layout to minimize the paddings? Probably better to
add an explicit field for unused bits.

Sure, I will change.

+ union {
+ struct perf_record_schedstat_cpu_v15 v15;
+ };
+};
+
+struct perf_record_schedstat_domain_v15 {
+#define DOMAIN_FIELD(_type, _name, _ver) _type _name;
+#include "schedstat-domain-v15.h"
+#undef DOMAIN_FIELD
+};
+
+#define DOMAIN_NAME_LEN 16
+
+struct perf_record_schedstat_domain {
+ struct perf_event_header header;
+ __u16 version;
+ __u64 timestamp;
+ __u32 cpu;
+ __u16 domain;

Ditto.

Ack.

+ char name[DOMAIN_NAME_LEN];
+ union {
+ struct perf_record_schedstat_domain_v15 v15;
+ };
+ __u16 nr_cpus;
+ __u8 cpu_mask[];
+};
+
enum perf_user_event_type { /* above any possible kernel type */
PERF_RECORD_USER_TYPE_START = 64,
PERF_RECORD_HEADER_ATTR = 64,
@@ -478,6 +516,8 @@ enum perf_user_event_type { /* above any possible kernel type */
PERF_RECORD_HEADER_FEATURE = 80,
PERF_RECORD_COMPRESSED = 81,
PERF_RECORD_FINISHED_INIT = 82,
+ PERF_RECORD_SCHEDSTAT_CPU = 83,
+ PERF_RECORD_SCHEDSTAT_DOMAIN = 84,
PERF_RECORD_HEADER_MAX
};
@@ -518,6 +558,8 @@ union perf_event {
struct perf_record_time_conv time_conv;
struct perf_record_header_feature feat;
struct perf_record_compressed pack;
+ struct perf_record_schedstat_cpu schedstat_cpu;
+ struct perf_record_schedstat_domain schedstat_domain;
};
#endif /* __LIBPERF_EVENT_H */
diff --git a/tools/lib/perf/include/perf/schedstat-cpu-v15.h b/tools/lib/perf/include/perf/schedstat-cpu-v15.h
new file mode 100644
index 000000000000..8e4355ee3705
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-cpu-v15.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, v15)
+CPU_FIELD(__u32, array_exp, v15)
+CPU_FIELD(__u32, sched_count, v15)
+CPU_FIELD(__u32, sched_goidle, v15)
+CPU_FIELD(__u32, ttwu_count, v15)
+CPU_FIELD(__u32, ttwu_local, v15)
+CPU_FIELD(__u64, rq_cpu_time, v15)
+CPU_FIELD(__u64, run_delay, v15)
+CPU_FIELD(__u64, pcount, v15)
+#endif

Can we have a single schedstat.h containing both CPU fields and domain
fields?

Yes, I think it is possible to have a single schedstat.h for both CPU and domain fields. I will think more on this.


You might require users to define the macro always and get rid
of the ifdef condition here.
The later patches needed this ifdef's so I kept it. If we combine both
cpu and domain fields, we will need this.

Also is there any macro magic to handle the version number? I think you
can have the number only (15; without 'v') and compare with input if
needed..

I will think more on this, if it works out cleaner I will update in the next version.

Thanks,
Namhyung


diff --git a/tools/lib/perf/include/perf/schedstat-domain-v15.h b/tools/lib/perf/include/perf/schedstat-domain-v15.h
new file mode 100644
index 000000000000..422e713d617a
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-domain-v15.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef DOMAIN_FIELD
+DOMAIN_FIELD(__u32, idle_lb_count, v15)
+DOMAIN_FIELD(__u32, idle_lb_balanced, v15)
+DOMAIN_FIELD(__u32, idle_lb_failed, v15)
+DOMAIN_FIELD(__u32, idle_lb_imbalance, v15)
+DOMAIN_FIELD(__u32, idle_lb_gained, v15)
+DOMAIN_FIELD(__u32, idle_lb_hot_gained, v15)
+DOMAIN_FIELD(__u32, idle_lb_nobusyq, v15)
+DOMAIN_FIELD(__u32, idle_lb_nobusyg, v15)
+DOMAIN_FIELD(__u32, busy_lb_count, v15)
+DOMAIN_FIELD(__u32, busy_lb_balanced, v15)
+DOMAIN_FIELD(__u32, busy_lb_failed, v15)
+DOMAIN_FIELD(__u32, busy_lb_imbalance, v15)
+DOMAIN_FIELD(__u32, busy_lb_gained, v15)
+DOMAIN_FIELD(__u32, busy_lb_hot_gained, v15)
+DOMAIN_FIELD(__u32, busy_lb_nobusyq, v15)
+DOMAIN_FIELD(__u32, busy_lb_nobusyg, v15)
+DOMAIN_FIELD(__u32, newidle_lb_count, v15)
+DOMAIN_FIELD(__u32, newidle_lb_balanced, v15)
+DOMAIN_FIELD(__u32, newidle_lb_failed, v15)
+DOMAIN_FIELD(__u32, newidle_lb_imbalance, v15)
+DOMAIN_FIELD(__u32, newidle_lb_gained, v15)
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained, v15)
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq, v15)
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg, v15)
+DOMAIN_FIELD(__u32, alb_count, v15)
+DOMAIN_FIELD(__u32, alb_failed, v15)
+DOMAIN_FIELD(__u32, alb_pushed, v15)
+DOMAIN_FIELD(__u32, sbe_count, v15)
+DOMAIN_FIELD(__u32, sbe_balanced, v15)
+DOMAIN_FIELD(__u32, sbe_pushed, v15)
+DOMAIN_FIELD(__u32, sbf_count, v15)
+DOMAIN_FIELD(__u32, sbf_balanced, v15)
+DOMAIN_FIELD(__u32, sbf_pushed, v15)
+DOMAIN_FIELD(__u32, ttwu_wake_remote, v15)
+DOMAIN_FIELD(__u32, ttwu_move_affine, v15)
+DOMAIN_FIELD(__u32, ttwu_move_balance, v15)
+#endif /* DOMAIN_FIELD */
--
Thanks and Regards,
Swapnil