Re: [PATCH 0/5] sched/debug: decouple sched_stat tracepoints from CONFIG_SCHEDSTATS

From: Peter Zijlstra
Date: Tue Jun 28 2016 - 08:43:46 EST


On Fri, Jun 17, 2016 at 12:43:22PM -0500, Josh Poimboeuf wrote:
> NOTE: I didn't include any performance numbers because I wasn't able to
> get consistent results. I tried the following on a Xeon E5-2420 v2 CPU:
>
> $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo -n performance > $i; done
> $ echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
> $ echo 100 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
> $ echo 0 > /proc/sys/kernel/nmi_watchdog
> $ taskset 0x10 perf stat -n -r10 perf bench sched pipe -l 1000000
>
> I was going to post the numbers from that, both with and without
> SCHEDSTATS, but then when I tried to repeat the test on a different day,
> the results were surprisingly different, with different conclusions.
>
> So any advice on measuring scheduler performance would be appreciated...

Yeah, it's a bit of a pain in general...

A) perf stat --null --repeat 50 -- perf bench sched messaging -g 50 -l 5000 | grep "seconds time elapsed"
B) perf stat --null --repeat 50 -- taskset 1 perf bench sched pipe | grep "seconds time elapsed"

1) tip/master + 1-4
2) tip/master + 1-5
3) tip/master + 1-5 + below

         1             2             3

A)  4.627767855   4.650429917   4.646208062
    4.633921933   4.641424424   4.612021058
    4.649536375   4.663144144   4.636815948
    4.630165619   4.649053552   4.613022902

B)  1.770732957   1.789534273   1.773334291
    1.761740716   1.795618428   1.773338681
    1.763761666   1.822316496   1.774385589


From this it looks like patch 5 does hurt a wee bit, but we can get most
of that back by reordering the structure a bit. The results seem
'stable' across rebuilds and reboots (I popped all the patches, rebuilt,
rebooted, and re-benchmarked 1 at the end, and obtained similar
results).

Although it's possible that if we reorder first and then apply patch 5,
we'll just see a bigger regression. I've not bothered to check.


---
include/linux/sched.h | 33 +++++++++++++++------------------
kernel/sched/core.c | 4 ++--
kernel/sched/debug.c | 6 +++---
3 files changed, 20 insertions(+), 23 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,7 @@ struct uts_namespace;
struct load_weight {
unsigned long weight;
u32 inv_weight;
-};
+} __packed;

/*
* The load_avg/util_avg accumulates an infinite geometric series
@@ -1315,44 +1315,40 @@ struct sched_statistics {

struct sched_entity {
struct load_weight load; /* for load-balancing */
+ unsigned int on_rq;
struct rb_node run_node;
struct list_head group_node;
- unsigned int on_rq;

- u64 exec_start;
+ u64 exec_start ____cacheline_aligned_in_smp;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
-
- u64 nr_migrations;
-
u64 wait_start;
u64 sleep_start;
u64 block_start;

+#ifdef CONFIG_SMP
+ /*
+ * Per entity load average tracking.
+ */
+ struct sched_avg avg ____cacheline_aligned_in_smp;
+#endif
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
- int depth;
+ /*
+ * mostly constant values, separate from modifications above
+ */
+ int depth ____cacheline_aligned_in_smp;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
-
-#ifdef CONFIG_SMP
- /*
- * Per entity load average tracking.
- *
- * Put into separate cache line so it does not
- * collide with read-mostly values above.
- */
- struct sched_avg avg ____cacheline_aligned_in_smp;
-#endif
-};
+} ____cacheline_aligned_in_smp;

struct sched_rt_entity {
struct list_head run_list;
@@ -1475,6 +1471,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
+ u64 nr_migrations;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1239,7 +1239,7 @@ void set_task_cpu(struct task_struct *p,
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
- p->se.nr_migrations++;
+ p->nr_migrations++;
perf_event_task_migrate(p);
}

@@ -2167,7 +2167,7 @@ static void __sched_fork(unsigned long c
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
- p->se.nr_migrations = 0;
+ p->nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,7 +885,7 @@ void proc_sched_show_task(struct task_st

nr_switches = p->nvcsw + p->nivcsw;

- P(se.nr_migrations);
+ P(nr_migrations);

PN(se.wait_start);
PN(se.sleep_start);
@@ -926,9 +926,9 @@ void proc_sched_show_task(struct task_st
avg_atom = -1LL;

avg_per_cpu = p->se.sum_exec_runtime;
- if (p->se.nr_migrations) {
+ if (p->nr_migrations) {
avg_per_cpu = div64_u64(avg_per_cpu,
- p->se.nr_migrations);
+ p->nr_migrations);
} else {
avg_per_cpu = -1LL;
}