[PATCH 5/5] sched/debug: decouple 'sched_stat_*' tracepoints from CONFIG_SCHEDSTATS

From: Josh Poimboeuf
Date: Fri Jun 17 2016 - 13:44:22 EST


The 'sched_stat_*' tracepoints have a dependency on schedstats being
enabled at both compile-time (CONFIG_SCHEDSTATS) and runtime
(sched_schedstats).

In addition to increasing code complexity, this dependency causes
confusion:

- When schedstats are disabled, the tracepoints don't fire at all.

- When schedstats are enabled at runtime, existing tasks will have stale
or missing statistics, which can cause the tracepoints either to not
fire or to fire with corrupt data.

Decouple them so that the tracepoints always fire correctly, independent
of schedstats. A side benefit is that latencytop and sleep profiling no
longer depend on schedstats either.
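
For illustration, a minimal standalone userspace sketch of the resulting
split (not kernel code): the timestamp needed by the tracepoint lives
directly in sched_entity and the delta is computed and traced
unconditionally, while the max/sum accounting stays behind
CONFIG_SCHEDSTATS and schedstat_enabled(). The clock parameter, the
simplified tracepoint, enqueue_sleeper() and the plain-macro config
switch are stand-ins; the field names follow the diff below.

  /* Userspace sketch of the pattern, not kernel code. */
  #include <stdio.h>
  #include <stdint.h>
  #include <stdbool.h>

  #define CONFIG_SCHEDSTATS 1            /* flip to 0 to mimic !CONFIG_SCHEDSTATS */

  struct sched_statistics {              /* schedstats-only accounting */
          uint64_t sleep_max;
          uint64_t sum_sleep_runtime;
  };

  struct sched_entity {
          uint64_t sleep_start;          /* always present after the patch */
  #if CONFIG_SCHEDSTATS
          struct sched_statistics statistics;
  #endif
  };

  static bool schedstat_enabled(void)    /* stand-in for the runtime static key */
  {
          return false;
  }

  static void trace_sched_stat_sleep(uint64_t delta)   /* stand-in tracepoint */
  {
          printf("sched_stat_sleep: delta=%llu\n", (unsigned long long)delta);
  }

  /* Tracepoint fires regardless of schedstats; accounting stays conditional. */
  static void enqueue_sleeper(struct sched_entity *se, uint64_t now)
  {
          if (!se->sleep_start)
                  return;

          uint64_t delta = now - se->sleep_start;

          se->sleep_start = 0;
          trace_sched_stat_sleep(delta);

  #if CONFIG_SCHEDSTATS
          if (schedstat_enabled()) {
                  if (delta > se->statistics.sleep_max)
                          se->statistics.sleep_max = delta;
                  se->statistics.sum_sleep_runtime += delta;
          }
  #endif
  }

  int main(void)
  {
          struct sched_entity se = { .sleep_start = 100 };

          enqueue_sleeper(&se, 350);     /* tracepoint fires even with stats off */
          return 0;
  }

Setting CONFIG_SCHEDSTATS to 0 or leaving schedstat_enabled() false only
drops the accounting; the tracepoint still fires.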

Code size:

!CONFIG_SCHEDSTATS defconfig:

    text     data      bss       dec     hex  filename
10209818  4368184  1105920  15683922  ef5152  vmlinux.before.nostats
10209818  4368312  1105920  15684050  ef51d2  vmlinux.after.nostats

CONFIG_SCHEDSTATS defconfig:

    text     data      bss       dec     hex  filename
10214210  4370680  1105920  15690810  ef6c3a  vmlinux.before.stats
10214261  4370104  1105920  15690285  ef6a2d  vmlinux.after.stats

Suggested-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
---
include/linux/sched.h | 11 +++----
kernel/latencytop.c | 2 --
kernel/profile.c | 5 ----
kernel/sched/core.c | 16 +++--------
kernel/sched/debug.c | 13 +++++----
kernel/sched/fair.c | 80 ++++++++++++---------------------------------------
lib/Kconfig.debug | 1 -
7 files changed, 34 insertions(+), 94 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e31758b..0ee8f04 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -940,10 +940,6 @@ static inline int sched_info_on(void)
#endif
}

-#ifdef CONFIG_SCHEDSTATS
-void force_schedstat_enabled(void);
-#endif
-
enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
@@ -1285,18 +1281,15 @@ struct sched_avg {

#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
- u64 wait_start;
u64 wait_max;
u64 wait_count;
u64 wait_sum;
u64 iowait_count;
u64 iowait_sum;

- u64 sleep_start;
u64 sleep_max;
s64 sum_sleep_runtime;

- u64 block_start;
u64 block_max;
u64 exec_max;
u64 slice_max;
@@ -1332,6 +1325,10 @@ struct sched_entity {

u64 nr_migrations;

+ u64 wait_start;
+ u64 sleep_start;
+ u64 block_start;
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b5c30d9..5ff6c54 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -296,8 +296,6 @@ int sysctl_latencytop(struct ctl_table *table, int write,
int err;

err = proc_dointvec(table, write, buffer, lenp, ppos);
- if (latencytop_enabled)
- force_schedstat_enabled();

return err;
}
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9..f033d79 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -58,8 +58,6 @@ int profile_setup(char *str)
int par;

if (!strncmp(str, sleepstr, strlen(sleepstr))) {
-#ifdef CONFIG_SCHEDSTATS
- force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
@@ -67,9 +65,6 @@ int profile_setup(char *str)
prof_shift = par;
pr_info("kernel sleep profiling enabled (shift: %ld)\n",
prof_shift);
-#else
- pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
-#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
if (str[strlen(schedstr)] == ',')
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0aee5dd..9befffb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2267,14 +2267,6 @@ static void set_schedstats(bool enabled)
static_branch_disable(&sched_schedstats);
}

-void force_schedstat_enabled(void)
-{
- if (!schedstat_enabled()) {
- pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
- static_branch_enable(&sched_schedstats);
- }
-}
-
static int __init setup_schedstats(char *str)
{
int ret = 0;
@@ -7589,10 +7581,10 @@ void normalize_rt_tasks(void)
if (p->flags & PF_KTHREAD)
continue;

- p->se.exec_start = 0;
- schedstat_set(p->se.statistics.wait_start, 0);
- schedstat_set(p->se.statistics.sleep_start, 0);
- schedstat_set(p->se.statistics.block_start, 0);
+ p->se.exec_start = 0;
+ p->se.wait_start = 0;
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;

if (!dl_task(p) && !rt_task(p)) {
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1393588..06be44a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -382,10 +382,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
+ PN(se->wait_start);
+ PN(se->sleep_start);
+ PN(se->block_start);
if (schedstat_enabled()) {
- PN_SCHEDSTAT(se->statistics.wait_start);
- PN_SCHEDSTAT(se->statistics.sleep_start);
- PN_SCHEDSTAT(se->statistics.block_start);
PN_SCHEDSTAT(se->statistics.sleep_max);
PN_SCHEDSTAT(se->statistics.block_max);
PN_SCHEDSTAT(se->statistics.exec_max);
@@ -887,13 +887,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)

P(se.nr_migrations);

+ PN(se.wait_start);
+ PN(se.sleep_start);
+ PN(se.block_start);
+
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;

PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- PN_SCHEDSTAT(se.statistics.wait_start);
- PN_SCHEDSTAT(se.statistics.sleep_start);
- PN_SCHEDSTAT(se.statistics.block_start);
PN_SCHEDSTAT(se.statistics.sleep_max);
PN_SCHEDSTAT(se.statistics.block_max);
PN_SCHEDSTAT(se.statistics.exec_max);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae70600..3c85949 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -792,19 +792,15 @@ static void update_curr_fair(struct rq *rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start, prev_wait_start;
-
- if (!schedstat_enabled())
- return;
+ u64 wait_start;

wait_start = rq_clock(rq_of(cfs_rq));
- prev_wait_start = schedstat_val(se->statistics.wait_start);

if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > prev_wait_start))
- wait_start -= prev_wait_start;
+ likely(wait_start > se->wait_start))
+ wait_start -= se->wait_start;

- schedstat_set(se->statistics.wait_start, wait_start);
+ se->wait_start = wait_start;
}

static inline void
@@ -813,10 +809,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
struct task_struct *p;
u64 delta;

- if (!schedstat_enabled())
- return;
-
- delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
+ delta = rq_clock(rq_of(cfs_rq)) - se->wait_start;

if (entity_is_task(se)) {
p = task_of(se);
@@ -826,44 +819,38 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- schedstat_set(se->statistics.wait_start, delta);
+ se->wait_start = delta;
return;
}
trace_sched_stat_wait(p, delta);
}

+ se->wait_start = 0;
schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
schedstat_inc(se->statistics.wait_count);
schedstat_add(se->statistics.wait_sum, delta);
- schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *tsk = NULL;
- u64 sleep_start, block_start;
-
- if (!schedstat_enabled())
- return;
-
- sleep_start = schedstat_val(se->statistics.sleep_start);
- block_start = schedstat_val(se->statistics.block_start);

if (entity_is_task(se))
tsk = task_of(se);

- if (sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+ if (se->sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->sleep_start;

if ((s64)delta < 0)
delta = 0;

- if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ if (schedstat_enabled() &&
+ unlikely(delta > schedstat_val(se->statistics.sleep_max)))
schedstat_set(se->statistics.sleep_max, delta);

- schedstat_set(se->statistics.sleep_start, 0);
+ se->sleep_start = 0;
schedstat_add(se->statistics.sum_sleep_runtime, delta);

if (tsk) {
@@ -871,16 +858,17 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
trace_sched_stat_sleep(tsk, delta);
}
}
- if (block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+ if (se->block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->block_start;

if ((s64)delta < 0)
delta = 0;

- if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ if (schedstat_enabled() &&
+ unlikely(delta > schedstat_val(se->statistics.block_max)))
schedstat_set(se->statistics.block_max, delta);

- schedstat_set(se->statistics.block_start, 0);
+ se->block_start = 0;
schedstat_add(se->statistics.sum_sleep_runtime, delta);

if (tsk) {
@@ -913,9 +901,6 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- if (!schedstat_enabled())
- return;
-
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
@@ -931,9 +916,6 @@ static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{

- if (!schedstat_enabled())
- return;
-
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
@@ -945,11 +927,9 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);

if (tsk->state & TASK_INTERRUPTIBLE)
- schedstat_set(se->statistics.sleep_start,
- rq_clock(rq_of(cfs_rq)));
+ se->sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- schedstat_set(se->statistics.block_start,
- rq_clock(rq_of(cfs_rq)));
+ se->block_start = rq_clock(rq_of(cfs_rq));
}
}

@@ -3211,27 +3191,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled())
- return;
-
- /* Force schedstat enabled if a dependent tracepoint is active */
- if (trace_sched_stat_wait_enabled() ||
- trace_sched_stat_sleep_enabled() ||
- trace_sched_stat_iowait_enabled() ||
- trace_sched_stat_blocked_enabled() ||
- trace_sched_stat_runtime_enabled()) {
- printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- "stat_blocked and stat_runtime require the "
- "kernel parameter schedstats=enabled or "
- "kernel.sched_schedstats=1\n");
- }
-#endif
-}
-
-
/*
* MIGRATION
*
@@ -3293,7 +3252,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);

- check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b9cfdbf..e8adfac 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1699,7 +1699,6 @@ config LATENCYTOP
select KALLSYMS
select KALLSYMS_ALL
select STACKTRACE
- select SCHEDSTATS
select SCHED_DEBUG
help
Enable this option if you want to use the LatencyTOP tool
--
2.4.11