Re: [PATCH] sched: Make schedstats a runtime tunable that is disabled by default
From: Mel Gorman
Date: Mon Jan 25 2016 - 15:45:30 EST
On Mon, Jan 25, 2016 at 08:11:42PM +0000, Mel Gorman wrote:
> The vast majority of the checks are in the schedstat_[inc|dec|set) helpers.
> update_stats_wait_end() is an oddity in that it's a large function
> that only exists in schedstat and does a number of calculations. Ideally
> update_stats_wait would be updated too. In the next revision, I'll create
> a schedstats_enabled() helper that is an alias of static_branch_unlikely()
> and returns 0 if stats are not configured in for use at the call-site
> of large functions like this. That has the added benefit of avoiding any
> function call overhead.
>
Not even build tested but this is the general direction I'm thinking. It
adds the schedstat_enabled() helper and disables more schedstat-only code.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 87d40a72f6a1..846956abfe85 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3523,6 +3523,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
sched_debug [KNL] Enables verbose scheduler debug messages.
+ schedstats= [KNL,X86] Enable or disable scheduled statistics.
+ Allowed values are enable and disable. This feature
+ incurs a small amount of overhead in the scheduler
+ but is useful for debugging and performance tuning.
+
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
xtime_lock contention on larger systems, and/or RCU lock
contention on all systems with CONFIG_MAXSMP set.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a93b414672a7..be7c3b720adf 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued.
==============================================================
+schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
sg-big-buff:
This file shows the size of the generic SCSI (sg) buffer.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4f080ab4f2cd 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
#endif /* _SCHED_SYSCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 63d3a24e081a..42ea7b28a47b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu);
stat:
- ttwu_stat(p, cpu, wake_flags);
+ if (schedstat_enabled())
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
+ if (schedstat_enabled())
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2281,6 +2284,61 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ set_schedstats(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_schedstats(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif
+#endif
+
/*
* fork()/clone()-time setup:
*/
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771ae6..79a3d91f554d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ if (schedstat_enabled()) {
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
+ }
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
@@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ if (schedstat_enabled()) {
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.statistics.wait_sum),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ }
#else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
@@ -313,17 +317,19 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P(yld_count);
+ if (schedstat_enabled()) {
+ P(yld_count);
- P(sched_count);
- P(sched_goidle);
+ P(sched_count);
+ P(sched_goidle);
#ifdef CONFIG_SMP
- P64(avg_idle);
- P64(max_idle_balance_cost);
+ P64(avg_idle);
+ P64(max_idle_balance_cost);
#endif
- P(ttwu_count);
- P(ttwu_local);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
#undef P
#undef P64
@@ -569,38 +575,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
- P(se.nr_migrations);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
-
- {
+ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
+ PN(se.statistics.sum_sleep_runtime);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
+ P(se.nr_migrations);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
@@ -610,7 +616,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_per_cpu = p->se.sum_exec_runtime;
if (p->se.nr_migrations) {
avg_per_cpu = div64_u64(avg_per_cpu,
- p->se.nr_migrations);
+ p->se.nr_migrations);
} else {
avg_per_cpu = -1LL;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1926606ece80..e6ae52fd203f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -755,7 +755,9 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ u64 delta;
+
+ delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
if (entity_is_task(se)) {
p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->statistics.wait_sum += delta;
se->statistics.wait_start = 0;
}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
@@ -801,8 +793,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
@@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
+
+ if (flags & DEQUEUE_SLEEP) {
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+ }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
}
+#endif
/*
* We are picking a new current task - update its stats:
@@ -3106,11 +3131,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
+ if (schedstat_enabled())
+ enqueue_sleeper(cfs_rq, se);
}
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
+ if (schedstat_enabled()) {
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ }
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3177,19 +3205,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
-#endif
- }
+ if (schedstat_enabled())
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3263,7 +3280,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ if (schedstat_enabled())
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3276,7 +3294,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
@@ -3359,9 +3377,11 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
+ if (schedstat_enabled())
+ check_spread(cfs_rq, prev);
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ if (schedstat_enabled())
+ update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f16374df7f..1d583870e1a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc7632de5..70b3b6a20fb0 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val) do { var = (val); } while (0)
+# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
+# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97715fd9e790..6fe70ccdf2ac 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
--
Mel Gorman
SUSE Labs