[RFC patch 1/2] sched: dynamically adapt granularity with nr_running

From: Mathieu Desnoyers
Date: Sat Sep 11 2010 - 13:40:48 EST


Changing the minimum granularity is a double-edged sword: set it too small and
the scheduler preempts tasks too often; set it too large and the "latency"
period grows very large as the number of running tasks increases.

This patch keeps the standard scheduling granularity when few tasks are running
(3 or fewer), and dynamically shrinks the granularity as more tasks are added.
Past a ceiling of 8 running tasks (an arbitrary choice), it stretches the
latency period rather than shrinking the granularity further, to ensure we
don't end up calling the scheduler too often.
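
For example, with this tree's defaults (2 ms standard granularity, 0.75 ms
minimum granularity, and a 6 ms latency period, the latter implied by
sched_nr_latency = 3): at 6 running tasks the effective granularity shrinks to
6 ms / 6 = 1 ms while the period stays at 6 ms; at 12 tasks the granularity is
clamped at the 0.75 ms floor and the period stretches to 12 * 0.75 ms = 9 ms.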


Results from the wakeup-latency test (on a uniprocessor 2.0 GHz Pentium M):

* Without the patch:

- wakeup-latency with SIGEV_THREAD in parallel with a YouTube video and
make -j10

maximum latency: 50107.8 µs
average latency: 6609.2 µs
missed timer events: 0

- wakeup-latency with SIGEV_SIGNAL in parallel with a YouTube video and
make -j10

maximum latency: 8608.3 µs
average latency: 101.3 µs
missed timer events: 0


* With the patch:

- wakeup-latency with SIGEV_THREAD in parallel with a YouTube video and
make -j10

maximum latency: 26367.4 µs
average latency: 5382.6 µs
missed timer events: 0

- wakeup-latency with SIGEV_SIGNAL in parallel with a YouTube video and
make -j10

maximum latency: 3030.4 µs
average latency: 129.3 µs
missed timer events: 0

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
---
kernel/sched_debug.c | 1 +
kernel/sched_fair.c | 39 +++++++++++++++++++++++++++++----------
2 files changed, 30 insertions(+), 10 deletions(-)

Index: linux-2.6-lttng.git/kernel/sched_fair.c
===================================================================
--- linux-2.6-lttng.git.orig/kernel/sched_fair.c
+++ linux-2.6-lttng.git/kernel/sched_fair.c
@@ -51,16 +51,23 @@ enum sched_tunable_scaling sysctl_sched_
= SCHED_TUNABLESCALING_LOG;

/*
- * Minimal preemption granularity for CPU-bound tasks:
+ * Minimum preemption granularity (floor used when the number of tasks increases).
+ */
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+
+/*
+ * Standard preemption granularity for CPU-bound tasks:
* (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_std_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_std_granularity = 2000000ULL;

/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * is kept at sysctl_sched_latency / sysctl_sched_std_granularity
*/
static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency_max = 8;

/*
* After fork, child runs first. If set to 0 (default) then
@@ -439,24 +446,36 @@ calc_delta_fair(unsigned long delta, str
/*
* The idea is to set a period in which each task runs once.
*
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
+ * When there are more tasks than sched_nr_latency_max we stretch the period
+ * so that slices do not get smaller than sysctl_sched_min_granularity.
*
- * p = (nr <= nl) ? l : l*nr/nl
+ * p = (nr <= nl_max) ? l : min_gran*nr
*/
static u64 __sched_period(unsigned long nr_running)
{
+ unsigned long nr_latency_max = sched_nr_latency_max;
u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;

- if (unlikely(nr_running > nr_latency)) {
+ if (unlikely(nr_running > nr_latency_max)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
}
-
return period;
}

+static unsigned int __sched_gran(unsigned long nr_running)
+{
+ unsigned int gran = sysctl_sched_std_granularity;
+ unsigned long nr_latency = sched_nr_latency;
+
+ if (unlikely(nr_running > nr_latency)) {
+ gran = sysctl_sched_latency;
+ gran /= nr_running;
+ gran = max(gran, sysctl_sched_min_granularity);
+ }
+ return gran;
+}
+
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
@@ -862,7 +881,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq
if (!sched_feat(WAKEUP_PREEMPT))
return;

- if (delta_exec < sysctl_sched_min_granularity)
+ if (delta_exec < __sched_gran(cfs_rq->nr_running))
return;

if (cfs_rq->nr_running > 1) {
Index: linux-2.6-lttng.git/kernel/sched_debug.c
===================================================================
--- linux-2.6-lttng.git.orig/kernel/sched_debug.c
+++ linux-2.6-lttng.git/kernel/sched_debug.c
@@ -331,6 +331,7 @@ static int sched_debug_show(struct seq_f
P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_std_granularity);
PN(sysctl_sched_wakeup_granularity);
PN(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
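
For reference, here is a minimal user-space sketch of the two formulas above
(illustration only, not kernel code; it assumes sysctl_sched_latency = 6 ms,
as implied by sched_nr_latency = 3 with the 2 ms standard granularity):

#include <stdio.h>

#define SCHED_LATENCY	6000000ULL	/* ns, 6 ms (assumed default) */
#define STD_GRANULARITY	2000000ULL	/* ns, 2 ms */
#define MIN_GRANULARITY	750000ULL	/* ns, 0.75 ms */
#define NR_LATENCY	3UL
#define NR_LATENCY_MAX	8UL

/* Mirrors __sched_gran(): shrink the slice past NR_LATENCY, clamp at floor. */
static unsigned long long sched_gran(unsigned long nr_running)
{
	unsigned long long gran = STD_GRANULARITY;

	if (nr_running > NR_LATENCY) {
		gran = SCHED_LATENCY / nr_running;
		if (gran < MIN_GRANULARITY)
			gran = MIN_GRANULARITY;
	}
	return gran;
}

/* Mirrors __sched_period(): stretch the period past NR_LATENCY_MAX. */
static unsigned long long sched_period(unsigned long nr_running)
{
	unsigned long long period = SCHED_LATENCY;

	if (nr_running > NR_LATENCY_MAX)
		period = MIN_GRANULARITY * nr_running;
	return period;
}

int main(void)
{
	unsigned long nr;

	printf("nr_running  gran(ms)  period(ms)\n");
	for (nr = 1; nr <= 12; nr++)
		printf("%10lu  %8.3f  %10.3f\n",
		       nr, sched_gran(nr) / 1e6, sched_period(nr) / 1e6);
	return 0;
}

It prints a constant 2 ms granularity up to 3 tasks, slices shrinking down to
the 0.75 ms floor between 4 and 8 tasks, and a period stretching linearly
beyond 8 tasks while the granularity stays at the floor.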
