[PATCH 7/7] sched: fair: vruntime spread

From: Peter Zijlstra
Date: Mon Feb 18 2008 - 05:00:04 EST


By its very nature, CFS tries to converge vruntime between tasks. This makes
it very hard to interleave groups that have varying latency requirements;
they end up in a single 'lump'. Avoid this by introducing an artificial
vruntime offset:

A1  |--------------|
A2      |--------------|
A3          |--------------|

New              |--------------|

Because tasks have no stable (dense) enumeration within a group
and we'd want the tasks evenly spaced within the period in a regular
fashion, we use an ascending iterator (nr_iter).

This ensures that in a steady state, each task will have the same offset.

However, when a new task gets inserted we cannot re-adjust all offsets;
hence we approximate by inserting the new task at p(1-1/n).
This is why account_entity_enqueue() sets nr_iter to nr_running-1.
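
To make the arithmetic concrete, here is a minimal user-space sketch (not
part of the patch) that mimics sched_voffset() and account_entity_enqueue().
The 4ms min_granularity value and the p = nr_running * min_granularity
approximation are assumptions for illustration only:

#include <stdio.h>

#define MIN_GRANULARITY 4000000ULL	/* assumed: 4ms in ns */

static unsigned long nr_running, nr_iter;

/* mirrors sched_voffset(): hand out ascending offsets, wrap at nr_running */
static unsigned long long voffset(void)
{
	unsigned long long frac = MIN_GRANULARITY * nr_iter;

	nr_iter++;
	if (nr_iter >= nr_running)
		nr_iter = 0;

	return frac;
}

/* mirrors account_entity_enqueue(): the next voffset() is (n-1)*granularity */
static void account_enqueue(void)
{
	nr_iter = nr_running;
	nr_running++;
}

int main(void)
{
	int i;

	/* enqueue three tasks: each gets the next free slot in the period */
	for (i = 0; i < 3; i++) {
		account_enqueue();
		printf("task %d offset %llu ns\n", i, voffset());
	}

	/* steady state: re-queueing hands out the same 0/1/2 pattern,
	 * so each task keeps the same offset */
	for (i = 0; i < 3; i++)
		printf("requeue offset %llu ns\n", voffset());

	/* a 4th task is placed at (n-1)*granularity, i.e. roughly p(1-1/n) */
	account_enqueue();
	printf("new task offset %llu ns\n", voffset());

	return 0;
}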

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
include/linux/sched.h | 1
kernel/sched.c | 2 +
kernel/sched_fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 55 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -923,6 +923,7 @@ struct sched_entity {
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
+ u64 voffset;
u64 prev_sum_exec_runtime;

#ifdef CONFIG_SCHEDSTATS
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -220,9 +220,11 @@ static inline u64 min_vruntime(u64 min_v

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return se->vruntime - cfs_rq->min_vruntime;
+ return se->vruntime + se->voffset - cfs_rq->min_vruntime;
}

+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -240,6 +242,8 @@ static void __enqueue_entity(struct cfs_
if (se == cfs_rq->curr)
return;

+ se->voffset = sched_voffset(cfs_rq, se);
+
cfs_rq = &rq_of(cfs_rq)->cfs;

link = &cfs_rq->tasks_timeline.rb_node;
@@ -387,7 +391,7 @@ out:
*
* vs = s*rw/w = p
*/
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;

@@ -397,6 +401,46 @@ static u64 sched_vslice_add(struct cfs_r
return __sched_period(nr_running);
}

+/*
+ * By its very nature, CFS tries to converge vruntime between tasks. This
+ * makes it very hard to interleave groups that have varying latency
+ * requirements; they end up in a single 'lump'. Avoid this by introducing
+ * an artificial vruntime offset:
+ *
+ * A1  |--------------|
+ * A2      |--------------|
+ * A3          |--------------|
+ *
+ * New              |--------------|
+ *
+ * Because tasks have no stable (dense) enumeration within a group [*]
+ * and we'd want the tasks evenly spaced within the period in a regular
+ * fashion, we use an ascending iterator (nr_iter).
+ *
+ * This ensures that in a steady state, each task will have the same offset.
+ *
+ * However, when a new task gets inserted we cannot re-adjust all offsets;
+ * hence we approximate by inserting the new task at p(1-1/n).
+ * This is why account_entity_enqueue() sets nr_iter to nr_running-1.
+ *
+ * [*] it would be possible to arrange for one, but it seems unnecessarily
+ * complex, esp. as we still can't re-adjust all tasks on insert.
+ */
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * approximate p/n with min_granularity
+ */
+ u64 frac = sysctl_sched_min_granularity;
+
+ frac *= cfs_rq->nr_iter;
+ cfs_rq->nr_iter++;
+ if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+ cfs_rq->nr_iter = 0;
+
+ return frac;
+}
+
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
@@ -564,6 +608,7 @@ static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
+ cfs_rq->nr_iter = cfs_rq->nr_running;
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
@@ -574,6 +619,8 @@ account_entity_dequeue(struct cfs_rq *cf
{
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
+ if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+ cfs_rq->nr_iter = 0;
se->on_rq = 0;
list_del_init(&se->group_node);
}
@@ -656,7 +703,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice_add(cfs_rq, se);
+ vruntime += sched_vslice(cfs_rq, se);

if (!initial) {
/* sleeps upto a single latency don't count. */
@@ -678,6 +725,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
*/
update_curr(cfs_rq);

+ account_entity_enqueue(cfs_rq, se);
+
if (wakeup) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
@@ -686,7 +735,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
__enqueue_entity(cfs_rq, se);
- account_entity_enqueue(cfs_rq, se);
}

static void
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -425,6 +425,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long nr_running;

+ unsigned long nr_iter;
+
u64 exec_clock;
u64 min_vruntime;


--
