Re: [PATCH 6/8] sched: avg_vruntime

From: Fabio Checconi
Date: Sat Nov 01 2008 - 15:04:24 EST


Hi,

> From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> Date: Wed, Oct 29, 2008 04:48:34PM +0100
>
> On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:
...
> How about this..
>
> The fluid model, would for each task t_i, generate an execution time e_i
>
> de_i = w_i / w_sum * dt
>
> However, any real scheduler will be imperfect and have an error eps_i
>
> dE_i = de_i + eps_i,
>

...according to this equation...


> But due to only dt actual time having passed we can state that
>
> \Sum_i dE_i = dt, therefore \Sum_i eps_i = 0.
>
> This will be reflected in a virtual runtime skew of
>
> dv_i = eps_i / w_i
>

...and to this one, what you call ``virtual runtime skew'' is:

dv_i = (dE_i - de_i) / w_i.


> If we now wish to obtain the zero lag point, where all tasks would
> be in the fluid model, we get
>
> eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0
>
> IOW avg(v_i*w_i) = v_fluid
>

Looking at the code, it seems that you use the vruntime values of the
entities when you do the average, which are different from what you
previously called ``virtual runtime skew.'' I don't understand the
connection between the previous dv_i and the v_i there. Calling dVR_i
the vruntime increment for the i-th entity, dVR_i = dE_i / w_i, which
clearly differs from dv_i.

Moreover, v_fluid (considering all the flows backlogged from the beginning
and the set of active flows constant) is defined as:

v_fluid = 1 / w_sum * \sum w_i * VR_i,

so, unless w_sum == N, this differs from your expression for v_fluid.
Am I missing something there?


> 1/n \Sum_i v_i*w_i, [v_i -> v_i-x] ->
> 1/n \sum_i (v_i-x)*w_i =
> 1/n \Sum v_i*w_i - 1/n \Sum x*w_i =
> 1/n \Sum v_i*w_i - x/n \Sum w_i
>
> which in turn would yield a patch like below..
>
> I'll also try and quantify the error and effect of using min_vruntime as
> zero lag point as Ingo suggested.
>

min_vruntime, given its definition, is very likely to be near to the
maximum lag point...


> ---
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c 2008-10-29 16:43:16.000000000 +0100
> +++ linux-2.6/kernel/sched.c 2008-10-29 16:43:27.000000000 +0100
> @@ -384,6 +384,10 @@ struct cfs_rq {
> struct load_weight load;
> unsigned long nr_running;
>
> + long nr_queued;
> + long avg_load;
> + s64 avg_vruntime;
> +
> u64 exec_clock;
> u64 min_vruntime;
>
> Index: linux-2.6/kernel/sched_debug.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_debug.c 2008-10-29 16:43:04.000000000 +0100
> +++ linux-2.6/kernel/sched_debug.c 2008-10-29 16:43:37.000000000 +0100
> @@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
> SPLIT_NS(spread0));
> SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
> SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
> + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
> + SPLIT_NS(avg_vruntime(cfs_rq)));
> +
> #ifdef CONFIG_SCHEDSTATS
> #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c 2008-10-29 16:43:17.000000000 +0100
> +++ linux-2.6/kernel/sched_fair.c 2008-10-29 16:46:41.000000000 +0100
> @@ -271,6 +271,60 @@ static inline s64 entity_key(struct cfs_
> return se->vruntime - cfs_rq->min_vruntime;
> }
>
> +static void
> +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + s64 key = entity_key(cfs_rq, se);
> + cfs_rq->avg_load += se->load.weight;
> + cfs_rq->avg_vruntime += key * se->load.weight;
> + cfs_rq->nr_queued++;
> +}
> +
> +static void
> +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + s64 key = entity_key(cfs_rq, se);
> + cfs_rq->avg_load -= se->load.weight;
> + cfs_rq->avg_vruntime -= key * se->load.weight;
> + cfs_rq->nr_queued--;
> +}
> +
> +static inline
> +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
> +{
> + cfs_rq->avg_vruntime -= cfs_rq->nr_queued * cfs_rq->avg_load * delta;
> +}
> +
> +static u64 avg_vruntime(struct cfs_rq *cfs_rq)
> +{
> + s64 avg = cfs_rq->avg_vruntime;
> + long nr_queued = cfs_rq->nr_queued;
> +
> + if (cfs_rq->curr) {
> + nr_queued++;
> + avg += entity_key(cfs_rq, cfs_rq->curr) * cfs_rq->curr->load.weight;
> + }
> +
> + avg >>= NICE_0_SHIFT;
> +
> + if (nr_queued)
> + avg = div_s64(avg, nr_queued);
> +
> + return cfs_rq->min_vruntime + avg;
> +}
> +
> +static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
> +{
> + /*
> + * open coded max_vruntime() to allow updating avg_vruntime
> + */
> + s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
> + if (delta > 0) {
> + avg_vruntime_update(cfs_rq, delta);
> + cfs_rq->min_vruntime = vruntime;
> + }
> +}
> +
> static void update_min_vruntime(struct cfs_rq *cfs_rq)
> {
> u64 vruntime = cfs_rq->min_vruntime;
> @@ -289,7 +343,7 @@ static void update_min_vruntime(struct c
> vruntime = min_vruntime(vruntime, se->vruntime);
> }
>
> - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
> + __update_min_vruntime(cfs_rq, vruntime);
> }
>
> /*
> @@ -303,6 +357,8 @@ static void __enqueue_entity(struct cfs_
> s64 key = entity_key(cfs_rq, se);
> int leftmost = 1;
>
> + avg_vruntime_add(cfs_rq, se);
> +
> /*
> * Find the right place in the rbtree:
> */
> @@ -345,6 +401,7 @@ static void __dequeue_entity(struct cfs_
> cfs_rq->next = NULL;
>
> rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
> + avg_vruntime_sub(cfs_rq, se);
> }
>
> static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/