Re: [RFC PATCH 2/3] sched: add yield_to function
From: Peter Zijlstra
Date: Wed Dec 08 2010 - 15:00:55 EST
On Wed, 2010-12-08 at 12:55 -0500, Rik van Riel wrote:
>
>
> > Right, so another approach might be to simply swap the vruntime between
> > curr and p.
>
> Doesn't that run into the same scale issue you described
> above?
Not really, but its tricky on SMP because vruntime only has meaning
within a cfs_rq.
The below is quickly cobbled together from a few patches I had laying
about dealing with similar issues.
The avg_vruntime() stuff takes the weighted average of the vruntimes on
a cfs runqueue, this weighted average corresponds to the 0-lag point,
that is the point where an ideal scheduler would have all tasks.
Using the 0-lag point you can compute the lag of a task, the lag is a
measure of service for a task, negative lag means the task is owed
services, positive lag means its got too much service (at least, thats
the sign convention used here, I always forget what the standard is).
What we do is, when @p the target task, is owed less service than
current, we flip lags such that p will become more eligible.
The trouble with all this is that computing the weighted average is
terribly expensive (it increases cost of all scheduler hot paths).
The dyn_vruntime stuff mixed in there is an attempt at computing
something similar, although its not used and I haven't tested the
quality of the approximation in a while.
Anyway, complete untested and such..
---
include/linux/sched.h | 2 +
kernel/sched.c | 39 ++++++++++
kernel/sched_debug.c | 31 ++++-----
kernel/sched_fair.c | 179 ++++++++++++++++++++++++++++++++++++++---------
kernel/sched_features.h | 8 +--
5 files changed, 203 insertions(+), 56 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0fc8ee..538559e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1095,6 +1095,8 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
+
+ void (*yield_to) (struct task_struct *p);
};
struct load_weight {
diff --git a/kernel/sched.c b/kernel/sched.c
index 0debad9..fe0adb0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -313,6 +313,9 @@ struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
@@ -5062,6 +5065,42 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+void yield_to(struct task_struct *p)
+{
+ struct task_struct *curr = current;
+ struct rq *p_rq, *rq;
+ unsigned long flags;
+ int on_rq;
+
+ local_irq_save(flags);
+ rq = this_rq();
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ if (p_rq != task_rq(p)) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ update_rq_clock(rq);
+ update_rq_clock(p_rq);
+
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ dequeue_task(p_rq, p, 0);
+
+ ret = 0;
+ if (p->sched_class == curr->sched_class && curr->sched_class->yield_to)
+ curr->sched_class->yield_to(p);
+
+ if (on_rq)
+ enqueue_task(p_rq, p, 0);
+
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
static inline int should_resched(void)
{
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d..b5cc773 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -138,10 +138,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
+ struct sched_entity *last, *first;
unsigned long flags;
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
@@ -149,28 +148,26 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
- MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_unlock_irqrestore(&rq->lock, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c886717..8689bcd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -346,25 +346,90 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag: \Sum lag_i = 0
+ *
+ * lag_i = S_i - s_i = w_i * (V - v_i)
+ *
+ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0
+ *
+ * From which we solve V:
+ *
+ * \Sum v_i * w_i
+ * V = --------------
+ * \Sum w_i
+ *
+ * However, since v_i is u64, and the multiplcation could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v) + v
+ *
+ * \Sum ((v_i - v) + v) * w_i \Sum (v_i - v) * w_i
+ * V = -------------------------- = -------------------- + v
+ * \Sum w_i \Sum w_i
+ *
+ * min_vruntime = v
+ * avg_vruntime = \Sum (v_i - v) * w_i
+ * cfs_rq->load = \Sum w_i
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 vruntime = cfs_rq->min_vruntime;
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_vruntime += key * se->load.weight;
+ cfs_rq->avg_load += se->load.weight;
+}
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 key = entity_key(cfs_rq, se);
+ cfs_rq->avg_vruntime -= key * se->load.weight;
+ cfs_rq->avg_load -= se->load.weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+static u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = cfs_rq->curr;
+ s64 lag = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
- if (!cfs_rq->curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ if (se) {
+ lag += entity_key(cfs_rq, se) * se->load.weight;
+ load += se->load.weight;
}
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+ if (load)
+ lag = div_s64(lag, load);
+
+ return cfs_rq->min_vruntime + lag;
+}
+
+static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ cfs_rq->min_vruntime = vruntime;
+ }
}
/*
@@ -378,6 +443,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;
+ avg_vruntime_add(cfs_rq, se);
+
/*
* Find the right place in the rbtree:
*/
@@ -417,9 +484,10 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ avg_vruntime_sub(cfs_rq, se);
}
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -542,6 +610,33 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_min_vruntime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 vruntime;
+
+ if (left && curr)
+ vruntime = min_vruntime(left->vruntime, curr->vruntime);
+ else if (left)
+ vruntime = left->vruntime;
+ else if (curr)
+ vruntime = curr->vruntime;
+ else
+ return;
+
+ if (sched_feat(DYN_MIN_VRUNTIME)) {
+ u64 new_vruntime = cfs_rq->min_vruntime;
+ if (delta_exec) {
+ new_vruntime += calc_delta_mine(delta_exec,
+ NICE_0_LOAD, &cfs_rq->load);
+ }
+ vruntime = max_vruntime(new_vruntime, vruntime);
+ }
+
+ __update_min_vruntime(cfs_rq, vruntime);
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -560,7 +655,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
+ update_min_vruntime(cfs_rq, delta_exec);
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
cfs_rq->load_unacc_exec_time += delta_exec;
@@ -895,16 +990,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
- u64 vruntime = cfs_rq->min_vruntime;
-
- /*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
- */
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
/* sleeps up to a single latency don't count. */
if (!initial) {
@@ -934,7 +1020,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* through callig update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
- se->vruntime += cfs_rq->min_vruntime;
+ se->vruntime += avg_vruntime(cfs_rq);
/*
* Update run-time statistics of the 'current'.
@@ -1003,7 +1089,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
+ update_min_vruntime(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
/*
@@ -1012,7 +1098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
+ se->vruntime -= avg_vruntime(cfs_rq);
}
/*
@@ -1090,7 +1176,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
@@ -1326,7 +1412,7 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- se->vruntime -= cfs_rq->min_vruntime;
+ se->vruntime -= avg_vruntime(cfs_rq);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4025,7 +4111,7 @@ static void task_fork_fair(struct task_struct *p)
resched_task(rq->curr);
}
- se->vruntime -= cfs_rq->min_vruntime;
+ se->vruntime -= avg_vruntime(cfs_rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -4096,10 +4182,10 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* fair sleeper stuff for the first placement, but who cares.
*/
if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ p->se.vruntime -= avg_vruntime(cfs_rq_of(&p->se));
set_task_rq(p, task_cpu(p));
if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ p->se.vruntime += avg_vruntime(cfs_rq_of(&p->se));
}
#endif
@@ -4118,6 +4204,31 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
return rr_interval;
}
+static void yield_to_fair(struct task_stuct *p)
+{
+ struct sched_entity *se = ¤t->se;
+ struct sched_entity *p_se = &p->se;
+ u64 lag0, p_lag0;
+ s64 lag, p_lag;
+
+ lag0 = avg_vruntime(cfs_rq_of(se));
+ p_lag0 = avg_vruntime(cfs_rq_of(p_se));
+
+ lag = se->vruntime - avg_vruntime(cfs_rq);
+ p_lag = p_se->vruntime - avg_vruntime(p_cfs_rq);
+
+ if (p_lag > lag) { /* if P is owed less service */
+ se->vruntime = lag0 + p_lag;
+ p_se->vruntime = p_lag + lag;
+ }
+
+ /*
+ * XXX try something smarter here
+ */
+ resched_task(p);
+ resched_task(current);
+}
+
/*
* All the scheduling class methods:
*/
@@ -4150,6 +4261,8 @@ static const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .yield_to = yield_to_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69ac..feda9b8 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -6,12 +6,6 @@
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
- */
-SCHED_FEAT(START_DEBIT, 1)
-
-/*
* Should wakeups try to preempt running tasks.
*/
SCHED_FEAT(WAKEUP_PREEMPT, 1)
@@ -53,6 +47,8 @@ SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
+SCHED_FEAT(DYN_MIN_VRUNTIME, 0)
+
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
* another cpu -- assumes that when the owner is running, it will soon
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/