[RFC][PATCH 3/6] core changes in CFS

From: Srivatsa Vaddagiri
Date: Mon Jun 11 2007 - 11:45:32 EST

Next message: Xavier Bestel: "Re: A kexec approach to hibernation"
Previous message: Srivatsa Vaddagiri: "[RFC][PATCH 2/6] task's cpu information needs to be always correct"
In reply to: Balbir Singh: "Re: [ckrm-tech] [RFC][PATCH 2/6] task's cpu information needs tobe always correct"
Next in thread: Balbir Singh: "Re: [RFC][PATCH 3/6] core changes in CFS"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This patch introduces core changes in CFS work to operate on generic
schedulable entities. The task specific operations (like enqueue, dequeue,
task_tick etc) is then rewritten to work off this generic CFS "library".

Signed-off-by : Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx>

---
kernel/sched_debug.c | 2
kernel/sched_fair.c | 574 ++++++++++++++++++++++++++++++---------------------
2 files changed, 345 insertions(+), 231 deletions(-)

Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c 2007-06-09 15:07:16.000000000 +0530
+++ current/kernel/sched_fair.c 2007-06-09 15:07:33.000000000 +0530
@@ -42,19 +42,54 @@

extern struct sched_class fair_sched_class;

+/******************************************************************************/
+/* BEGIN : CFS operations on generic schedulable entities */
+/******************************************************************************/
+
+static inline struct rq *lrq_rq(struct lrq *lrq)
+{
+ return container_of(lrq, struct rq, lrq);
+}
+
+static inline struct sched_entity *lrq_curr(struct lrq *lrq)
+{
+ struct rq *rq = lrq_rq(lrq);
+ struct sched_entity *se = NULL;
+
+ if (rq->curr->sched_class == &fair_sched_class)
+ se = &rq->curr->se;
+
+ return se;
+}
+
+static long lrq_nr_running(struct lrq *lrq)
+{
+ struct rq *rq = lrq_rq(lrq);
+
+ return rq->nr_running;
+}
+
+#define entity_is_task(se) 1
+
+static inline struct task_struct *entity_to_task(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+
/**************************************************************/
/* Scheduling class tree data structure manipulation methods:
*/

/*
- * Enqueue a task into the rb-tree:
+ * Enqueue a entity into the rb-tree:
*/
-static inline void __enqueue_task_fair(struct rq *rq, struct task_struct *p)
+static inline void __enqueue_entity(struct lrq *lrq, struct sched_entity *p)
{
- struct rb_node **link = &rq->lrq.tasks_timeline.rb_node;
+ struct rb_node **link = &lrq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
- struct task_struct *entry;
- s64 key = p->se.fair_key;
+ struct sched_entity *entry;
+ s64 key = p->fair_key;
int leftmost = 1;

/*
@@ -62,12 +97,12 @@
*/
while (*link) {
parent = *link;
- entry = rb_entry(parent, struct task_struct, se.run_node);
+ entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
- if ((s64)(key - entry->se.fair_key) < 0) {
+ if ((s64)(key - entry->fair_key) < 0) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
@@ -80,31 +115,31 @@
* used):
*/
if (leftmost)
- rq->lrq.rb_leftmost = &p->se.run_node;
+ lrq->rb_leftmost = &p->run_node;

- rb_link_node(&p->se.run_node, parent, link);
- rb_insert_color(&p->se.run_node, &rq->lrq.tasks_timeline);
+ rb_link_node(&p->run_node, parent, link);
+ rb_insert_color(&p->run_node, &lrq->tasks_timeline);
}

-static inline void __dequeue_task_fair(struct rq *rq, struct task_struct *p)
+static inline void __dequeue_entity(struct lrq *lrq, struct sched_entity *p)
{
- if (rq->lrq.rb_leftmost == &p->se.run_node)
- rq->lrq.rb_leftmost = NULL;
- rb_erase(&p->se.run_node, &rq->lrq.tasks_timeline);
+ if (lrq->rb_leftmost == &p->run_node)
+ lrq->rb_leftmost = NULL;
+ rb_erase(&p->run_node, &lrq->tasks_timeline);
}

-static inline struct rb_node * first_fair(struct rq *rq)
+static inline struct rb_node * first_fair(struct lrq *lrq)
{
- if (rq->lrq.rb_leftmost)
- return rq->lrq.rb_leftmost;
+ if (lrq->rb_leftmost)
+ return lrq->rb_leftmost;
/* Cache the value returned by rb_first() */
- rq->lrq.rb_leftmost = rb_first(&rq->lrq.tasks_timeline);
- return rq->lrq.rb_leftmost;
+ lrq->rb_leftmost = rb_first(&lrq->tasks_timeline);
+ return lrq->rb_leftmost;
}

-static struct task_struct * __pick_next_task_fair(struct rq *rq)
+static struct sched_entity * __pick_next_entity(struct lrq *lrq)
{
- return rb_entry(first_fair(rq), struct task_struct, se.run_node);
+ return rb_entry(first_fair(lrq), struct sched_entity, run_node);
}

/**************************************************************/
@@ -116,21 +151,21 @@
* nice level, but only linearly, not exponentially:
*/
static u64
-niced_granularity(struct task_struct *curr, unsigned long granularity)
+niced_granularity(struct sched_entity *curr, unsigned long granularity)
{
/*
* Negative nice levels get the same granularity as nice-0:
*/
- if (curr->se.load_weight >= NICE_0_LOAD)
+ if (curr->load_weight >= NICE_0_LOAD)
return granularity;
/*
* Positive nice level tasks get linearly finer
* granularity:
*/
- return curr->se.load_weight * (s64)(granularity / NICE_0_LOAD);
+ return curr->load_weight * (s64)(granularity / NICE_0_LOAD);
}

-static void limit_wait_runtime(struct rq *rq, struct task_struct *p)
+static void limit_wait_runtime(struct lrq *lrq, struct sched_entity *p)
{
s64 limit = sysctl_sched_runtime_limit;

@@ -138,30 +173,31 @@
* Niced tasks have the same history dynamic range as
* non-niced tasks:
*/
- if (p->se.wait_runtime > limit) {
- p->se.wait_runtime = limit;
- p->se.wait_runtime_overruns++;
- rq->lrq.wait_runtime_overruns++;
- }
- if (p->se.wait_runtime < -limit) {
- p->se.wait_runtime = -limit;
- p->se.wait_runtime_underruns++;
- rq->lrq.wait_runtime_underruns++;
+ if (p->wait_runtime > limit) {
+ p->wait_runtime = limit;
+ p->wait_runtime_overruns++;
+ lrq->wait_runtime_overruns++;
+ }
+ if (p->wait_runtime < -limit) {
+ p->wait_runtime = -limit;
+ p->wait_runtime_underruns++;
+ lrq->wait_runtime_underruns++;
}
}

-static void __add_wait_runtime(struct rq *rq, struct task_struct *p, s64 delta)
+static void
+__add_wait_runtime(struct lrq *lrq, struct sched_entity *p, s64 delta)
{
- p->se.wait_runtime += delta;
- p->se.sum_wait_runtime += delta;
- limit_wait_runtime(rq, p);
+ p->wait_runtime += delta;
+ p->sum_wait_runtime += delta;
+ limit_wait_runtime(lrq, p);
}

-static void add_wait_runtime(struct rq *rq, struct task_struct *p, s64 delta)
+static void add_wait_runtime(struct lrq *lrq, struct sched_entity *p, s64 delta)
{
- rq->lrq.wait_runtime -= p->se.wait_runtime;
- __add_wait_runtime(rq, p, delta);
- rq->lrq.wait_runtime += p->se.wait_runtime;
+ lrq->wait_runtime -= p->wait_runtime;
+ __add_wait_runtime(lrq, p, delta);
+ lrq->wait_runtime += p->wait_runtime;
}

static s64 div64_s(s64 divident, unsigned long divisor)
@@ -183,49 +219,51 @@
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
-static inline void update_curr(struct rq *rq, u64 now)
+static inline void update_curr(struct lrq *lrq, u64 now)
{
- unsigned long load = rq->lrq.raw_weighted_load;
+ unsigned long load = lrq->raw_weighted_load;
u64 delta_exec, delta_fair, delta_mine;
- struct task_struct *curr = rq->curr;
+ struct sched_entity *curr = lrq_curr(lrq);
+ struct rq *rq = lrq_rq(lrq);
+ struct task_struct *curtask = rq->curr;

- if (curr->sched_class != &fair_sched_class || curr == rq->idle || !load)
+ if (!curr || curtask == rq->idle || !load)
return;
/*
* Get the amount of time the current task was running
* since the last time we changed raw_weighted_load:
*/
- delta_exec = now - curr->se.exec_start;
+ delta_exec = now - curr->exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- if (unlikely(delta_exec > curr->se.exec_max))
- curr->se.exec_max = delta_exec;
+ if (unlikely(delta_exec > curr->exec_max))
+ curr->exec_max = delta_exec;

- curr->se.sum_exec_runtime += delta_exec;
- curr->se.exec_start = now;
- rq->lrq.exec_clock += delta_exec;
+ curr->sum_exec_runtime += delta_exec;
+ curr->exec_start = now;
+ lrq->exec_clock += delta_exec;

delta_fair = delta_exec * NICE_0_LOAD;
delta_fair += load >> 1; /* rounding */
do_div(delta_fair, load);

/* Load-balancing accounting. */
- rq->lrq.delta_fair_clock += delta_fair;
- rq->lrq.delta_exec_clock += delta_exec;
+ lrq->delta_fair_clock += delta_fair;
+ lrq->delta_exec_clock += delta_exec;

/*
* Task already marked for preemption, do not burden
* it with the cost of not having left the CPU yet:
*/
if (unlikely(sysctl_sched_features & 1))
- if (unlikely(test_tsk_thread_flag(curr, TIF_NEED_RESCHED)))
+ if (unlikely(test_tsk_thread_flag(curtask, TIF_NEED_RESCHED)))
return;

- delta_mine = delta_exec * curr->se.load_weight;
+ delta_mine = delta_exec * curr->load_weight;
delta_mine += load >> 1; /* rounding */
do_div(delta_mine, load);

- rq->lrq.fair_clock += delta_fair;
+ lrq->fair_clock += delta_fair;
/*
* We executed delta_exec amount of time on the CPU,
* but we were only entitled to delta_mine amount of
@@ -233,21 +271,21 @@
* the two values are equal)
* [Note: delta_mine - delta_exec is negative]:
*/
- add_wait_runtime(rq, curr, delta_mine - delta_exec);
+ add_wait_runtime(lrq, curr, delta_mine - delta_exec);
}

static inline void
-update_stats_wait_start(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_wait_start(struct lrq *lrq, struct sched_entity *p, u64 now)
{
- p->se.wait_start_fair = rq->lrq.fair_clock;
- p->se.wait_start = now;
+ p->wait_start_fair = lrq->fair_clock;
+ p->wait_start = now;
}

/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_enqueue(struct lrq *lrq, struct sched_entity *p, u64 now)
{
s64 key;

@@ -255,86 +293,86 @@
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
- if (p != rq->curr)
- update_stats_wait_start(rq, p, now);
+ if (p != lrq_curr(lrq))
+ update_stats_wait_start(lrq, p, now);
/*
* Update the key:
*/
- key = rq->lrq.fair_clock;
+ key = lrq->fair_clock;

/*
* Optimize the common nice 0 case:
*/
- if (likely(p->se.load_weight == NICE_0_LOAD))
- key -= p->se.wait_runtime;
+ if (likely(p->load_weight == NICE_0_LOAD))
+ key -= p->wait_runtime;
else {
- if (p->se.wait_runtime < 0)
- key -= div64_s(p->se.wait_runtime * NICE_0_LOAD,
- p->se.load_weight);
+ if (p->wait_runtime < 0)
+ key -= div64_s(p->wait_runtime * NICE_0_LOAD,
+ p->load_weight);
else
- key -= div64_s(p->se.wait_runtime * p->se.load_weight,
+ key -= div64_s(p->wait_runtime * p->load_weight,
NICE_0_LOAD);
}

- p->se.fair_key = key;
+ p->fair_key = key;
}

/*
* Note: must be called with a freshly updated rq->fair_clock.
*/
static inline void
-update_stats_wait_end(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_wait_end(struct lrq *lrq, struct sched_entity *p, u64 now)
{
s64 delta_fair, delta_wait;

- delta_wait = now - p->se.wait_start;
- if (unlikely(delta_wait > p->se.wait_max))
- p->se.wait_max = delta_wait;
+ delta_wait = now - p->wait_start;
+ if (unlikely(delta_wait > p->wait_max))
+ p->wait_max = delta_wait;

- if (p->se.wait_start_fair) {
- delta_fair = rq->lrq.fair_clock - p->se.wait_start_fair;
+ if (p->wait_start_fair) {
+ delta_fair = lrq->fair_clock - p->wait_start_fair;

- if (unlikely(p->se.load_weight != NICE_0_LOAD))
- delta_fair = div64_s(delta_fair * p->se.load_weight,
+ if (unlikely(p->load_weight != NICE_0_LOAD))
+ delta_fair = div64_s(delta_fair * p->load_weight,
NICE_0_LOAD);
- add_wait_runtime(rq, p, delta_fair);
+ add_wait_runtime(lrq, p, delta_fair);
}

- p->se.wait_start_fair = 0;
- p->se.wait_start = 0;
+ p->wait_start_fair = 0;
+ p->wait_start = 0;
}

static inline void
-update_stats_dequeue(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_dequeue(struct lrq *lrq, struct sched_entity *p, u64 now)
{
- update_curr(rq, now);
+ update_curr(lrq, now);
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
- if (p != rq->curr)
- update_stats_wait_end(rq, p, now);
+ if (p != lrq_curr(lrq))
+ update_stats_wait_end(lrq, p, now);
}

/*
* We are picking a new current task - update its stats:
*/
static inline void
-update_stats_curr_start(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_curr_start(struct lrq *lrq, struct sched_entity *p, u64 now)
{
/*
* We are starting a new run period:
*/
- p->se.exec_start = now;
+ p->exec_start = now;
}

/*
* We are descheduling a task - update its stats:
*/
static inline void
-update_stats_curr_end(struct rq *rq, struct task_struct *p, u64 now)
+update_stats_curr_end(struct lrq *lrq, struct sched_entity *p, u64 now)
{
- p->se.exec_start = 0;
+ p->exec_start = 0;
}

/*
@@ -347,39 +385,41 @@
* manner we move the fair clock back by a proportional
* amount of the new wait_runtime this task adds to the pool.
*/
-static void distribute_fair_add(struct rq *rq, s64 delta)
+static void distribute_fair_add(struct lrq *lrq, s64 delta)
{
- struct task_struct *curr = rq->curr;
+ struct sched_entity *curr = lrq_curr(lrq);
s64 delta_fair = 0;

if (!(sysctl_sched_features & 2))
return;

- if (rq->nr_running) {
- delta_fair = div64_s(delta, rq->nr_running);
+ if (lrq_nr_running(lrq)) {
+ delta_fair = div64_s(delta, lrq_nr_running(lrq));
/*
* The currently running task's next wait_runtime value does
* not depend on the fair_clock, so fix it up explicitly:
*/
- if (curr->sched_class == &fair_sched_class)
- add_wait_runtime(rq, curr, -delta_fair);
+ if (curr)
+ add_wait_runtime(lrq, curr, -delta_fair);
}
- rq->lrq.fair_clock -= delta_fair;
+ lrq->fair_clock -= delta_fair;
}

/**************************************************************/
/* Scheduling class queueing methods:
*/

-static void enqueue_sleeper(struct rq *rq, struct task_struct *p)
+static void enqueue_sleeper(struct lrq *lrq, struct sched_entity *p)
{
- unsigned long load = rq->lrq.raw_weighted_load;
+ unsigned long load = lrq->raw_weighted_load;
s64 delta_fair, prev_runtime;
+ struct task_struct *tsk = entity_to_task(p);

- if (p->policy == SCHED_BATCH || !(sysctl_sched_features & 4))
+ if ((entity_is_task(p) && tsk->policy == SCHED_BATCH) ||
+ !(sysctl_sched_features & 4))
goto out;

- delta_fair = rq->lrq.fair_clock - p->se.sleep_start_fair;
+ delta_fair = lrq->fair_clock - p->sleep_start_fair;

/*
* Fix up delta_fair with the effect of us running
@@ -387,69 +427,206 @@
*/
if (!(sysctl_sched_features & 32))
delta_fair = div64_s(delta_fair * load,
- load + p->se.load_weight);
- delta_fair = div64_s(delta_fair * p->se.load_weight, NICE_0_LOAD);
+ load + p->load_weight);
+ delta_fair = div64_s(delta_fair * p->load_weight, NICE_0_LOAD);

- prev_runtime = p->se.wait_runtime;
- __add_wait_runtime(rq, p, delta_fair);
- delta_fair = p->se.wait_runtime - prev_runtime;
+ prev_runtime = p->wait_runtime;
+ __add_wait_runtime(lrq, p, delta_fair);
+ delta_fair = p->wait_runtime - prev_runtime;

/*
* We move the fair clock back by a load-proportional
* amount of the new wait_runtime this task adds to
* the 'pool':
*/
- distribute_fair_add(rq, delta_fair);
+ distribute_fair_add(lrq, delta_fair);

out:
- rq->lrq.wait_runtime += p->se.wait_runtime;
+ lrq->wait_runtime += p->wait_runtime;

- p->se.sleep_start_fair = 0;
+ p->sleep_start_fair = 0;
}

-/*
- * The enqueue_task method is called before nr_running is
- * increased. Here we update the fair scheduling stats and
- * then put the task into the rbtree:
- */
static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
+enqueue_entity(struct lrq *lrq, struct sched_entity *p, int wakeup, u64 now)
{
u64 delta = 0;

/*
* Update the fair clock.
*/
- update_curr(rq, now);
+ update_curr(lrq, now);

if (wakeup) {
- if (p->se.sleep_start) {
- delta = now - p->se.sleep_start;
+ if (p->sleep_start) {
+ delta = now - p->sleep_start;
if ((s64)delta < 0)
delta = 0;

- if (unlikely(delta > p->se.sleep_max))
- p->se.sleep_max = delta;
+ if (unlikely(delta > p->sleep_max))
+ p->sleep_max = delta;

- p->se.sleep_start = 0;
+ p->sleep_start = 0;
}
- if (p->se.block_start) {
- delta = now - p->se.block_start;
+ if (p->block_start) {
+ delta = now - p->block_start;
if ((s64)delta < 0)
delta = 0;

- if (unlikely(delta > p->se.block_max))
- p->se.block_max = delta;
+ if (unlikely(delta > p->block_max))
+ p->block_max = delta;

- p->se.block_start = 0;
+ p->block_start = 0;
}
- p->se.sum_sleep_runtime += delta;
+ p->sum_sleep_runtime += delta;

- if (p->se.sleep_start_fair)
- enqueue_sleeper(rq, p);
+ if (p->sleep_start_fair)
+ enqueue_sleeper(lrq, p);
}
- update_stats_enqueue(rq, p, now);
- __enqueue_task_fair(rq, p);
+ update_stats_enqueue(lrq, p, now);
+ __enqueue_entity(lrq, p);
+}
+
+static void
+dequeue_entity(struct lrq *lrq, struct sched_entity *p, int sleep, u64 now)
+{
+ update_stats_dequeue(lrq, p, now);
+ if (sleep) {
+ if (entity_is_task(p)) {
+ struct task_struct *tsk = entity_to_task(p);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ p->sleep_start = now;
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ p->block_start = now;
+ }
+ p->sleep_start_fair = lrq->fair_clock;
+ lrq->wait_runtime -= p->wait_runtime;
+ }
+ __dequeue_entity(lrq, p);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static inline void
+__check_preempt_curr_fair(struct lrq *lrq, struct sched_entity *p,
+ struct sched_entity *curr, unsigned long granularity)
+{
+ s64 __delta = curr->fair_key - p->fair_key;
+
+ /*
+ * Take scheduling granularity into account - do not
+ * preempt the current task unless the best task has
+ * a larger than sched_granularity fairness advantage:
+ */
+ if (__delta > niced_granularity(curr, granularity))
+ resched_task(lrq_rq(lrq)->curr);
+}
+
+static struct sched_entity * pick_next_entity(struct lrq *lrq, u64 now)
+{
+ struct sched_entity *p = __pick_next_entity(lrq);
+
+ /*
+ * Any task has to be enqueued before it get to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue. (note, here we rely on pick_next_task() having
+ * done a put_prev_task_fair() shortly before this, which
+ * updated rq->fair_clock - used by update_stats_wait_end())
+ */
+ update_stats_wait_end(lrq, p, now);
+ update_stats_curr_start(lrq, p, now);
+
+ return p;
+}
+
+static void put_prev_entity(struct lrq *lrq, struct sched_entity *prev, u64 now)
+{
+ /*
+ * If the task is still waiting for the CPU (it just got
+ * preempted), update its position within the tree and
+ * start the wait period:
+ */
+ if ((sysctl_sched_features & 16) && entity_is_task(prev)) {
+ struct task_struct *prevtask = entity_to_task(prev);
+
+ if (prev->on_rq &&
+ test_tsk_thread_flag(prevtask, TIF_NEED_RESCHED)) {
+
+ dequeue_entity(lrq, prev, 0, now);
+ prev->on_rq = 0;
+ enqueue_entity(lrq, prev, 0, now);
+ prev->on_rq = 1;
+ } else
+ update_curr(lrq, now);
+ } else {
+ update_curr(lrq, now);
+ }
+
+ update_stats_curr_end(lrq, prev, now);
+
+ if (prev->on_rq)
+ update_stats_wait_start(lrq, prev, now);
+}
+
+static void entity_tick(struct lrq *lrq, struct sched_entity *curr)
+{
+ struct sched_entity *next;
+ struct rq *rq = lrq_rq(lrq);
+ u64 now = __rq_clock(rq);
+
+ /*
+ * Dequeue and enqueue the task to update its
+ * position within the tree:
+ */
+ dequeue_entity(lrq, curr, 0, now);
+ curr->on_rq = 0;
+ enqueue_entity(lrq, curr, 0, now);
+ curr->on_rq = 1;
+
+ /*
+ * Reschedule if another task tops the current one.
+ */
+ next = __pick_next_entity(lrq);
+ if (next == curr)
+ return;
+
+ if (entity_is_task(curr)) {
+ struct task_struct *curtask = entity_to_task(curr),
+ *nexttask = entity_to_task(next);
+
+ if ((curtask == rq->idle) || (rt_prio(nexttask->prio) &&
+ (nexttask->prio < curtask->prio))) {
+ resched_task(curtask);
+ return;
+ }
+ }
+ __check_preempt_curr_fair(lrq, next, curr, sysctl_sched_granularity);
+}
+
+
+/******************************************************************************/
+/* BEGIN : CFS operations on tasks */
+/******************************************************************************/
+
+static inline struct lrq *task_lrq(struct task_struct *p)
+{
+ return &task_rq(p)->lrq;
+}
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
+{
+ struct lrq *lrq = task_lrq(p);
+ struct sched_entity *se = &p->se;
+
+ enqueue_entity(lrq, se, wakeup, now);
}

/*
@@ -460,16 +637,10 @@
static void
dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
- update_stats_dequeue(rq, p, now);
- if (sleep) {
- if (p->state & TASK_INTERRUPTIBLE)
- p->se.sleep_start = now;
- if (p->state & TASK_UNINTERRUPTIBLE)
- p->se.block_start = now;
- p->se.sleep_start_fair = rq->lrq.fair_clock;
- rq->lrq.wait_runtime -= p->se.wait_runtime;
- }
- __dequeue_task_fair(rq, p);
+ struct lrq *lrq = task_lrq(p);
+ struct sched_entity *se = &p->se;
+
+ dequeue_entity(lrq, se, sleep, now);
}

/*
@@ -482,16 +653,18 @@
{
struct task_struct *p_next;
u64 now;
+ struct lrq *lrq = task_lrq(p);
+ struct sched_entity *se = &p->se;

now = __rq_clock(rq);
/*
* Dequeue and enqueue the task to update its
* position within the tree:
*/
- dequeue_task_fair(rq, p, 0, now);
- p->se.on_rq = 0;
- enqueue_task_fair(rq, p, 0, now);
- p->se.on_rq = 1;
+ dequeue_entity(lrq, se, 0, now);
+ se->on_rq = 0;
+ enqueue_entity(lrq, se, 0, now);
+ se->on_rq = 1;

/*
* yield-to support: if we are on the same runqueue then
@@ -503,35 +676,19 @@

s64 delta = p->se.wait_runtime >> 1;

- __add_wait_runtime(rq, p_to, delta);
- __add_wait_runtime(rq, p, -delta);
+ __add_wait_runtime(lrq, &p_to->se, delta);
+ __add_wait_runtime(lrq, &p->se, -delta);
}

/*
* Reschedule if another task tops the current one.
*/
- p_next = __pick_next_task_fair(rq);
+ se = __pick_next_entity(lrq);
+ p_next = entity_to_task(se);
if (p_next != p)
resched_task(p);
}

-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static inline void
-__check_preempt_curr_fair(struct rq *rq, struct task_struct *p,
- struct task_struct *curr, unsigned long granularity)
-{
- s64 __delta = curr->se.fair_key - p->se.fair_key;
-
- /*
- * Take scheduling granularity into account - do not
- * preempt the current task unless the best task has
- * a larger than sched_granularity fairness advantage:
- */
- if (__delta > niced_granularity(curr, granularity))
- resched_task(curr);
-}

/*
* Preempt the current task with a newly woken task if needed:
@@ -539,12 +696,13 @@
static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
{
struct task_struct *curr = rq->curr;
+ struct lrq *lrq = task_lrq(curr);
unsigned long granularity;

if ((curr == rq->idle) || rt_prio(p->prio)) {
if (sysctl_sched_features & 8) {
if (rt_prio(p->prio))
- update_curr(rq, rq_clock(rq));
+ update_curr(lrq, rq_clock(rq));
}
resched_task(curr);
} else {
@@ -555,25 +713,18 @@
if (unlikely(p->policy == SCHED_BATCH))
granularity = sysctl_sched_batch_wakeup_granularity;

- __check_preempt_curr_fair(rq, p, curr, granularity);
+ __check_preempt_curr_fair(lrq, &p->se, &curr->se, granularity);
}
}

static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now)
{
- struct task_struct *p = __pick_next_task_fair(rq);
+ struct lrq *lrq = &rq->lrq;
+ struct sched_entity *se;

- /*
- * Any task has to be enqueued before it get to execute on
- * a CPU. So account for the time it spent waiting on the
- * runqueue. (note, here we rely on pick_next_task() having
- * done a put_prev_task_fair() shortly before this, which
- * updated rq->fair_clock - used by update_stats_wait_end())
- */
- update_stats_wait_end(rq, p, now);
- update_stats_curr_start(rq, p, now);
+ se = pick_next_entity(lrq, now);

- return p;
+ return entity_to_task(se);
}

/*
@@ -581,32 +732,13 @@
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
{
+ struct lrq *lrq = task_lrq(prev);
+ struct sched_entity *se = &prev->se;
+
if (prev == rq->idle)
return;

- /*
- * If the task is still waiting for the CPU (it just got
- * preempted), update its position within the tree and
- * start the wait period:
- */
- if (sysctl_sched_features & 16) {
- if (prev->se.on_rq &&
- test_tsk_thread_flag(prev, TIF_NEED_RESCHED)) {
-
- dequeue_task_fair(rq, prev, 0, now);
- prev->se.on_rq = 0;
- enqueue_task_fair(rq, prev, 0, now);
- prev->se.on_rq = 1;
- } else
- update_curr(rq, now);
- } else {
- update_curr(rq, now);
- }
-
- update_stats_curr_end(rq, prev, now);
-
- if (prev->se.on_rq)
- update_stats_wait_start(rq, prev, now);
+ put_prev_entity(lrq, se, now);
}

/**************************************************************/
@@ -636,7 +768,7 @@

static struct task_struct * load_balance_start_fair(struct rq *rq)
{
- return __load_balance_iterator(rq, first_fair(rq));
+ return __load_balance_iterator(rq, first_fair(&rq->lrq));
}

static struct task_struct * load_balance_next_fair(struct rq *rq)
@@ -649,31 +781,10 @@
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
- struct task_struct *next;
- u64 now = __rq_clock(rq);
-
- /*
- * Dequeue and enqueue the task to update its
- * position within the tree:
- */
- dequeue_task_fair(rq, curr, 0, now);
- curr->se.on_rq = 0;
- enqueue_task_fair(rq, curr, 0, now);
- curr->se.on_rq = 1;
-
- /*
- * Reschedule if another task tops the current one.
- */
- next = __pick_next_task_fair(rq);
- if (next == curr)
- return;
+ struct lrq *lrq = task_lrq(curr);
+ struct sched_entity *se = &curr->se;

- if ((curr == rq->idle) || (rt_prio(next->prio) &&
- (next->prio < curr->prio)))
- resched_task(curr);
- else
- __check_preempt_curr_fair(rq, next, curr,
- sysctl_sched_granularity);
+ entity_tick(lrq, se);
}

/*
@@ -685,14 +796,17 @@
*/
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
+ struct lrq *lrq = task_lrq(p);
+ struct sched_entity *se = &p->se;
+
sched_info_queued(p);
- update_stats_enqueue(rq, p, rq_clock(rq));
+ update_stats_enqueue(lrq, se, rq_clock(rq));
/*
* Child runs first: we let it run before the parent
* until it reschedules once. We set up the key so that
* it will preempt the parent:
*/
- p->se.fair_key = current->se.fair_key - niced_granularity(rq->curr,
+ p->se.fair_key = current->se.fair_key - niced_granularity(&rq->curr->se,
sysctl_sched_granularity) - 1;
/*
* The first wait is dominated by the child-runs-first logic,
@@ -706,7 +820,7 @@
*/
// p->se.wait_runtime = -(s64)(sysctl_sched_granularity / 2);

- __enqueue_task_fair(rq, p);
+ __enqueue_entity(lrq, se);
p->se.on_rq = 1;
inc_nr_running(p, rq);
}
Index: current/kernel/sched_debug.c
===================================================================
--- current.orig/kernel/sched_debug.c 2007-06-09 15:07:16.000000000 +0530
+++ current/kernel/sched_debug.c 2007-06-09 15:07:33.000000000 +0530
@@ -87,7 +87,7 @@
unsigned long flags;

spin_lock_irqsave(&rq->lock, flags);
- curr = first_fair(rq);
+ curr = first_fair(&rq->lrq);
while (curr) {
p = rb_entry(curr, struct task_struct, se.run_node);
wait_runtime_rq_sum += p->se.wait_runtime;
--
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Xavier Bestel: "Re: A kexec approach to hibernation"
Previous message: Srivatsa Vaddagiri: "[RFC][PATCH 2/6] task's cpu information needs to be always correct"
In reply to: Balbir Singh: "Re: [ckrm-tech] [RFC][PATCH 2/6] task's cpu information needs tobe always correct"
Next in thread: Balbir Singh: "Re: [RFC][PATCH 3/6] core changes in CFS"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]