[RFC, PATCH 1/5] CPU controller - base changes

From: Srivatsa Vaddagiri
Date: Fri Aug 04 2006 - 01:03:04 EST



This patch splits the single runqueue on each CPU into several runqueues:
one (main) runqueue holds task-groups, and each task-group in the system
gets its own runqueue on that CPU.
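
To make the idea concrete, the two-level pick can be modelled with the rough,
self-contained userspace sketch below (not kernel code: struct task_grp_rq and
the two-step decision mirror the patch, while main_rq, nr_queued and
first_prio() are purely illustrative stand-ins, and the linear scans stand in
for the O(1) bitmap search that sched_find_first_bit() does in the real code):

/* Illustrative sketch only -- not kernel code. Models the two-step pick:
 * step 1 chooses the highest-priority task-group from the main runqueue,
 * step 2 chooses the highest-priority task from that group's runqueue.
 */
#include <stdio.h>

#define MAX_PRIO 140

struct task_grp_rq {
	int nr_queued[MAX_PRIO];	/* runnable tasks at each priority */
	int prio;			/* group's priority on this CPU */
};

struct main_rq {
	struct task_grp_rq *grp[MAX_PRIO];	/* groups indexed by priority */
};

/* stand-in for sched_find_first_bit(); the real code is O(1) */
static int first_prio(const int *nr_queued)
{
	int i;

	for (i = 0; i < MAX_PRIO; i++)
		if (nr_queued[i])
			return i;
	return MAX_PRIO;
}

int main(void)
{
	static struct task_grp_rq tg;
	static struct main_rq rq;
	struct task_grp_rq *next_grp = NULL;
	int i, task_prio;

	tg.nr_queued[110] = 1;		/* one runnable task at priority 110 */
	tg.prio = 110;			/* group inherits its best task's prio */
	rq.grp[tg.prio] = &tg;

	/* step 1: pick the highest-priority group from the main runqueue */
	for (i = 0; i < MAX_PRIO && !next_grp; i++)
		if (rq.grp[i])
			next_grp = rq.grp[i];

	/* step 2: pick the highest-priority task within that group */
	task_prio = first_prio(next_grp->nr_queued);

	printf("picked group at prio %d, task at prio %d\n",
	       next_grp->prio, task_prio);
	return 0;
}

In the patch itself both levels use per-priority queues plus a bitmap, so each
of the two picks stays O(1) no matter how many groups or tasks are runnable.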

Signed-off-by: Srivatsa Vaddagiri <vatsa@xxxxxxxxxx>


init/Kconfig | 8 +
kernel/sched.c | 251 +++++++++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 228 insertions(+), 31 deletions(-)

diff -puN kernel/sched.c~cpu_ctlr_base_changes kernel/sched.c
--- linux-2.6.18-rc3/kernel/sched.c~cpu_ctlr_base_changes 2006-08-04 07:56:46.000000000 +0530
+++ linux-2.6.18-rc3-root/kernel/sched.c 2006-08-04 07:56:54.000000000 +0530
@@ -191,10 +191,48 @@ static inline unsigned int task_timeslic

struct prio_array {
unsigned int nr_active;
+ int best_static_prio, best_dyn_prio;
DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_PRIO];
};

+/*
+ * Each task belongs to some task-group. Task-groups and the tasks they contain
+ * are defined by the administrator using some suitable interface.
+ * The administrator can also set the CPU bandwidth given to each task-group.
+ *
+ * Task-groups are given a certain priority to run on every CPU. Currently a
+ * task-group's priority on a CPU is defined to be the same as that of the
+ * highest-priority runnable task it has on that CPU. Task-groups also
+ * get their own runqueue on every CPU. The main runqueue on each CPU is
+ * used to hold task-groups, rather than tasks.
+ *
+ * The scheduling decision on a CPU is now two-step: first pick the highest-
+ * priority task-group from the main runqueue, then pick the highest-priority
+ * task from that group's runqueue. Both decisions are of O(1) complexity.
+ */
+
+/* runqueue used for every task-group */
+struct task_grp_rq {
+ struct prio_array arrays[2];
+ struct prio_array *active, *expired, *rq_array;
+ unsigned long expired_timestamp;
+ unsigned int ticks;
+ int prio; /* Priority of the task-group */
+ struct list_head list;
+};
+
+static DEFINE_PER_CPU(struct task_grp_rq, init_tg_rq);
+
+/* task-group object - maintains information about each task-group */
+struct task_grp {
+ int ticks; /* bandwidth given to the task-group */
+ struct task_grp_rq *rq[NR_CPUS]; /* runqueue pointer for every cpu */
+};
+
+/* The "default" task-group */
+struct task_grp init_task_grp;
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -224,12 +262,11 @@ struct rq {
*/
unsigned long nr_uninterruptible;

- unsigned long expired_timestamp;
unsigned long long timestamp_last_tick;
struct task_struct *curr, *idle;
struct mm_struct *prev_mm;
+ /* these arrays hold task-groups */
struct prio_array *active, *expired, arrays[2];
- int best_expired_prio;
atomic_t nr_iowait;

#ifdef CONFIG_SMP
@@ -244,6 +281,7 @@ struct rq {
#endif

#ifdef CONFIG_SCHEDSTATS
+ /* xxx: move these to task-group runqueue */
/* latency stats */
struct sched_info rq_sched_info;

@@ -657,6 +695,63 @@ sched_info_switch(struct task_struct *pr
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

+static unsigned int task_grp_timeslice(struct task_grp *tg)
+{
+ /* xxx: take into account sleep_avg etc of the group */
+ return tg->ticks;
+}
+
+/* Dequeue task-group object from the main runqueue */
+static void dequeue_task_grp(struct task_grp_rq *tgrq)
+{
+ struct prio_array *array = tgrq->rq_array;
+
+ array->nr_active--;
+ list_del(&tgrq->list);
+ if (list_empty(array->queue + tgrq->prio))
+ __clear_bit(tgrq->prio, array->bitmap);
+ tgrq->rq_array = NULL;
+}
+
+/* Enqueue task-group object on the main runqueue */
+static void enqueue_task_grp(struct task_grp_rq *tgrq, struct prio_array *array,
+ int head)
+{
+ if (!head)
+ list_add_tail(&tgrq->list, array->queue + tgrq->prio);
+ else
+ list_add(&tgrq->list, array->queue + tgrq->prio);
+ __set_bit(tgrq->prio, array->bitmap);
+ array->nr_active++;
+ tgrq->rq_array = array;
+}
+
+/* return the task-group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+ return &init_task_grp;
+}
+
+static inline void update_task_grp_prio(struct task_grp_rq *tgrq, struct rq *rq,
+ int head)
+{
+ int new_prio;
+ struct prio_array *array = tgrq->rq_array;
+
+ new_prio = tgrq->active->best_dyn_prio;
+ if (new_prio == MAX_PRIO)
+ new_prio = tgrq->expired->best_dyn_prio;
+
+ if (array)
+ dequeue_task_grp(tgrq);
+ tgrq->prio = new_prio;
+ if (new_prio != MAX_PRIO) {
+ if (!array)
+ array = rq->active;
+ enqueue_task_grp(tgrq, array, head);
+ }
+}
+
/*
* Adding/removing a task to/from a priority array:
*/
@@ -664,8 +759,17 @@ static void dequeue_task(struct task_str
{
array->nr_active--;
list_del(&p->run_list);
- if (list_empty(array->queue + p->prio))
+ if (list_empty(array->queue + p->prio)) {
__clear_bit(p->prio, array->bitmap);
+ if (p->prio == array->best_dyn_prio) {
+ struct task_grp_rq *tgrq = task_grp(p)->rq[task_cpu(p)];
+
+ array->best_dyn_prio =
+ sched_find_first_bit(array->bitmap);
+ if (array == tgrq->active || !tgrq->active->nr_active)
+ update_task_grp_prio(tgrq, task_rq(p), 0);
+ }
+ }
}

static void enqueue_task(struct task_struct *p, struct prio_array *array)
@@ -675,6 +779,14 @@ static void enqueue_task(struct task_str
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+
+ if (p->prio < array->best_dyn_prio) {
+ struct task_grp_rq *tgrq = task_grp(p)->rq[task_cpu(p)];
+
+ array->best_dyn_prio = p->prio;
+ if (array == tgrq->active || !tgrq->active->nr_active)
+ update_task_grp_prio(tgrq, task_rq(p), 0);
+ }
}

/*
@@ -693,6 +805,14 @@ enqueue_task_head(struct task_struct *p,
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+
+ if (p->prio < array->best_dyn_prio) {
+ struct task_grp_rq *tgrq = task_grp(p)->rq[task_cpu(p)];
+
+ array->best_dyn_prio = p->prio;
+ if (array == tgrq->active || !tgrq->active->nr_active)
+ update_task_grp_prio(tgrq, task_rq(p), 1);
+ }
}

/*
@@ -831,10 +951,11 @@ static int effective_prio(struct task_st
*/
static void __activate_task(struct task_struct *p, struct rq *rq)
{
- struct prio_array *target = rq->active;
+ struct task_grp_rq *tgrq = task_grp(p)->rq[task_cpu(p)];
+ struct prio_array *target = tgrq->active;

if (batch_task(p))
- target = rq->expired;
+ target = tgrq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
}
@@ -844,7 +965,10 @@ static void __activate_task(struct task_
*/
static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
{
- enqueue_task_head(p, rq->active);
+ struct task_grp_rq *tgrq = task_grp(p)->rq[task_cpu(p)];
+ struct prio_array *target = tgrq->active;
+
+ enqueue_task_head(p, target);
inc_nr_running(p, rq);
}

@@ -2077,7 +2201,7 @@ int can_migrate_task(struct task_struct
return 1;
}

-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->expired->best_static_prio)

/*
* move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
@@ -2884,6 +3008,8 @@ unsigned long long current_sched_time(co
return ns;
}

+#define nr_tasks(tgrq) (tgrq->active->nr_active + tgrq->expired->nr_active)
+
/*
* We place interactive tasks back into the active array, if possible.
*
@@ -2894,13 +3020,13 @@ unsigned long long current_sched_time(co
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
-static inline int expired_starving(struct rq *rq)
+static inline int expired_starving(struct task_grp_rq *rq)
{
- if (rq->curr->static_prio > rq->best_expired_prio)
+ if (current->static_prio > rq->expired->best_static_prio)
return 1;
if (!STARVATION_LIMIT || !rq->expired_timestamp)
return 0;
- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+ if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * nr_tasks(rq))
return 1;
return 0;
}
@@ -2991,6 +3117,7 @@ void scheduler_tick(void)
struct task_struct *p = current;
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
+ struct task_grp_rq *tgrq = task_grp(p)->rq[cpu];

update_cpu_clock(p, rq, now);

@@ -3004,7 +3131,7 @@ void scheduler_tick(void)
}

/* Task might have expired already, but not scheduled off yet */
- if (p->array != rq->active) {
+ if (p->array != tgrq->active) {
set_tsk_need_resched(p);
goto out;
}
@@ -3027,25 +3154,26 @@ void scheduler_tick(void)
set_tsk_need_resched(p);

/* put it at the end of the queue: */
- requeue_task(p, rq->active);
+ requeue_task(p, tgrq->active);
}
goto out_unlock;
}
if (!--p->time_slice) {
- dequeue_task(p, rq->active);
+ dequeue_task(p, tgrq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;

- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
- enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
+ if (!tgrq->expired_timestamp)
+ tgrq->expired_timestamp = jiffies;
+ if (!TASK_INTERACTIVE(p) || expired_starving(tgrq)) {
+ enqueue_task(p, tgrq->expired);
+ if (p->static_prio < tgrq->expired->best_static_prio)
+ tgrq->expired->best_static_prio =
+ p->static_prio;
} else
- enqueue_task(p, rq->active);
+ enqueue_task(p, tgrq->active);
} else {
/*
* Prevent a too long timeslice allowing a task to monopolize
@@ -3066,12 +3194,21 @@ void scheduler_tick(void)
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
+ (p->array == tgrq->active)) {

- requeue_task(p, rq->active);
+ requeue_task(p, tgrq->active);
set_tsk_need_resched(p);
}
}
+
+ if (!--tgrq->ticks) {
+ /* Move the task group to expired list */
+ dequeue_task_grp(tgrq);
+ tgrq->ticks = task_grp_timeslice(task_grp(p));
+ enqueue_task_grp(tgrq, rq->expired, 0);
+ set_tsk_need_resched(p);
+ }
+
out_unlock:
spin_unlock(&rq->lock);
out:
@@ -3264,6 +3401,7 @@ asmlinkage void __sched schedule(void)
int cpu, idx, new_prio;
long *switch_count;
struct rq *rq;
+ struct task_grp_rq *next_grp;

/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -3332,23 +3470,41 @@ need_resched_nonpreemptible:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
- rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu);
goto switch_tasks;
}
}

+ /* Pick a task group first */
+#ifdef CONFIG_CPUMETER
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
- schedstat_inc(rq, sched_switch);
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
+ }
+ idx = sched_find_first_bit(array->bitmap);
+ queue = array->queue + idx;
+ next_grp = list_entry(queue->next, struct task_grp_rq, list);
+#else
+ next_grp = init_task_grp.rq[cpu];
+#endif
+
+ /* Pick a task within that group next */
+ array = next_grp->active;
+ if (!array->nr_active) {
+ /*
+ * Switch the active and expired arrays.
+ */
+ schedstat_inc(next_grp, sched_switch);
+ next_grp->active = next_grp->expired;
+ next_grp->expired = array;
+ array = next_grp->active;
+ next_grp->expired_timestamp = 0;
+ next_grp->expired->best_static_prio = MAX_PRIO;
}

idx = sched_find_first_bit(array->bitmap);
@@ -4413,7 +4569,8 @@ asmlinkage long sys_sched_getaffinity(pi
asmlinkage long sys_sched_yield(void)
{
struct rq *rq = this_rq_lock();
- struct prio_array *array = current->array, *target = rq->expired;
+ struct task_grp_rq *tgrq = task_grp(current)->rq[task_cpu(current)];
+ struct prio_array *array = current->array, *target = tgrq->expired;

schedstat_inc(rq, yld_cnt);
/*
@@ -4424,13 +4581,13 @@ asmlinkage long sys_sched_yield(void)
* array.)
*/
if (rt_task(current))
- target = rq->active;
+ target = tgrq->active;

if (array->nr_active == 1) {
schedstat_inc(rq, yld_act_empty);
- if (!rq->expired->nr_active)
+ if (!tgrq->expired->nr_active)
schedstat_inc(rq, yld_both_empty);
- } else if (!rq->expired->nr_active)
+ } else if (!tgrq->expired->nr_active)
schedstat_inc(rq, yld_exp_empty);

if (array != target) {
@@ -6722,21 +6879,53 @@ int in_sched_functions(unsigned long add
&& addr < (unsigned long)__sched_text_end);
}

+static void task_grp_rq_init(struct task_grp_rq *tgrq, int ticks)
+{
+ int j, k;
+
+ tgrq->active = tgrq->arrays;
+ tgrq->expired = tgrq->arrays + 1;
+ tgrq->rq_array = NULL;
+ tgrq->expired->best_static_prio = MAX_PRIO;
+ tgrq->expired->best_dyn_prio = MAX_PRIO;
+ tgrq->active->best_static_prio = MAX_PRIO;
+ tgrq->active->best_dyn_prio = MAX_PRIO;
+ tgrq->prio = MAX_PRIO;
+ tgrq->ticks = ticks;
+ INIT_LIST_HEAD(&tgrq->list);
+
+ for (j = 0; j < 2; j++) {
+ struct prio_array *array;
+
+ array = tgrq->arrays + j;
+ for (k = 0; k < MAX_PRIO; k++) {
+ INIT_LIST_HEAD(array->queue + k);
+ __clear_bit(k, array->bitmap);
+ }
+ // delimiter for bitsearch
+ __set_bit(MAX_PRIO, array->bitmap);
+ }
+}
+
void __init sched_init(void)
{
int i, j, k;

+ init_task_grp.ticks = -1; /* Unlimited bandwidth */
+
for_each_possible_cpu(i) {
struct prio_array *array;
struct rq *rq;
+ struct task_grp_rq *tgrq;

rq = cpu_rq(i);
+ tgrq = init_task_grp.rq[i] = &per_cpu(init_tg_rq, i);
spin_lock_init(&rq->lock);
+ task_grp_rq_init(tgrq, init_task_grp.ticks);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
- rq->best_expired_prio = MAX_PRIO;

#ifdef CONFIG_SMP
rq->sd = NULL;
diff -puN init/Kconfig~cpu_ctlr_base_changes init/Kconfig
--- linux-2.6.18-rc3/init/Kconfig~cpu_ctlr_base_changes 2006-08-04 07:56:46.000000000 +0530
+++ linux-2.6.18-rc3-root/init/Kconfig 2006-08-04 07:56:54.000000000 +0530
@@ -248,6 +248,14 @@ config CPUSETS

Say N if unsure.

+config CPUMETER
+ bool "CPU resource control"
+ depends on CPUSETS
+ help
+ This option lets you create cpu resource partitions within
+ cpusets. Each resource partition can be given a different quota
+ of CPU usage.
+
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help

_
--
Regards,
vatsa