> [snip] The patch should also cleanly apply to 2.6.0-test1-bk2.
Actually, it does not, due to the cpumask_t -> unsigned long change. The version
below applies cleanly.
Rudo.
--- linux/arch/i386/kernel/timers/timer_tsc.c.orig
+++ linux/arch/i386/kernel/timers/timer_tsc.c
@@ -116,6 +117,24 @@ static unsigned long long monotonic_cloc
return base + cycles_2_ns(this_offset - last_offset);
}
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+ unsigned long long this_offset;
+
+ if (unlikely(!cpu_has_tsc))
+ return (unsigned long long)jiffies * (1000000000 / HZ);
+
+ /* Read the Time Stamp Counter */
+ rdtscll(this_offset);
+
+ /* return the value in ns */
+ return cycles_2_ns(this_offset);
+}
+
+
static void mark_offset_tsc(void)
{
unsigned long lost,delay;
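
For reference (not part of the patch): sched_clock() leans on the cycles_2_ns()
helper that already lives in timer_tsc.c. As far as I remember the 2.6 sources,
it is a fixed-point conversion along these lines, so treat the constants below
as illustrative rather than authoritative:

/* Sketch of roughly what cycles_2_ns() does in timer_tsc.c (recalled from
 * memory, illustrative only). */
#define CYC2NS_SCALE_FACTOR 10			/* scale carries 10 fractional bits */

static unsigned long cyc2ns_scale;

static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
{
	/* nanoseconds per cycle (1000000/cpu_mhz), scaled by 2^10 */
	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_mhz;
}

static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	/* multiply by the scaled ns-per-cycle value, then drop the
	 * fractional bits again */
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

On a CPU without a TSC the fallback in sched_clock() just scales jiffies: with
HZ=1000 every tick adds 1000000 ns, so sched_clock() degrades to tick
resolution there.
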
--- linux/arch/i386/kernel/smpboot.c.orig
+++ linux/arch/i386/kernel/smpboot.c
@@ -915,13 +915,13 @@ static void smp_tune_scheduling (void)
cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
}
- cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
+ cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
(long)cacheflush_time/(cpu_khz/1000),
((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
printk("task migration cache decay timeout: %ld msecs.\n",
- (cache_decay_ticks + 1) * 1000 / HZ);
+ cache_decay_ticks);
}
/*
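
A quick sanity check on the smpboot.c change (standalone user-space sketch with
made-up CPU numbers, not part of the patch): cacheflush_time is measured in CPU
cycles, so dividing by cpu_khz yields milliseconds, and cache_decay_ticks now
holds the decay timeout in msecs (plus one) instead of timer ticks, which is
why the second printk can print it directly.

#include <stdio.h>

int main(void)
{
	unsigned long cpu_khz   = 2000000;	/* assumed: 2 GHz CPU    */
	unsigned long cachesize = 512;		/* assumed: 512 KB cache */
	unsigned long bandwidth = 350;		/* assumed: 350 MB/s     */

	/* same expression as smp_tune_scheduling(): result is in cycles */
	unsigned long cacheflush_time = (cpu_khz >> 10) * (cachesize << 10) / bandwidth;

	/* cycles / (cycles per millisecond) = milliseconds, rounded up */
	long cache_decay_ticks = (long)(cacheflush_time / cpu_khz) + 1;

	printf("cacheflush_time: %lu cycles\n", cacheflush_time);
	printf("task migration cache decay timeout: %ld msecs.\n",
	       cache_decay_ticks);
	return 0;
}
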
--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -341,7 +341,8 @@ struct task_struct {
prio_array_t *array;
unsigned long sleep_avg;
- unsigned long last_run;
+ unsigned long long timestamp;
+ int activated;
unsigned long policy;
unsigned long cpus_allowed;
@@ -511,6 +512,8 @@ static inline int set_cpus_allowed(task_
}
#endif
+extern unsigned long long sched_clock(void);
+
#ifdef CONFIG_NUMA
extern void sched_balance_exec(void);
extern void node_nr_running_init(void);
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -924,7 +924,7 @@ struct task_struct *copy_process(unsigne
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->last_run = jiffies;
+ p->timestamp = sched_clock();
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -68,12 +68,13 @@
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
#define MAX_TIMESLICE (200 * HZ / 1000)
+#define TIMESLICE_GRANULARITY (HZ/20 ?: 1)
#define CHILD_PENALTY 50
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
+#define MAX_SLEEP_AVG (2*1000000000)
#define STARVATION_LIMIT (10*HZ)
#define NODE_THRESHOLD 125
@@ -115,6 +116,11 @@
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ((p)->prio < (rq)->curr->prio || \
+ ((p)->prio == (rq)->curr->prio && \
+ (p)->time_slice > (rq)->curr->time_slice * 2))
+
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values.
@@ -319,8 +325,8 @@ static int effective_prio(task_t *p)
if (rt_task(p))
return p->prio;
- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
+ bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
@@ -339,24 +345,24 @@ static inline void __activate_task(task_
nr_running_inc(rq);
}
-/*
- * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
- */
-static inline void activate_task(task_t *p, runqueue_t *rq)
+static void recalc_task_prio(task_t *p, unsigned long long now)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long long __sleep_time = now - p->timestamp;
+ unsigned long sleep_time;
+
+ if (__sleep_time > MAX_SLEEP_AVG)
+ sleep_time = MAX_SLEEP_AVG;
+ else
+ sleep_time = (unsigned long)__sleep_time;
if (sleep_time > 0) {
- int sleep_avg;
+ unsigned long long sleep_avg;
/*
* This code gives a bonus to interactive tasks.
*
* The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
+ * value here, based on ->timestamp. The more time a task
* spends sleeping, the higher the average gets - and the
* higher the priority boost gets as well.
*/
@@ -375,6 +381,23 @@ static inline void activate_task(task_t
p->prio = effective_prio(p);
}
}
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static inline void activate_task(task_t *p, runqueue_t *rq)
+{
+ unsigned long long now = sched_clock();
+
+ recalc_task_prio(p, now);
+
+ p->activated = 1;
+ p->timestamp = now;
+
__activate_task(p, rq);
}
@@ -501,7 +524,7 @@ repeat_lock_task:
__activate_task(p, rq);
else {
activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -550,8 +573,8 @@ void wake_up_forked_process(task_t * p)
* and children as well, to keep max-interactive tasks
* from forking tasks that are max-interactive.
*/
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+ current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY;
+ p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY;
p->prio = effective_prio(p);
set_task_cpu(p, smp_processor_id());
@@ -592,8 +615,7 @@ void sched_exit(task_t * p)
* the sleep_avg of the parent as well.
*/
if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
+ p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1);
}
/**
@@ -978,13 +1000,8 @@ static inline void pull_task(runqueue_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
}
/*
@@ -1001,12 +1018,14 @@ static void load_balance(runqueue_t *thi
runqueue_t *busiest;
prio_array_t *array;
struct list_head *head, *curr;
+ unsigned long long now;
task_t *tmp;
busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
if (!busiest)
goto out;
+ now = sched_clock();
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
@@ -1052,8 +1071,9 @@ skip_queue:
* 3) are cache-hot on their current CPU.
*/
+#warning fixme
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
+ ((!idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))
@@ -1213,14 +1233,11 @@ void scheduler_tick(int user_ticks, int
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice. This makes it possible for interactive tasks
+ * to use up their timeslices at their highest priority levels.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
@@ -1244,12 +1261,33 @@ void scheduler_tick(int user_ticks, int
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+ if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq) || 0) {
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
enqueue_task(p, rq->expired);
} else
enqueue_task(p, rq->active);
+ } else {
+ /*
+ * Prevent a too long timeslice allowing a task to monopolize
+ * the CPU. We do this by splitting up the timeslice into
+ * smaller pieces.
+ *
+ * Note: this does not mean the task's timeslices expire or
+ * get lost in any way, they just might be preempted by
+ * another task of equal priority. (one with higher
+ * priority would have preempted this task already.) We
+ * requeue this task to the end of the list on this priority
+ * level, which is in essence a round-robin of tasks with
+ * equal priority.
+ */
+ if (!(p->time_slice % TIMESLICE_GRANULARITY) &&
+ (p->array == rq->active)) {
+ dequeue_task(p, rq->active);
+ set_tsk_need_resched(p);
+ p->prio = effective_prio(p);
+ enqueue_task(p, rq->active);
+ }
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1268,6 +1306,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long long now;
+ unsigned long run_time;
int idx;
/*
@@ -1288,7 +1328,11 @@ need_resched:
rq = this_rq();
release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = sched_clock();
+ if (likely(now - prev->timestamp < MAX_SLEEP_AVG))
+ run_time = now - prev->timestamp;
+ else
+ run_time = MAX_SLEEP_AVG;
spin_lock_irq(&rq->lock);
/*
@@ -1336,12 +1380,25 @@ pick_next_task:
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
+ if (next->activated) {
+ next->activated = 0;
+ array = next->array;
+ dequeue_task(next, array);
+ recalc_task_prio(next, now);
+ enqueue_task(next, array);
+ }
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;
+ prev->sleep_avg -= run_time;
+ if ((long)prev->sleep_avg < 0)
+ prev->sleep_avg = 0;
+ prev->timestamp = now;
+
if (likely(prev != next)) {
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
@@ -2362,6 +2419,12 @@ static void move_task_away(struct task_s
local_irq_restore(flags);
}
+typedef struct {
+ int cpu;
+ struct completion startup_done;
+ task_t *task;
+} migration_startup_t;
+
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
@@ -2371,20 +2434,21 @@ static int migration_thread(void * data)
{
/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
- int cpu = (long) data;
+ migration_startup_t *startup = data;
+ int cpu = startup->cpu;
runqueue_t *rq;
int ret;
+ startup->task = current;
+ complete(&startup->startup_done);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+
+ BUG_ON(smp_processor_id() != cpu);
+
daemonize("migration/%d", cpu);
set_fs(KERNEL_DS);
- /*
- * Either we are running on the right CPU, or there's a a
- * migration thread on this CPU, guaranteed (we're started
- * serially).
- */
- set_cpus_allowed(current, 1UL << cpu);
-
ret = setscheduler(0, SCHED_FIFO, &param);
rq = this_rq();
@@ -2420,13 +2484,30 @@ static int migration_call(struct notifie
unsigned long action,
void *hcpu)
{
+ long cpu = (long) hcpu;
+ migration_startup_t startup;
+
switch (action) {
case CPU_ONLINE:
- printk("Starting migration thread for cpu %li\n",
- (long)hcpu);
- kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
- while (!cpu_rq((long)hcpu)->migration_thread)
+
+ printk("Starting migration thread for cpu %li\n", cpu);
+
+ startup.cpu = cpu;
+ startup.task = NULL;
+ init_completion(&startup.startup_done);
+
+ kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+ wait_for_completion(&startup.startup_done);
+ wait_task_inactive(startup.task);
+
+ startup.task->thread_info->cpu = cpu;
+ startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+ wake_up_process(startup.task);
+
+ while (!cpu_rq(cpu)->migration_thread)
yield();
+
break;
}
return NOTIFY_OK;
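
One more note on the effective_prio() hunk above (illustration only, not part
of the patch): with sleep_avg now counted in nanoseconds, up to MAX_SLEEP_AVG =
2 seconds worth, the straightforward product
MAX_USER_PRIO*PRIO_BONUS_RATIO*sleep_avg would no longer fit in an unsigned
long on i386, which is presumably why both sleep_avg and MAX_SLEEP_AVG are
pre-divided by 1024. A standalone check of the arithmetic (assuming
MAX_USER_PRIO works out to 40, i.e. MAX_PRIO - MAX_RT_PRIO):

#include <stdio.h>

#define MAX_USER_PRIO		40	/* assumed: MAX_PRIO - MAX_RT_PRIO */
#define PRIO_BONUS_RATIO	25
#define MAX_SLEEP_AVG		(2*1000000000UL)

static int bonus(unsigned long sleep_avg)
{
	/* same expression as the patched effective_prio() */
	int b = MAX_USER_PRIO*PRIO_BONUS_RATIO*(sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
	return b - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
}

int main(void)
{
	printf("sleep_avg = 0     -> bonus %d\n", bonus(0));		/* -5 */
	printf("sleep_avg = 1 sec -> bonus %d\n", bonus(1000000000UL));	/* -1 */
	printf("sleep_avg = 2 sec -> bonus %d\n", bonus(MAX_SLEEP_AVG));	/* +5 */
	return 0;
}

So the bonus still spans -5..+5 around static_prio, the same range as with the
old jiffies-based sleep_avg.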