scheduler bigpatch is out

Rik van Riel (H.H.vanRiel@phys.uu.nl)
Tue, 20 Oct 1998 16:32:43 +0200 (CEST)


Hi,

This is an announcement of the scheduler bigpatch, which
combines:
- SCHED_IDLE
- improved RT latency (Richard's patch)
- improved interactive performance (my patch; a simplified
  sketch of the idea follows this list)
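
In a nutshell, the interactivity change credits a task that has
been asleep with extra timeslice when it re-enters the run queue,
scaled by how long it slept and by its nice level. Below is a
simplified user-space illustration of the arithmetic from
add_to_runqueue() in the attached patch; the harness and the
function name are mine, only the formula and the clamp come from
the patch (DEF_PRIORITY is 20 ticks):

/* Simplified model of the wakeup bonus added to add_to_runqueue();
 * plain user-space code for illustration, NOT the kernel code. */
#include <stdio.h>

#define DEF_PRIORITY 20         /* default timeslice, in ticks */

/* Credit a woken task with ticks proportional to how long it
 * slept, scaled down for niced tasks, and clamp the result. */
static long wakeup_counter(long counter, long priority, long slept)
{
        counter += slept >> ((DEF_PRIORITY - priority) >> 2);
        /* slept 'too long', or jiffies wrapped: normalize */
        if (counter > 2 * priority || counter < 0)
                counter = 2 * priority;
        return counter;
}

int main(void)
{
        /* default priority: 40 ticks asleep earns a full 40-tick slice */
        printf("default: %ld\n", wakeup_counter(0, DEF_PRIORITY, 40));
        /* heavily niced (priority 4): 40 >> 4 = just 2 extra ticks */
        printf("niced:   %ld\n", wakeup_counter(0, 4, 40));
        return 0;
}

So a default-priority task earns back roughly one tick per tick
slept, a niced task only a fraction of that, and the clamp keeps
the credit bounded even across a jiffies wrap.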

This patch is known to compile cleanly, but due to a
long-running background job I haven't booted it yet.
Because the patch mostly consists of tried-and-tested
code, it should be reasonably safe.

You can get it from my homepage, together with a small
program you can use to set RT and idle priority on
processes. For the web-challenged folks out there, I've
also attached a version of the patch to this message.
It should patch cleanly against 2.1.125.
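
For reference, setting the new policy on a process boils down to
a single sched_setscheduler() call. Here's a minimal sketch; this
is not the actual tool from my homepage, and SCHED_IDLE is
defined by hand as 3 (matching the patch) since current libc
headers don't know it:

/* Minimal sketch of an idle-priority setter; NOT the actual tool
 * from my homepage, just the system call it boils down to. */
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/types.h>

#ifndef SCHED_IDLE
#define SCHED_IDLE 3    /* matches the value added by this patch */
#endif

int main(int argc, char **argv)
{
        struct sched_param param;
        pid_t pid;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        pid = atoi(argv[1]);

        /* like SCHED_OTHER, SCHED_IDLE requires sched_priority == 0 */
        param.sched_priority = 0;
        if (sched_setscheduler(pid, SCHED_IDLE, &param) < 0) {
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}

Note that the patch requires sched_priority to be 0 for
SCHED_IDLE, just as for SCHED_OTHER, and that a task already
running as SCHED_IDLE needs CAP_SYS_NICE to switch back out.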

As usual, I'm interested in bug reports and general
remarks.

cheers,

Rik.
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+

diff -ur linux-2.1.125/fs/proc/array.c linux-2.1.125-rik/fs/proc/array.c
--- linux-2.1.125/fs/proc/array.c Tue Oct 20 13:49:54 1998
+++ linux-2.1.125-rik/fs/proc/array.c Tue Oct 20 15:12:11 1998
@@ -37,6 +37,8 @@
* Hans Marcus <crowbar@concepts.nl>
*
* aeb@cwi.nl : /proc/partitions
+ * Rik van Riel : only return task priority for SCHED_OTHER tasks,
+ * p->counter doesn't make sense for RT & idle tasks.
*/

#include <linux/types.h>
@@ -864,6 +866,9 @@
priority = 20 - (priority * 10 + DEF_PRIORITY / 2) / DEF_PRIORITY;
nice = tsk->priority;
nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;
+ /* These values don't make sense for RT and idle tasks. */
+ if (tsk->policy != SCHED_OTHER)
+ nice = 0, priority = 0;

return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
diff -ur linux-2.1.125/include/linux/sched.h linux-2.1.125-rik/include/linux/sched.h
--- linux-2.1.125/include/linux/sched.h Tue Oct 20 13:48:47 1998
+++ linux-2.1.125-rik/include/linux/sched.h Tue Oct 20 15:53:10 1998
@@ -87,10 +87,11 @@
#define SCHED_OTHER 0
#define SCHED_FIFO 1
#define SCHED_RR 2
+#define SCHED_IDLE 3

/*
* This is an additional bit set when we want to
- * yield the CPU for one re-schedule..
+ * yield the CPU for one re-schedule (non-RT only)..
*/
#define SCHED_YIELD 0x10

@@ -227,6 +228,7 @@
int last_processor;
int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */
struct task_struct *next_task, *prev_task;
+ struct task_struct *run_head; /* The run queue I am destined for */
struct task_struct *next_run, *prev_run;

/* task state */
@@ -259,6 +261,7 @@

struct wait_queue *wait_chldexit; /* for wait4() */
unsigned long timeout, policy, rt_priority;
+ unsigned long sleep_time; /* when did we go to sleep? */
unsigned long it_real_value, it_prof_value, it_virt_value;
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
@@ -270,7 +273,6 @@
int swappable:1;
unsigned long swap_address;
unsigned long old_maj_flt; /* old value of maj_flt */
- unsigned long dec_flt; /* page fault count of the last time */
unsigned long swap_cnt; /* number of pages to swap on next pass */
/* process credentials */
uid_t uid,euid,suid,fsuid;
@@ -340,7 +342,7 @@
/* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \
/* counter */ DEF_PRIORITY,DEF_PRIORITY, \
/* SMP */ 0,0,0,-1, \
-/* schedlink */ &init_task,&init_task, &init_task, &init_task, \
+/* schedlink */ &init_task,&init_task, &init_task, &init_task, &init_task, \
/* binfmt */ NULL, \
/* ec,brk... */ 0,0,0,0,0,0, \
/* pid etc.. */ 0,0,0,0,0, \
@@ -348,12 +350,12 @@
/* pidhash */ NULL, NULL, \
/* tarray */ &task[0], \
/* chld wait */ NULL, \
-/* timeout */ 0,SCHED_OTHER,0,0,0,0,0,0,0, \
+/* timeout */ 0,SCHED_OTHER,0,0,0,0,0,0,0,0, \
/* timer */ { NULL, NULL, 0, 0, it_real_fn }, \
/* utime */ {0,0,0,0},0, \
/* per CPU times */ {0, }, {0, }, \
/* flt */ 0,0,0,0,0,0, \
-/* swp */ 0,0,0,0,0, \
+/* swp */ 0,0,0,0, \
/* process credentials */ \
/* uid etc */ 0,0,0,0,0,0,0,0, \
/* suppl grps*/ 0, {0,}, \
diff -ur linux-2.1.125/kernel/sched.c linux-2.1.125-rik/kernel/sched.c
--- linux-2.1.125/kernel/sched.c Fri Sep 25 16:43:12 1998
+++ linux-2.1.125-rik/kernel/sched.c Tue Oct 20 16:19:52 1998
@@ -99,6 +99,17 @@

struct task_struct * task[NR_TASKS] = {&init_task, };

+/*
+ * Pseudo-idle task for RT run queue. I may get around to doing
+ * a partial allocation here (evil). rgooch@atnf.csiro.au
+ */
+
+static struct task_struct rt_idle = {
+ rt_priority: -1000,
+ prev_run: &rt_idle,
+ next_run: &rt_idle,
+};
+
struct kernel_stat kstat = { 0 };

void scheduling_functions_start_here(void) { }
@@ -155,10 +166,20 @@
*/
static inline void add_to_runqueue(struct task_struct * p)
{
- struct task_struct *next = init_task.next_run;
+ struct task_struct *head = p->run_head;
+ struct task_struct *next = head->next_run;

- p->prev_run = &init_task;
- init_task.next_run = p;
+ if (p->policy != SCHED_OTHER || p->sleep_time == jiffies)
+ goto fastpath;
+ p->counter += (jiffies - p->sleep_time) >>
+ ((DEF_PRIORITY - p->priority) >> 2);
+ /* if we sleep 'too long' or if jiffies overflows, the value
+ will be normalized here (p->counter is signed) */
+ if (p->counter > 2 * p->priority || p->counter < 0)
+ p->counter = 2 * p->priority;
+fastpath:
+ p->prev_run = head;
+ head->next_run = p;
p->next_run = next;
next->prev_run = p;
}
@@ -173,10 +194,12 @@
prev->next_run = next;
p->next_run = NULL;
p->prev_run = NULL;
+ p->sleep_time = jiffies;
}

static inline void move_last_runqueue(struct task_struct * p)
{
+ struct task_struct *head = p->run_head;
struct task_struct *next = p->next_run;
struct task_struct *prev = p->prev_run;

@@ -184,15 +207,16 @@
next->prev_run = prev;
prev->next_run = next;
/* add back to list */
- p->next_run = &init_task;
- prev = init_task.prev_run;
- init_task.prev_run = p;
+ p->next_run = head;
+ prev = head->prev_run;
+ head->prev_run = p;
p->prev_run = prev;
prev->next_run = p;
}

static inline void move_first_runqueue(struct task_struct * p)
{
+ struct task_struct *head = p->run_head;
struct task_struct *next = p->next_run;
struct task_struct *prev = p->prev_run;

@@ -200,9 +224,9 @@
next->prev_run = prev;
prev->next_run = next;
/* add back to list */
- p->prev_run = &init_task;
- next = init_task.next_run;
- init_task.next_run = p;
+ p->prev_run = head;
+ next = head->next_run;
+ head->next_run = p;
p->next_run = next;
next->prev_run = p;
}
@@ -276,14 +300,6 @@
}

/*
- * Realtime process, select the first one on the
- * runqueue (taking priorities within processes
- * into account).
- */
- if (policy != SCHED_OTHER)
- return 1000 + p->rt_priority;
-
- /*
* Give the process a first-approximation goodness value
* according to the number of clock-ticks it has left.
*
@@ -303,7 +319,14 @@
/* .. and a slight advantage to the current thread */
if (p->mm == prev->mm)
weight += 1;
+ /* Give normal tasks an advantage over niced tasks,
+ even when their timeslice is almost over */
weight += p->priority;
+
+ /* Subtle! schedule() below depends on us returning
+ 0 when we need to recalculate priorities. */
+ if (p->policy == SCHED_IDLE)
+ return p->counter - 1000;
}

return weight;
@@ -499,7 +522,8 @@
case TASK_RUNNING:
}
{
- struct task_struct * p = init_task.next_run;
+ struct task_struct * p = rt_idle.next_run;
+ struct task_struct * op = init_task.next_run;
/*
* This is subtle.
* Note how we can enable interrupts here, even
@@ -524,6 +548,17 @@
{
int c = -1000;
next = idle_task;
+
+ /* First scan for RT processes to run: ignore others */
+ while (p != &rt_idle) {
+ if (can_schedule(p)) {
+ int weight = p->rt_priority;
+ if (weight > c)
+ c = weight, next = p;
+ }
+ p = p->next_run;
+ }
+ p = (next == idle_task) ? op : &init_task;
while (p != &init_task) {
if (can_schedule(p)) {
int weight = goodness(p, prev, this_cpu);
@@ -534,11 +569,19 @@
}

/* Do we need to re-calculate counters? */
- if (!c) {
+ if (c == 0) {
struct task_struct *p;
read_lock(&tasklist_lock);
- for_each_task(p)
- p->counter = (p->counter >> 1) + p->priority;
+ p = init_task.next_run;
+ while (p != &init_task) {
+ /* maybe someone added a task, keep the p->counter */
+ p->counter >>= 1;
+ if (p->policy == SCHED_OTHER)
+ p->counter += p->priority;
+ else /* SCHED_IDLE, long slices */
+ p->counter += 499;
+ p = p->next_run;
+ }
read_unlock(&tasklist_lock);
}
}
@@ -1349,7 +1392,7 @@
struct sched_param *param)
{
struct sched_param lp;
- struct task_struct *p;
+ struct task_struct *p, *head;
int retval;

retval = -EINVAL;
@@ -1378,7 +1421,7 @@
else {
retval = -EINVAL;
if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_OTHER)
+ policy != SCHED_OTHER && policy != SCHED_IDLE)
goto out_unlock;
}

@@ -1389,7 +1432,7 @@
retval = -EINVAL;
if (lp.sched_priority < 0 || lp.sched_priority > 99)
goto out_unlock;
- if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
+ if ((policy == SCHED_OTHER || policy == SCHED_IDLE) != (lp.sched_priority == 0))
goto out_unlock;

retval = -EPERM;
@@ -1399,12 +1442,21 @@
if ((current->euid != p->euid) && (current->euid != p->uid) &&
!capable(CAP_SYS_NICE))
goto out_unlock;
+ if ((p->policy == SCHED_IDLE) && (policy != SCHED_IDLE) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;

retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
- if (p->next_run)
- move_first_runqueue(p);
+ head = (policy == SCHED_OTHER || policy == SCHED_IDLE) ? &init_task : &rt_idle;
+ if (p->next_run) {
+ del_from_runqueue(p);
+ p->run_head = head;
+ add_to_runqueue(p);
+ nr_running++;
+ }
+ else p->run_head = head;

current->need_resched = 1;

@@ -1488,7 +1540,8 @@
{
spin_lock(&scheduler_lock);
spin_lock_irq(&runqueue_lock);
- current->policy |= SCHED_YIELD;
+ if (current->policy == SCHED_OTHER || current->policy == SCHED_IDLE)
+ current->policy |= SCHED_YIELD;
current->need_resched = 1;
move_last_runqueue(current);
spin_unlock_irq(&runqueue_lock);
