Re: [patch] jiffies wraparound [Re: 2.1.125 Show stopper list: Draft]

Andrea Arcangeli (andrea@e-mind.com)
Sun, 18 Oct 1998 14:28:40 +0200 (CEST)


On Sun, 18 Oct 1998, MOLNAR Ingo wrote:

>
>On Sun, 18 Oct 1998, Alan Cox wrote:
>
>> > It's a beauty wart, it's been there forever, and we know that machines
>> > actually tend to survive it fine. So far I have _one_ person who has
>>
>> 1.2 survives mostly, 2.0 survives mostly, 2.1.x doesnt survive
>
>hm, i've just reviewed the timer code from this angle again, and there is
>a suspicious piece of code:
>
> } else if (expires < timer_jiffies) {
> /* can happen if you add a timer with expires == jiffies,
> * or you set a timer to go off in the past
> */
> insert_timer(timer, tv1.vec, tv1.index);
>=====> } else if (idx < 0xffffffffUL) {
> int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
> insert_timer(timer, tv5.vec, i);
> } else {
> /* Can only get here on architectures with 64-bit jiffies */
> timer->next = timer->prev = timer;
> }
>
>shouldnt the (idx < 0xffffffffUL) condition be: (idx <= 0xffffffffUL)?

Yes, thanks.

Note that also the (expires < timer_jiffies) is wrong it should be
(signed long)idx<0.

>but this shouldnt happen too often i think when we just randomly wrap
>jiffies (without heavy load that delays the processing of timer bhs), so i
>guess there is something else going on.

I am not sure that the stock 2.1 will die completly as it seems from my
testing. I think it would be better that it die completly though.

I am not sure because I fixed some checks in the core code of the kernel
(like schedule()) but many buggy places of the kernel use uses

current->timeout = -1;
schedule();

So fixing some piece of kernel could cause other buggy code to fail
(because with my fix timeout = -1 came before than jiffies...).

Right now I just converted every tsk->timeout to tsk->use_timeout:1 and
tsk->timeout_value and the booted kernel is rock stable with
JIFFIES_OFFSET=0 and =-120*HZ and resist to the wrap fine. I fixed all
that wrong -1 in the tty drivers.

Look at this part of my lastest stuff (not published yet) for example:

Index: drivers/char/n_tty.c
===================================================================
RCS file: /var/cvs/linux/drivers/char/n_tty.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.12.3
diff -u -r1.1.1.1 -r1.1.1.1.12.3
--- n_tty.c 1998/10/02 17:23:13 1.1.1.1
+++ n_tty.c 1998/10/18 00:18:29 1.1.1.1.12.3
@@ -419,7 +419,7 @@
char buf[64];

tty->num_overrun++;
- if (tty->overrun_time < (jiffies - HZ)) {
+ if (time_before(tty->overrun_time, jiffies - HZ)) {
printk("%s: %d input overrun(s)\n", tty_name(tty, buf),
tty->num_overrun);
tty->overrun_time = jiffies;
@@ -900,7 +900,8 @@
}

minimum = time = 0;
- current->timeout = (unsigned long) -1;
+ current->use_timeout = 1;
+ current->timeout_value = jiffies + (~0UL >> 2);
if (!tty->icanon) {
time = (HZ / 10) * TIME_CHAR(tty);
minimum = MIN_CHAR(tty);
@@ -911,9 +912,10 @@
(tty->minimum_to_wake > minimum))
tty->minimum_to_wake = minimum;
} else {
- current->timeout = 0;
+ current->use_timeout = 0;
if (time) {
- current->timeout = time + jiffies;
+ current->use_timeout = 1;
+ current->timeout_value = time + jiffies;
time = 0;
}
tty->minimum_to_wake = minimum = 1;
@@ -949,7 +951,7 @@
}
if (tty_hung_up_p(file))
break;
- if (!current->timeout)
+ if (!current->use_timeout)
break;
if (file->f_flags & O_NONBLOCK) {
retval = -EAGAIN;
@@ -1021,7 +1023,10 @@
if (b - buf >= minimum)
break;
if (time)
- current->timeout = time + jiffies;
+ {
+ current->use_timeout = 1;
+ current->timeout_value = time + jiffies;
+ }
}
enable_bh(TQUEUE_BH);
remove_wait_queue(&tty->read_wait, &wait);
@@ -1030,7 +1035,7 @@
tty->minimum_to_wake = minimum;

current->state = TASK_RUNNING;
- current->timeout = 0;
+ current->use_timeout = 0;
size = b - buf;
if (size) {
retval = size;

Index: kernel/sched.c
===================================================================
RCS file: /var/cvs/linux/kernel/sched.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 sched.c
--- sched.c 1998/10/02 17:22:39 1.1.1.1
+++ sched.c 1998/10/18 12:23:35
@@ -90,7 +90,7 @@

extern void mem_use(void);

-unsigned long volatile jiffies=0;
+unsigned long volatile jiffies=JIFFIES_OFFSET;

/*
* Init task must be ok at boot for the ix86 as we will check its signals
@@ -248,7 +248,7 @@
{
struct task_struct * p = (struct task_struct *) __data;

- p->timeout = 0;
+ p->use_timeout = 0;
wake_up_process(p);
}

@@ -341,7 +341,7 @@

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))

-static unsigned long timer_jiffies = 0;
+static unsigned long timer_jiffies = JIFFIES_OFFSET;

static inline void insert_timer(struct timer_list *timer,
struct timer_list **vec, int idx)
@@ -372,12 +372,12 @@
} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
insert_timer(timer, tv4.vec, i);
- } else if (expires < timer_jiffies) {
+ } else if ((signed long) idx < 0) {
/* can happen if you add a timer with expires == jiffies,
* or you set a timer to go off in the past
*/
insert_timer(timer, tv1.vec, tv1.index);
- } else if (idx < 0xffffffffUL) {
+ } else if (idx <= 0xffffffffUL) {
int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
insert_timer(timer, tv5.vec, i);
} else {
@@ -458,8 +458,7 @@
asmlinkage void schedule(void)
{
struct task_struct * prev, * next;
- unsigned long timeout;
- int this_cpu;
+ int this_cpu, timeout = 0;

prev = current;
this_cpu = prev->processor;
@@ -481,15 +480,16 @@
prev->counter = prev->priority;
move_last_runqueue(prev);
}
- timeout = 0;
+
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (signal_pending(prev))
goto makerunnable;
- timeout = prev->timeout;
- if (timeout && (timeout <= jiffies)) {
- prev->timeout = 0;
- timeout = 0;
+ timeout = prev->use_timeout;
+ if (timeout &&
+ time_before_eq(prev->timeout_value, jiffies))
+ {
+ prev->use_timeout = 0;
makerunnable:
prev->state = TASK_RUNNING;
break;
@@ -555,7 +555,7 @@
kstat.context_swtch++;
if (timeout) {
init_timer(&timer);
- timer.expires = timeout;
+ timer.expires = prev->timeout_value;
timer.data = (unsigned long) prev;
timer.function = process_timeout;
add_timer(&timer);
@@ -803,7 +803,7 @@
break;
if (!(mask & timer_active))
continue;
- if (tp->expires > jiffies)
+ if (time_after(tp->expires, jiffies))
continue;
timer_active &= ~mask;
tp->fn();
@@ -1565,14 +1565,15 @@

expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;

- current->timeout = expire;
+ current->use_timeout = 1;
+ current->timeout_value = expire;
current->state = TASK_INTERRUPTIBLE;
schedule();

- if (expire > jiffies) {
+ if (time_after(expire, jiffies)) {
if (rmtp) {
jiffies_to_timespec(expire - jiffies -
- (expire > jiffies + 1), &t);
+ (expire - jiffies > 1), &t);
if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
return -EFAULT;
}

Index: arch/i386/kernel/time.c
===================================================================
RCS file: /var/cvs/linux/arch/i386/kernel/time.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.14.1
diff -u -r1.1.1.1 -r1.1.1.1.14.1
--- time.c 1998/10/02 17:23:36 1.1.1.1
+++ time.c 1998/10/16 18:41:23 1.1.1.1.14.1
@@ -53,6 +53,13 @@
unsigned long high;
} init_timer_cc, last_timer_cc;

+static void __init_timer_cc(void)
+{
+ __asm__("rdtsc"
+ :"=a" (init_timer_cc.low),
+ "=d" (init_timer_cc.high));
+}
+
static unsigned long do_fast_gettimeoffset(void)
{
register unsigned long eax asm("ax");
@@ -68,12 +75,12 @@
*/
static unsigned long cached_quotient=0;

- tmp = jiffies;
+ tmp = jiffies - JIFFIES_OFFSET;

quotient = cached_quotient;
low_timer = last_timer_cc.low;

- if (last_jiffies != tmp) {
+ if (tmp && last_jiffies != tmp) {
last_jiffies = tmp;

/* Get last timer tick in absolute kernel time */
@@ -110,7 +117,7 @@
}

/* Read the time counter */
- __asm__("rdtsc" : "=a" (eax), "=d" (edx));
+ __asm__("rdtsc" : "=a" (eax) : : "edx");

/* .. relative to previous jiffy (32 bits is enough) */
edx = 0;
@@ -191,7 +198,7 @@
* We do this guaranteed double memory access instead of a _p
* postfix in the previous port access. Wheee, hackady hack
*/
- jiffies_t = jiffies;
+ jiffies_t = jiffies - JIFFIES_OFFSET;

count |= inb_p(0x40) << 8;

@@ -380,6 +387,7 @@
static inline void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
do_timer(regs);
+
/*
* In the SMP case we use the local APIC timer interrupt to do the
* profiling, except when we simulate SMP mode on a uniprocessor
@@ -444,6 +452,14 @@
:"=a" (last_timer_cc.low),
"=d" (last_timer_cc.high));
timer_interrupt(irq, NULL, regs);
+
+ if (!(jiffies - JIFFIES_OFFSET))
+ /*
+ * If jiffies has overflowed in this timer_interrupt we must
+ * update the init_timer_cc to make the do_fast_gettimeoffset()
+ * quotient calc still valid. -arca
+ */
+ __init_timer_cc();
}
#endif

@@ -552,9 +568,7 @@
}

/* read Pentium cycle counter */
- __asm__("rdtsc"
- :"=a" (init_timer_cc.low),
- "=d" (init_timer_cc.high));
+ __init_timer_cc();
irq0.handler = pentium_timer_interrupt;
}
#endif

Index: fs/select.c
===================================================================
RCS file: /var/cvs/linux/fs/select.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 select.c
--- select.c 1998/10/02 17:22:35 1.1.1.1
+++ select.c 1998/10/18 01:24:53
@@ -124,7 +124,8 @@
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

-int do_select(int n, fd_set_buffer *fds, unsigned long timeout)
+int do_select(int n, fd_set_buffer *fds, int use_timeout,
+ unsigned long timeout_value)
{
poll_table wait_table, *wait;
int retval;
@@ -133,8 +134,8 @@
lock_kernel();

wait = NULL;
- current->timeout = timeout;
- if (timeout) {
+ if ((current->use_timeout = use_timeout))
+ {
struct poll_table_entry *entry = (struct poll_table_entry *)
__get_free_page(GFP_KERNEL);
if (!entry) {
@@ -144,6 +145,7 @@
wait_table.nr = 0;
wait_table.entry = entry;
wait = &wait_table;
+ current->timeout_value = timeout_value;
}

retval = max_select_fd(n, fds);
@@ -190,14 +192,14 @@
}
}
wait = NULL;
- if (retval || !current->timeout || signal_pending(current))
+ if (retval || !current->use_timeout || signal_pending(current))
break;
schedule();
}
current->state = TASK_RUNNING;

out:
- if (timeout) {
+ if (use_timeout) {
free_wait(&wait_table);
free_page((unsigned long) wait_table.entry);
}
@@ -218,10 +220,11 @@
sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
fd_set_buffer *fds;
- unsigned long timeout;
- int ret;
+ unsigned long timeout_value;
+ int ret, use_timeout;

- timeout = ~0UL;
+ use_timeout = 1;
+ timeout_value = jiffies + (~0UL >> 1);
if (tvp) {
time_t sec, usec;

@@ -230,10 +233,12 @@
|| (ret = __get_user(usec, &tvp->tv_usec)))
goto out_nofds;

- timeout = ROUND_UP(usec, 1000000/HZ);
- timeout += sec * (unsigned long) HZ;
- if (timeout)
- timeout += jiffies + 1;
+ timeout_value = ROUND_UP(usec, 1000000/HZ);
+ timeout_value += sec * (unsigned long) HZ;
+ if (timeout_value)
+ timeout_value += jiffies + 1;
+ else
+ use_timeout = 0;
}

ret = -ENOMEM;
@@ -253,10 +258,10 @@
zero_fd_set(n, fds->res_out);
zero_fd_set(n, fds->res_ex);

- ret = do_select(n, fds, timeout);
+ ret = do_select(n, fds, use_timeout, timeout_value);

- if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
- unsigned long timeout = current->timeout - jiffies - 1;
+ if (tvp && use_timeout && !(current->personality & STICKY_TIMEOUTS)) {
+ unsigned long timeout = current->timeout_value - jiffies - 1;
time_t sec = 0, usec = 0;
if ((long) timeout > 0) {
sec = timeout / HZ;
@@ -266,7 +271,7 @@
put_user(sec, &tvp->tv_sec);
put_user(usec, &tvp->tv_usec);
}
- current->timeout = 0;
+ current->use_timeout = 0;

if (ret < 0)
goto out;
@@ -321,7 +326,7 @@
}

wait = NULL;
- if (count || !current->timeout || signal_pending(current))
+ if (count || !current->use_timeout || signal_pending(current))
break;
schedule();
}
@@ -366,9 +371,10 @@
if (copy_from_user(fds, ufds, size))
goto out_fds;

- current->timeout = timeout;
+ current->use_timeout = 1;
+ current->timeout_value = timeout;
fdcount = do_poll(nfds, fds, wait);
- current->timeout = 0;
+ current->use_timeout = 0;

/* OK, now copy the revents fields back to user space. */
fds1 = fds;

My new kernel seems to be jiffies wrap compliant. I am doing some other
test right now (since nanosleep sime to still have some little problem)
but everything else works fine! ;-)

Once I' ll have fixed the nanosleep case I' ll put the new patch in the
usual place (if somebody is interested).

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/