[patch] fix for xtime SMP race

Andrea Arcangeli (andrea@e-mind.com)
Thu, 24 Dec 1998 18:22:59 +0100 (CET)


A few days ago I fixed lost_ticks in gettimeofday(). But get_fast_time() can
be run inside an irq handler (as in netif_rx()), and cli() is not enough to
protect us from an xtime SMP race.

Let me explain the scenario further. On SMP, only running cli() at the top
of do_gettimeofday() is not enough (and it's also too much in all the other
cases). This is because cli() reverts to __cli() if we are already
inside an irq handler (as in the ethernet case). So we are sure not to be
interrupted in do_gettimeofday(), but irq0 could still run on the other
CPU, and the same is true for the timer_bh handler. Note that right now
2.1.132 is perfectly safe on UP. Holding the global cli() is also too much
if gettimeofday() is called from userspace, since we don't care whether other
irqs run on the other CPU. The only reason we can't allow irqs to run on our
own CPU as well is to avoid a lockup from recursing on the same rw spinlock.
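To make the locking discipline concrete, here is a minimal sketch of the
scheme the patch introduces. This is simplified illustrative code, not the
actual diff below: the function names read_side/irq_writer_side/bh_writer_side
are made up for the example, and the real kernel functions also handle the
TSC snapshot, lost_ticks and NTP details.

rwlock_t xtime_lock = RW_LOCK_UNLOCKED;

/* Reader side (do_gettimeofday/get_fast_time): the read lock keeps the
 * snapshot of xtime/lost_ticks consistent against a writer on the other
 * CPU, and local irqs must stay off so the timer irq on this CPU cannot
 * spin on the write lock while we hold the read lock. */
void read_side(struct timeval *tv)
{
	unsigned long flags;

	read_lock_irqsave(&xtime_lock, flags);
	*tv = xtime;		/* plus gettimeoffset/lost_ticks */
	read_unlock_irqrestore(&xtime_lock, flags);
}

/* Writer side in the timer irq: local irqs are already disabled
 * (SA_INTERRUPT), so the plain write_lock is enough; it only has to
 * keep out readers and the timer_bh running on the other CPU. */
void irq_writer_side(void)
{
	write_lock(&xtime_lock);
	/* ... update the xtime bookkeeping (TSC snapshot, jiffies) ... */
	write_unlock(&xtime_lock);
}

/* Writer side in timer_bh (update_times): irqs are known to be enabled
 * here, so write_lock_irq/write_unlock_irq is enough and no flags need
 * to be saved. */
void bh_writer_side(void)
{
	write_lock_irq(&xtime_lock);
	/* ... consume lost_ticks, update the wall time ... */
	write_unlock_irq(&xtime_lock);
}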

Here is the patch against 2.1.132 to make get_fast_time() SMP safe:

Index: linux/arch/i386/kernel/time.c
diff -u linux/arch/i386/kernel/time.c:1.1.1.2 linux/arch/i386/kernel/time.c:1.1.1.1.2.12
--- linux/arch/i386/kernel/time.c:1.1.1.2 Sun Dec 20 16:26:04 1998
+++ linux/arch/i386/kernel/time.c Thu Dec 24 18:19:01 1998
@@ -22,8 +22,10 @@
* ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
* 1998-12-16 Andrea Arcangeli
* Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
- * because was not accounting lost_ticks. I also removed some ugly
- * not needed global cli() and where needed I used a disable_irq(0).
+ * because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
+ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
*/

/* What about the "updated NTP code" stuff in 2.0 time.c? It's not in
@@ -82,6 +84,8 @@
*/
static unsigned long fast_gettimeoffset_quotient=0;

+extern rwlock_t xtime_lock;
+
static unsigned long do_fast_gettimeoffset(void)
{
register unsigned long eax asm("ax");
@@ -237,12 +241,12 @@
extern volatile unsigned long lost_ticks;
unsigned long flags;

- save_flags(flags); cli();
+ read_lock_irqsave(&xtime_lock, flags);
*tv = xtime;
tv->tv_usec += do_gettimeoffset();
if (lost_ticks)
tv->tv_usec += lost_ticks * (1000000/HZ);
- restore_flags(flags);
+ read_unlock_irqrestore(&xtime_lock, flags);
while (tv->tv_usec >= 1000000) {
tv->tv_usec -= 1000000;
tv->tv_sec++;
@@ -251,7 +255,7 @@

void do_settimeofday(struct timeval *tv)
{
- cli();
+ write_lock_irq(&xtime_lock);
/* This is revolting. We need to set the xtime.tv_usec
* correctly. However, the value in this location is
* is value at the last tick.
@@ -269,7 +273,7 @@
time_state = TIME_BAD;
time_maxerror = MAXPHASE;
time_esterror = MAXPHASE;
- sti();
+ write_unlock_irq(&xtime_lock);
}

/*
@@ -344,7 +348,7 @@
* timer_interrupt() needs to keep up the real-time clock,
* as well as call the "do_timer()" routine every clocktick
*/
-static inline void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
do_timer(regs);
/*
@@ -398,37 +402,56 @@
#endif
}

+static int use_tsc = 0;
+
/*
* This is the same as the above, except we _also_ save the current
* Time Stamp Counter value at the time of the timer interrupt, so that
* we later on can estimate the time of day more exactly.
*/
-static void pentium_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
int count;

- /* It is important that these two operations happen almost at the
- * same time. We do the RDTSC stuff first, since it's faster. To
- * avoid any inconsistencies, we need interrupts disabled locally.
- */
-
/*
- * Interrupts are just disabled locally since the timer irq has the
- * SA_INTERRUPT flag set. -arca
+ * Here we are in the timer irq handler. We just have irqs locally
+ * disabled but we don't know if the timer_bh is running on the other
+ * CPU. We need to avoid an SMP race with it. NOTE: we don't need
+ * the irq version of write_lock because, as just said, we have
+ * irqs locally disabled. -arca
*/
+ write_lock(&xtime_lock);
+
+ if (use_tsc)
+ {
+ /*
+ * It is important that these two operations happen almost at
+ * the same time. We do the RDTSC stuff first, since it's
+ * faster. To avoid any inconsistencies, we need interrupts
+ * disabled locally.
+ */
+
+ /*
+ * Interrupts are just disabled locally since the timer irq
+ * has the SA_INTERRUPT flag set. -arca
+ */

- /* read Pentium cycle counter */
- __asm__("rdtsc" : "=a" (last_tsc_low) : : "edx");
+ /* read Pentium cycle counter */
+ __asm__("rdtsc" : "=a" (last_tsc_low) : : "edx");

- outb_p(0x00, 0x43); /* latch the count ASAP */
+ outb_p(0x00, 0x43); /* latch the count ASAP */

- count = inb_p(0x40); /* read the latched count */
- count |= inb(0x40) << 8;
+ count = inb_p(0x40); /* read the latched count */
+ count |= inb(0x40) << 8;

- count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
+ count = ((LATCH-1) - count) * TICK_SIZE;
+ delay_at_last_interrupt = (count + LATCH/2) / LATCH;
+ }

- timer_interrupt(irq, NULL, regs);
+ do_timer_interrupt(irq, NULL, regs);
+
+ write_unlock(&xtime_lock);
+
}

/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
@@ -601,7 +624,7 @@
if (boot_cpu_data.x86_capability & X86_FEATURE_TSC) {
do_gettimeoffset = do_fast_gettimeoffset;
do_get_fast_time = do_gettimeofday;
- irq0.handler = pentium_timer_interrupt;
+ use_tsc = 1;
fast_gettimeoffset_quotient = calibrate_tsc();

/* report CPU clock rate in Hz.
Index: linux/kernel/sched.c
diff -u linux/kernel/sched.c:1.1.1.2 linux/kernel/sched.c:1.1.1.1.2.31
--- linux/kernel/sched.c:1.1.1.2 Fri Nov 27 11:19:09 1998
+++ linux/kernel/sched.c Thu Dec 24 18:19:00 1998
@@ -7,6 +7,11 @@
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
*/

/*
@@ -149,6 +156,7 @@
init_task.next_run = p;
p->next_run = next;
next->prev_run = p;
+ nr_running++;
}

static inline void del_from_runqueue(struct task_struct * p)
@@ -227,7 +235,6 @@
if (!p->next_run) {
add_to_runqueue(p);
reschedule_idle(p);
- nr_running++;
}
spin_unlock_irqrestore(&runqueue_lock, flags);
}
@@ -437,23 +448,6 @@
struct timer_list timer;
unsigned long expire;

- /*
- * PARANOID.
- */
- if (current->state == TASK_UNINTERRUPTIBLE)
- {
- printk(KERN_WARNING "schedule_timeout: task not interrutible "
- "from %p\n", __builtin_return_address(0));
- /*
- * We don' t want to interrupt a not interruptible task
- * risking to cause corruption. Better a a deadlock ;-).
- */
- timeout = MAX_SCHEDULE_TIMEOUT;
- }
-
- /*
- * Here we start for real.
- */
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
@@ -594,10 +588,12 @@

#ifdef __SMP__
next->has_cpu = 1;
- next->processor = this_cpu;
#endif

if (prev != next) {
+#ifdef __SMP__
+ next->processor = this_cpu;
+#endif
kstat.context_swtch++;
get_mmu_context(next);
switch_to(prev,next);
@@ -1189,13 +1185,21 @@
volatile unsigned long lost_ticks = 0;
static unsigned long lost_ticks_system = 0;

+/*
+ * This spinlock protects us from races in SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
static inline void update_times(void)
{
unsigned long ticks;
- unsigned long flags;

- save_flags(flags);
- cli();
+ /*
+ * update_times() is run from the raw timer_bh handler so we
+ * just know that the irqs are locally enabled and so we don't
+ * need to save/restore the flags of the local CPU here. -arca
+ */
+ write_lock_irq(&xtime_lock);

ticks = lost_ticks;
lost_ticks = 0;
@@ -1206,12 +1210,12 @@

calc_load(ticks);
update_wall_time(ticks);
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);

update_process_times(ticks, system);

} else
- restore_flags(flags);
+ write_unlock_irq(&xtime_lock);
}

static void timer_bh(void)

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/