Re: High irq load (Re: [PATCH] i386: Selectable Frequency of the Timer Interrupt)

From: Peter Osterlund
Date: Thu Jul 14 2005 - 09:27:10 EST


Linus Torvalds <torvalds@xxxxxxxx> writes:

> On Wed, 13 Jul 2005, Jan Engelhardt wrote:
> >
> > No, some kernel code causes a triple-fault-and-reboot when the HZ is >=
> > 10KHz. Maybe the highest possible value is 8192 Hz, not sure.
>
> Can you post the triple-fault message? It really shouldn't triple-fault,
> although it _will_ obviously spend all time just doing timer interrupts,
> so it shouldn't get much (if any) real work done either.
...
> There should be no conceptual "highest possible HZ", although there are
> certainly obvious practical limits to it (both on the timer hw itself, and
> just the fact that at some point we'll spend all time on the timer
> interrupt and won't get anything done..)

HZ=10000 appears to work fine here after some hacks to avoid
over/underflows in integer arithmetics. gkrellm reports about 3-4% CPU
usage when the system is idle, on a 3.07 GHz P4.

---

Makefile | 2 +-
arch/i386/kernel/cpu/proc.c | 6 ++++++
fs/nfsd/nfssvc.c | 2 +-
include/linux/jiffies.h | 6 ++++++
include/linux/nfsd/stats.h | 4 ++++
include/linux/timex.h | 2 +-
include/net/tcp.h | 12 +++++++++---
init/calibrate.c | 21 +++++++++++++++++++++
kernel/Kconfig.hz | 6 ++++++
kernel/timer.c | 4 ++--
net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 2 +-
11 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 13
-EXTRAVERSION =-rc3
+EXTRAVERSION =-rc3-test
NAME=Woozy Numbat

# *DOCUMENTATION*
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -128,9 +128,15 @@ static int show_cpuinfo(struct seq_file
x86_cap_flags[i] != NULL )
seq_printf(m, " %s", x86_cap_flags[i]);

+#if HZ <= 5000
seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
+#else
+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
+ c->loops_per_jiffy/(500000/HZ),
+ (c->loops_per_jiffy*(HZ/5000)) % 100);
+#endif

return 0;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -160,7 +160,7 @@ update_thread_usage(int busy_threads)
decile = busy_threads*10/nfsdstats.th_cnt;
if (decile>0 && decile <= 10) {
diff = nfsd_last_call - prev_call;
- if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
+ if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
if (decile == 10)
nfsdstats.th_fullcnt++;
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -38,6 +38,12 @@
# define SHIFT_HZ 9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ 10
+#elif HZ >= 1536 && HZ < 3072
+# define SHIFT_HZ 11
+#elif HZ >= 3072 && HZ < 6144
+# define SHIFT_HZ 12
+#elif HZ >= 6144 && HZ < 12288
+# define SHIFT_HZ 13
#else
# error You lose.
#endif
diff --git a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h
--- a/include/linux/nfsd/stats.h
+++ b/include/linux/nfsd/stats.h
@@ -30,7 +30,11 @@ struct nfsd_stats {
};

/* thread usage wraps very million seconds (approx one fortnight) */
+#if HZ < 2048
#define NFSD_USAGE_WRAP (HZ*1000000)
+#else
+#define NFSD_USAGE_WRAP (2048*1000000)
+#endif

#ifdef __KERNEL__

diff --git a/include/linux/timex.h b/include/linux/timex.h
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -90,7 +90,7 @@
*
* FINENSEC is 1 ns in SHIFT_UPDATE units of the time_phase variable.
*/
-#define SHIFT_SCALE 22 /* phase scale (shift) */
+#define SHIFT_SCALE 25 /* phase scale (shift) */
#define SHIFT_UPDATE (SHIFT_KG + MAXTC) /* time offset scale (shift) */
#define SHIFT_USEC 16 /* frequency offset scale (shift) */
#define FINENSEC (1L << (SHIFT_SCALE - 10)) /* ~1 ns in phase units */
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -486,8 +486,8 @@ static __inline__ int tcp_sk_listen_hash
so that we select tick to get range about 4 seconds.
*/

-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
+#if HZ <= 16
+# error Unsupported: HZ <= 16
#elif HZ <= 32
# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
@@ -502,8 +502,14 @@ static __inline__ int tcp_sk_listen_hash
# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#else
+#elif HZ <= 4096
# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#elif HZ <= 8192
+# define TCP_TW_RECYCLE_TICK (13+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#elif HZ <= 16384
+# define TCP_TW_RECYCLE_TICK (14+2-TCP_TW_RECYCLE_SLOTS_LOG)
+#else
+# error Unsupported: HZ > 16384
#endif
/*
* TCP option
diff --git a/init/calibrate.c b/init/calibrate.c
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -119,16 +119,30 @@ void __devinit calibrate_delay(void)

if (preset_lpj) {
loops_per_jiffy = preset_lpj;
+#if HZ <= 5000
printk("Calibrating delay loop (skipped)... "
"%lu.%02lu BogoMIPS preset\n",
loops_per_jiffy/(500000/HZ),
(loops_per_jiffy/(5000/HZ)) % 100);
+#else
+ printk("Calibrating delay loop (skipped)... "
+ "%lu.%02lu BogoMIPS preset\n",
+ loops_per_jiffy/(500000/HZ),
+ (loops_per_jiffy*(HZ/5000)) % 100);
+#endif
} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
printk("Calibrating delay using timer specific routine.. ");
+#if HZ <= 5000
printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
loops_per_jiffy/(500000/HZ),
(loops_per_jiffy/(5000/HZ)) % 100,
loops_per_jiffy);
+#else
+ printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
+ loops_per_jiffy/(500000/HZ),
+ (loops_per_jiffy*(HZ/5000)) % 100,
+ loops_per_jiffy);
+#endif
} else {
loops_per_jiffy = (1<<12);

@@ -164,10 +178,17 @@ void __devinit calibrate_delay(void)
}

/* Round the value and print it */
+#if HZ <= 5000
printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
loops_per_jiffy/(500000/HZ),
(loops_per_jiffy/(5000/HZ)) % 100,
loops_per_jiffy);
+#else
+ printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
+ loops_per_jiffy/(500000/HZ),
+ (loops_per_jiffy*(HZ/5000)) % 100,
+ loops_per_jiffy);
+#endif
}

}
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -36,6 +36,11 @@ choice
1000 HZ is the preferred choice for desktop systems and other
systems requiring fast interactive responses to events.

+ config HZ_10000
+ bool "10000 HZ"
+ help
+ 10000 HZ is for testing only.
+
endchoice

config HZ
@@ -43,4 +48,5 @@ config HZ
default 100 if HZ_100
default 250 if HZ_250
default 1000 if HZ_1000
+ default 10000 if HZ_10000

diff --git a/kernel/timer.c b/kernel/timer.c
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -710,7 +710,7 @@ static void second_overflow(void)
if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
time_offset += ltemp;
- time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
} else {
ltemp = time_offset;
if (!(time_status & STA_FLL))
@@ -718,7 +718,7 @@ static void second_overflow(void)
if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
time_offset -= ltemp;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
}

/*
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -87,7 +87,7 @@ static const char *tcp_conntrack_names[]

unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
-unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
+unsigned long ip_ct_tcp_timeout_established = 2 DAYS;
unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;

--
Peter Osterlund - petero2@xxxxxxxxx
http://web.telia.com/~u89404340
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/