Re: High irq load (Re: [PATCH] i386: Selectable Frequency of the Timer Interrupt)

From: Herbert Poetzl
Date: Mon Jul 18 2005 - 00:55:05 EST


On Thu, Jul 14, 2005 at 04:25:12PM +0200, Peter Osterlund wrote:
> Linus Torvalds <torvalds@xxxxxxxx> writes:
>
> > On Wed, 13 Jul 2005, Jan Engelhardt wrote:
> > >
> > > No, some kernel code causes a triple-fault-and-reboot when the HZ is >=
> > > 10KHz. Maybe the highest possible value is 8192 Hz, not sure.
> >
> > Can you post the triple-fault message? It really shouldn't triple-fault,
> > although it _will_ obviously spend all time just doing timer interrupts,
> > so it shouldn't get much (if any) real work done either.
> ...
> > There should be no conceptual "highest possible HZ", although there are
> > certainly obvious practical limits to it (both on the timer hw itself, and
> > just the fact that at some point we'll spend all time on the timer
> > interrupt and won't get anything done..)
>
> HZ=10000 appears to work fine here after some hacks to avoid
> over/underflows in integer arithmetic. gkrellm reports about 3-4% CPU
> usage when the system is idle, on a 3.07 GHz P4.

Yep, we've gone up to 20 kHz actually, but this
requires some changes to long-lasting network
timeouts :)

Nevertheless, 20 Hz - 20 kHz works fine on 'most'
architectures ...

best,
Herbert

> ---
>
> Makefile | 2 +-
> arch/i386/kernel/cpu/proc.c | 6 ++++++
> fs/nfsd/nfssvc.c | 2 +-
> include/linux/jiffies.h | 6 ++++++
> include/linux/nfsd/stats.h | 4 ++++
> include/linux/timex.h | 2 +-
> include/net/tcp.h | 12 +++++++++---
> init/calibrate.c | 21 +++++++++++++++++++++
> kernel/Kconfig.hz | 6 ++++++
> kernel/timer.c | 4 ++--
> net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 2 +-
> 11 files changed, 58 insertions(+), 9 deletions(-)
>
> diff --git a/Makefile b/Makefile
> --- a/Makefile
> +++ b/Makefile
> @@ -1,7 +1,7 @@
> VERSION = 2
> PATCHLEVEL = 6
> SUBLEVEL = 13
> -EXTRAVERSION =-rc3
> +EXTRAVERSION =-rc3-test
> NAME=Woozy Numbat
>
> # *DOCUMENTATION*
> diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
> --- a/arch/i386/kernel/cpu/proc.c
> +++ b/arch/i386/kernel/cpu/proc.c
> @@ -128,9 +128,15 @@ static int show_cpuinfo(struct seq_file
> x86_cap_flags[i] != NULL )
> seq_printf(m, " %s", x86_cap_flags[i]);
>
> +#if HZ <= 5000
> seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
> c->loops_per_jiffy/(500000/HZ),
> (c->loops_per_jiffy/(5000/HZ)) % 100);
> +#else
> + seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
> + c->loops_per_jiffy/(500000/HZ),
> + (c->loops_per_jiffy*(HZ/5000)) % 100);
> +#endif
>
> return 0;
> }
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -160,7 +160,7 @@ update_thread_usage(int busy_threads)
> decile = busy_threads*10/nfsdstats.th_cnt;
> if (decile>0 && decile <= 10) {
> diff = nfsd_last_call - prev_call;
> - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
> + if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
> nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
> if (decile == 10)
> nfsdstats.th_fullcnt++;
> diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
> --- a/include/linux/jiffies.h
> +++ b/include/linux/jiffies.h
> @@ -38,6 +38,12 @@
> # define SHIFT_HZ 9
> #elif HZ >= 768 && HZ < 1536
> # define SHIFT_HZ 10
> +#elif HZ >= 1536 && HZ < 3072
> +# define SHIFT_HZ 11
> +#elif HZ >= 3072 && HZ < 6144
> +# define SHIFT_HZ 12
> +#elif HZ >= 6144 && HZ < 12288
> +# define SHIFT_HZ 13
> #else
> # error You lose.
> #endif
> diff --git a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h
> --- a/include/linux/nfsd/stats.h
> +++ b/include/linux/nfsd/stats.h
> @@ -30,7 +30,11 @@ struct nfsd_stats {
> };
>
> /* thread usage wraps very million seconds (approx one fortnight) */
> +#if HZ < 2048
> #define NFSD_USAGE_WRAP (HZ*1000000)
> +#else
> +#define NFSD_USAGE_WRAP (2048*1000000)
> +#endif
>
> #ifdef __KERNEL__
>
> diff --git a/include/linux/timex.h b/include/linux/timex.h
> --- a/include/linux/timex.h
> +++ b/include/linux/timex.h
> @@ -90,7 +90,7 @@
> *
> * FINENSEC is 1 ns in SHIFT_UPDATE units of the time_phase variable.
> */
> -#define SHIFT_SCALE 22 /* phase scale (shift) */
> +#define SHIFT_SCALE 25 /* phase scale (shift) */
> #define SHIFT_UPDATE (SHIFT_KG + MAXTC) /* time offset scale (shift) */
> #define SHIFT_USEC 16 /* frequency offset scale (shift) */
> #define FINENSEC (1L << (SHIFT_SCALE - 10)) /* ~1 ns in phase units */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -486,8 +486,8 @@ static __inline__ int tcp_sk_listen_hash
> so that we select tick to get range about 4 seconds.
> */
>
> -#if HZ <= 16 || HZ > 4096
> -# error Unsupported: HZ <= 16 or HZ > 4096
> +#if HZ <= 16
> +# error Unsupported: HZ <= 16
> #elif HZ <= 32
> # define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
> #elif HZ <= 64
> @@ -502,8 +502,14 @@ static __inline__ int tcp_sk_listen_hash
> # define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
> #elif HZ <= 2048
> # define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
> -#else
> +#elif HZ <= 4096
> # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
> +#elif HZ <= 8192
> +# define TCP_TW_RECYCLE_TICK (13+2-TCP_TW_RECYCLE_SLOTS_LOG)
> +#elif HZ <= 16384
> +# define TCP_TW_RECYCLE_TICK (14+2-TCP_TW_RECYCLE_SLOTS_LOG)
> +#else
> +# error Unsupported: HZ > 16384
> #endif
> /*
> * TCP option
> diff --git a/init/calibrate.c b/init/calibrate.c
> --- a/init/calibrate.c
> +++ b/init/calibrate.c
> @@ -119,16 +119,30 @@ void __devinit calibrate_delay(void)
>
> if (preset_lpj) {
> loops_per_jiffy = preset_lpj;
> +#if HZ <= 5000
> printk("Calibrating delay loop (skipped)... "
> "%lu.%02lu BogoMIPS preset\n",
> loops_per_jiffy/(500000/HZ),
> (loops_per_jiffy/(5000/HZ)) % 100);
> +#else
> + printk("Calibrating delay loop (skipped)... "
> + "%lu.%02lu BogoMIPS preset\n",
> + loops_per_jiffy/(500000/HZ),
> + (loops_per_jiffy*(HZ/5000)) % 100);
> +#endif
> } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
> printk("Calibrating delay using timer specific routine.. ");
> +#if HZ <= 5000
> printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
> loops_per_jiffy/(500000/HZ),
> (loops_per_jiffy/(5000/HZ)) % 100,
> loops_per_jiffy);
> +#else
> + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
> + loops_per_jiffy/(500000/HZ),
> + (loops_per_jiffy*(HZ/5000)) % 100,
> + loops_per_jiffy);
> +#endif
> } else {
> loops_per_jiffy = (1<<12);
>
> @@ -164,10 +178,17 @@ void __devinit calibrate_delay(void)
> }
>
> /* Round the value and print it */
> +#if HZ <= 5000
> printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
> loops_per_jiffy/(500000/HZ),
> (loops_per_jiffy/(5000/HZ)) % 100,
> loops_per_jiffy);
> +#else
> + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
> + loops_per_jiffy/(500000/HZ),
> + (loops_per_jiffy*(HZ/5000)) % 100,
> + loops_per_jiffy);
> +#endif
> }
>
> }
> diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
> --- a/kernel/Kconfig.hz
> +++ b/kernel/Kconfig.hz
> @@ -36,6 +36,11 @@ choice
> 1000 HZ is the preferred choice for desktop systems and other
> systems requiring fast interactive responses to events.
>
> + config HZ_10000
> + bool "10000 HZ"
> + help
> + 10000 HZ is for testing only.
> +
> endchoice
>
> config HZ
> @@ -43,4 +48,5 @@ config HZ
> default 100 if HZ_100
> default 250 if HZ_250
> default 1000 if HZ_1000
> + default 10000 if HZ_10000
>
> diff --git a/kernel/timer.c b/kernel/timer.c
> --- a/kernel/timer.c
> +++ b/kernel/timer.c
> @@ -710,7 +710,7 @@ static void second_overflow(void)
> if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
> ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
> time_offset += ltemp;
> - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
> + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
> } else {
> ltemp = time_offset;
> if (!(time_status & STA_FLL))
> @@ -718,7 +718,7 @@ static void second_overflow(void)
> if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
> ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
> time_offset -= ltemp;
> - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
> + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
> }
>
> /*
> diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
> --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
> +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
> @@ -87,7 +87,7 @@ static const char *tcp_conntrack_names[]
>
> unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
> unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
> -unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
> +unsigned long ip_ct_tcp_timeout_established = 2 DAYS;
> unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
> unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
> unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
>
> --
> Peter Osterlund - petero2@xxxxxxxxx
> http://web.telia.com/~u89404340
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/