Re: [RFC] div64_64 support
From: Sami Farin
Date: Wed Mar 07 2007 - 13:33:25 EST
On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
> Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> >> And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> >> when doing 1000 loops test... gcc-4.0.3 works.
> >
> > Found it.
> >
> > --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >
> > __asm__("bsrl %1,%0\n\t"
> > "cmovzl %2,%0"
> > - : "=&r" (r) : "rm" (x), "rm" (-1));
> > + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> > return r+1;
> > }
> >
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
>
> Can you post the difference in the generated code with that change?
Fun.. looks when not using "memory" gcc does not even bother
calling ncubic() 666 times. So it gets better timings ( 42/666=0 ) =)
--- cbrt-test-no_memory.s 2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s 2007-03-07 20:22:38.237013197 +0200
...
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
- subl $136, %esp
+ subl $152, %esp
movl $.LC0, (%esp)
call puts
xorl %edx, %edx
movl $27, %eax
call ncubic
cmpl $3, %eax
- je .L83
+ je .L87
movl $.LC1, (%esp)
call puts
-.L83:
- xorl %eax, %eax
- xorl %edi, %edi
- movl %eax, 88(%esp)
+.L87:
xorl %eax, %eax
- xorl %esi, %esi
+ xorl %ebp, %ebp
movl %eax, 92(%esp)
xorl %eax, %eax
- xorl %ebp, %ebp
+ xorl %edi, %edi
movl %eax, 96(%esp)
xorl %eax, %eax
+ xorl %esi, %esi
movl %eax, 100(%esp)
xorl %eax, %eax
movl %eax, 104(%esp)
xorl %eax, %eax
movl %eax, 108(%esp)
- movl %edi, 112(%esp)
- movl %esi, 116(%esp)
- .p2align 4,,15
-.L84:
+ xorl %eax, %eax
+ movl %eax, 112(%esp)
+ movl %ebp, 116(%esp)
+ movl %edi, 120(%esp)
+ movl %esi, 124(%esp)
+.L88:
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
movl %eax, 56(%esp)
movl %edx, 60(%esp)
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
movl %eax, %esi
movl %edx, %edi
subl 56(%esp), %esi
sbbl 60(%esp), %edi
cmpl $0, %edi
ja .L66
cmpl $999, %esi
- jbe .L84
+ jbe .L88
.L66:
+ movl 92(%esp), %edx
+ leal (%edx,%edx,2), %eax
+ movl cases+4(,%eax,4), %edi
+ movl cases(,%eax,4), %esi
+ movl %edi, %edx
+ movl %esi, %eax
+ call ncubic
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
- movl %eax, %esi
- movl %edx, %edi
+ movl $666, %ebx
+ movl %eax, 128(%esp)
+ movl %edx, 132(%esp)
+ .p2align 4,,15
+.L67:
+ movl %esi, %eax
+ movl %edi, %edx
+ call ncubic
+ decl %ebx
+ movl %eax, %ebp
+ jne .L67
#APP
movl $0, %eax
cpuid
rdtsc
#NO_APP
- subl %esi, %eax
+ subl 128(%esp), %eax
movl $666, %ebx
- sbbl %edi, %edx
- xorl %ecx, %ecx
movl %ebx, 8(%esp)
+ sbbl 132(%esp), %edx
+ xorl %ecx, %ecx
movl %ecx, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call __udivdi3
- addl %eax, 104(%esp)
+ addl %eax, 112(%esp)
movl %edx, %ecx
movl %eax, %ebx
movl %edx, %esi
- adcl %edx, 108(%esp)
+ adcl %edx, 116(%esp)
imull %eax, %ecx
mull %ebx
addl %ecx, %ecx
movl %eax, 56(%esp)
addl %ecx, %edx
movl 56(%esp), %eax
- addl %eax, 112(%esp)
+ addl %eax, 120(%esp)
movl %edx, 60(%esp)
movl 60(%esp), %edx
- adcl %edx, 116(%esp)
- cmpl %esi, 92(%esp)
- ja .L67
- jb .L68
- cmpl %ebx, 88(%esp)
- jae .L67
-.L68:
- movl %ebx, 88(%esp)
- movl %esi, 92(%esp)
-.L67:
- leal (%ebp,%ebp,2), %ebx
- sall $2, %ebx
- movl cases+4(%ebx), %edx
- movl cases(%ebx), %eax
- call ncubic
- movl cases+8(%ebx), %edx
- subl %eax, %edx
- movl %edx, %eax
- sarl $31, %eax
- xorl %eax, %edx
- subl %eax, %edx
- movl %edx, %ecx
- sarl $31, %ecx
- addl %edx, 96(%esp)
- adcl %ecx, 100(%esp)
- incl %ebp
- cmpl $183, %ebp
- jbe .L84
- movl 108(%esp), %eax
- fildll 104(%esp)
- testl %eax, %eax
- js .L85
+ adcl %edx, 124(%esp)
+ cmpl %esi, 100(%esp)
+ ja .L69
+ jb .L70
+ cmpl %ebx, 96(%esp)
+ jae .L69
.L70:
- fstpl 120(%esp)
+ movl %ebx, 96(%esp)
+ movl %esi, 100(%esp)
+.L69:
+ movl 92(%esp), %edx
+ leal (%edx,%edx,2), %eax
+ movl cases+8(,%eax,4), %eax
+ subl %ebp, %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ xorl %ecx, %eax
+ subl %ecx, %eax
+ cltd
+ addl %eax, 104(%esp)
+ adcl %edx, 108(%esp)
+ incl 92(%esp)
+ cmpl $183, 92(%esp)
+ jbe .L88
movl 116(%esp), %eax
- fldl 120(%esp)
+ fildll 112(%esp)
+ testl %eax, %eax
+ js .L89
+.L72:
+ fstpl 136(%esp)
+ movl 124(%esp), %eax
+ fldl 136(%esp)
fdivl .LC7
testl %eax, %eax
flds .LC4
fdivr %st, %st(1)
- fildll 112(%esp)
- js .L86
-.L71:
- fstpl 120(%esp)
- fldl 120(%esp)
+ fildll 120(%esp)
+ js .L90
+.L73:
+ fstpl 136(%esp)
+ fldl 136(%esp)
fdivl .LC7
fdivp %st, %st(1)
fld %st(1)
fmul %st(2), %st
fsubrp %st, %st(1)
fld %st(0)
fsqrt
fucomi %st(0), %st
- jp .L88
- je .L89
-.L88:
+ jp .L92
+ je .L93
+.L92:
fstp %st(0)
fstpl (%esp)
fstpl 64(%esp)
call sqrt
fldl 64(%esp)
fxch %st(1)
-.L72:
- movl 96(%esp), %eax
- movl 100(%esp), %edx
- fildll 88(%esp)
+.L74:
+ movl 104(%esp), %eax
+ movl 108(%esp), %edx
+ fildll 96(%esp)
movl %eax, 40(%esp)
- movl 92(%esp), %eax
+ movl 100(%esp), %eax
movl %edx, 44(%esp)
testl %eax, %eax
- js .L87
-.L73:
- fstpl 120(%esp)
- movl 104(%esp), %eax
+ js .L91
+.L75:
+ fstpl 136(%esp)
+ movl 112(%esp), %eax
movl $184, %ebp
- fldl 120(%esp)
+ fldl 136(%esp)
xorl %edi, %edi
movl $.LC5, %esi
fdivl .LC7
- movl 108(%esp), %edx
+ movl 116(%esp), %edx
movl %ebp, 8(%esp)
movl %edi, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
fstpl 32(%esp)
fstpl 24(%esp)
fstpl 16(%esp)
call __udivdi3
movl %esi, 4(%esp)
movl $.LC6, (%esp)
movl %eax, 8(%esp)
movl %edx, 12(%esp)
call printf
- addl $136, %esp
+ addl $152, %esp
xorl %eax, %eax
popl %ecx
popl %ebx
popl %esi
popl %edi
popl %ebp
leal -4(%ecx), %esp
ret
-.L89:
+.L93:
fstp %st(1)
+ jmp .L74
+.L89:
+ fadds .LC2
jmp .L72
-.L85:
+.L91:
fadds .LC2
- jmp .L70
-.L87:
+ jmp .L75
+.L90:
fadds .LC2
jmp .L73
-.L86:
- fadds .LC2
- jmp .L71
.size main, .-main
.section .rodata
.align 32
.type cases, @object
.size cases, 2208
cases:
...
--
/*
Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.
Slightly gross test program (with original cubic wraparound bug fixed).
---
*/
/* Test and measure perf of cube root algorithms. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#define MEMCLOBBER 1
/**
* ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
static inline int ffs(int x)
{
int r;
__asm__("bsfl %1,%0\n\t"
"cmovzl %2,%0"
#if MEMCLOBBER
: "=&r" (r) : "rm" (x), "r" (-1) : "memory");
#else
: "=&r" (r) : "rm" (x), "r" (-1));
#endif
return r+1;
}
/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs.
*/
static inline int fls(int x)
{
int r;
__asm__("bsrl %1,%0\n\t"
"cmovzl %2,%0"
#if MEMCLOBBER
: "=&r" (r) : "rm" (x), "r" (-1) : "memory" );
#else
: "=&r" (r) : "rm" (x), "r" (-1));
#endif
return r+1;
}
#ifdef __x86_64
#define rdtscll(val) do { \
unsigned int __a,__d; \
asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)
# define do_div(n,base) ({ \
uint32_t __base = (base); \
uint32_t __rem; \
__rem = ((uint64_t)(n)) % __base; \
(n) = ((uint64_t)(n)) / __base; \
__rem; \
})
/**
* __ffs - find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
__asm__("bsfq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/*
* __fls: find last bit set.
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long __fls(unsigned long word)
{
__asm__("bsrq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
* This is defined the same way as fls.
*/
static inline int fls64(uint64_t x)
{
if (x == 0)
return 0;
return __fls(x) + 1;
}
static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
return dividend / divisor;
}
#elif __i386
#define rdtscll_start(val) \
__asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx")
#define rdtscll(val) \
__asm__ __volatile__("rdtsc" : "=A" (val))
static inline int fls64(uint64_t x)
{
uint32_t h = x >> 32;
if (h)
return fls(h) + 32;
return fls(x);
}
#define do_div(n,base) ({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
asm("":"=a" (__low), "=d" (__high):"A" (n)); \
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
asm("":"=A" (n):"a" (__low),"d" (__high)); \
__mod; \
})
/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
uint32_t d = divisor;
if (divisor > 0xffffffffULL) {
unsigned int shift = fls(divisor >> 32);
d = divisor >> shift;
dividend >>= shift;
}
/* avoid 64 bit division if possible */
if (dividend >> 32)
do_div(dividend, d);
else
dividend = (uint32_t) dividend / d;
return dividend;
}
#endif
/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
uint32_t y = 0;
int s;
for (s = 63; s >= 0; s -= 3) {
uint64_t b, bs;
y = 2 * y;
b = 3 * y * (y+1) + 1;
bs = b << s;
if (x >= bs && (b == (bs>>s))) { /* avoid overflow */
x -= bs;
y++;
}
}
return y;
}
/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
int s = 60;
uint32_t y = 0;
do {
uint64_t b;
y = 2*y;
b = (uint64_t)(3*y*(y + 1) + 1) << s;
s = s - 3;
if (x >= b) {
x = x - b;
y = y + 1;
}
} while(s >= 0);
return y;
}
/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
uint32_t x, x1;
/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
*/
x = 1u << (fls64(a)/3);
/*
* Iteration based on:
* 2
* x = ( 2 * x + a / x ) / 3
* k+1 k k
*/
do {
uint64_t x2;
x2 = x;
x2 *= x;
x1 = x;
x = (2 * x + div64_64(a, x2)) / 3;
} while (abs(x1 - x) > 1);
return x;
}
/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
uint64_t x;
/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
*/
x = 1u << (fls64(a)/3);
/* Converges in 3 iterations to > 32 bits */
x = (2 * x + div64_64(a, x*x)) / 3;
x = (2 * x + div64_64(a, x*x)) / 3;
x = (2 * x + div64_64(a, x*x)) / 3;
return x;
}
static const struct cbrt {
uint64_t in;
uint32_t result;
} cases[] = {
{1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1},
{8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
{15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
{23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
{31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
{39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4},
{ 125ull, 5 }, { 216ull, 6 }, { 343ull, 7 }, { 512ull, 8 },
{ 1000ull, 10 }, { 1331ull, 11 },
{ 8000ull, 20 }, { 9261ull, 21 },
{32767, 31}, {32768, 32}, {32769, 32},
{ 64000ull, 40 }, { 68921ull, 41 },
{ 512000ull, 80 }, { 531441ull, 81 },
{ 1000000ull, 100 }, { 1030301ull, 101 },
{ 4096000ull, 160 }, { 4173281ull, 161 },
{ 16387064ull, 254 }, { 16581375ull, 255 },
{ 16777216ull, 256 }, { 16974593ull, 257 },
{ 131096512ull, 508 }, { 131872229ull, 509 },
{ 132651000ull, 510 }, { 133432831ull, 511 },
{ 134217728ull, 512 }, { 135005697ull, 513 },
{ 1000000000ull, 1000 }, { 1003003001ull, 1001 },
{ 1006012008ull, 1002 }, { 1009027027ull, 1003 },
{ 1061208000ull, 1020 }, { 1064332261ull, 1021 },
{ 1067462648ull, 1022 }, { 1070599167ull, 1023 },
{1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
{~0, 2642245},
/* 100 random values */
{ 7749363893351949254ull, 1978891}, { 7222815480849057907ull, 1933016},
{ 8408462745175416063ull, 2033475}, { 3091884191388096748ull, 1456826},
{ 2562019500164152525ull, 1368340}, { 4403210617922443179ull, 1639041},
{ 3364542905362882299ull, 1498449}, { 8782769017716072774ull, 2063211},
{ 5863405773976003266ull, 1803225}, { 1306053050111174648ull, 1093084},
{ 150346236956174824ull, 531737}, { 1265737889039205261ull, 1081719},
{ 1445109530774087002ull, 1130577}, { 1197105577171186275ull, 1061803},
{ 9213452462461015967ull, 2096399}, { 4730966302945445786ull, 1678739},
{ 5650605098630667570ull, 1781141}, { 5880381756353009591ull, 1804963},
{ 4552499520046621784ull, 1657359}, { 2697991130065918298ull, 1392131},
{ 4858364911220984157ull, 1693674}, { 3691457481531040535ull, 1545489},
{ 2613117305472506601ull, 1377377}, { 7449943749836318932ull, 1953069},
{ 643378865959570610ull, 863287}, { 4851450802679832774ull, 1692871},
{ 1772859812839988916ull, 1210295}, { 8210946489571640849ull, 2017426},
{ 591875965497384322ull, 839608}, { 4221553402965100097ull, 1616183},
{ 2197744667347238205ull, 1300146}, { 8321400714356781191ull, 2026432},
{ 2459557415995497961ull, 1349850}, { 3460673533926954145ull, 1512586},
{ 4727304344741345505ull, 1678306}, { 4903203917250634599ull, 1698869},
{ 4036494370831490817ull, 1592214}, { 8585205035691420311ull, 2047624},
{ 2622143824319236828ull, 1378961}, { 5902762718897731478ull, 1807250},
{ 6344401509618197560ull, 1851243}, { 4059247793194552874ull, 1595200},
{ 7648030174294342832ull, 1970228}, { 2111858627070002939ull, 1282985},
{ 3231502273651985583ull, 1478432}, { 8821862535190318932ull, 2066268},
{ 6062559696943389464ull, 1823414}, { 4054224670122353756ull, 1594541},
{ 3674929609692563482ull, 1543179}, { 6310802012126231363ull, 1847969},
{ 4450190829039920890ull, 1644849}, { 8764531173541462842ull, 2061782},
{ 1361923252301505833ull, 1108453}, { 5912924843615600614ull, 1808287},
{ 5714768882048811324ull, 1787857}, { 7249589769047033748ull, 1935401},
{ 4123157012528828376ull, 1603528}, { 1729687638268160097ull, 1200390},
{ 5132287771298228729ull, 1724925}, { 1564349257200314043ull, 1160854},
{ 951586254223522969ull, 983594}, { 4569664949094662293ull, 1659439},
{ 9082730968228181483ull, 2086437}, { 6312891027251024051ull, 1848173},
{ 6915415788559031791ull, 1905194}, { 2713150456497618688ull, 1394733},
{ 5390954890749602465ull, 1753430}, { 1405547745908296421ull, 1120164},
{ 1157301728707637259ull, 1049902}, { 1513573187112042448ull, 1148156},
{ 687416080475161159ull, 882551}, { 484496930861389501ull, 785411},
{ 1625256440396143907ull, 1175729}, { 7358388240824901288ull, 1945035},
{ 6055730836615196283ull, 1822729}, { 5897962221937294789ull, 1806760},
{ 862205218853780339ull, 951780}, { 4798091009445823173ull, 1686641},
{ 644772714391937867ull, 863910}, { 4255852691293155171ull, 1620549},
{ 5287931004512034672ull, 1742188}, { 479051048987854372ull, 782457},
{ 9223312736680112286ull, 2097147}, { 8208392001457969628ull, 2017217},
{ 9203071384420047828ull, 2095612}, { 8029313043584389618ull, 2002439},
{ 38384068872053008ull, 337326}, { 5477688516749455419ull, 1762784},
{ 1504622508868036557ull, 1145888}, { 8421184723110053200ull, 2034500},
{ 3312070181890020423ull, 1490618}, { 5344298403762143580ull, 1748357},
{ 6340030040222269807ull, 1850818}, { 4895839553118470425ull, 1698018},
{ 2806627376195262363ull, 1410570}, { 5321619225005368821ull, 1745880},
{ 6897323351052656353ull, 1903532}, { 326700202259382556ull, 688731},
{ 7685269066741890339ull, 1973420}, { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))
static double ticks_per_usec = 2979.0;
static void show(const char *func, uint64_t sum, uint64_t sq,
unsigned long long mx, unsigned long long err)
{
double mean, std;
mean = (double) sum / ticks_per_usec / NCASES ;
std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean);
printf("%-10s %8llu %8.3f %8.3f %8.3f %11llu\n", func,
(unsigned long long) sum / NCASES, mean, std,
(double) mx / ticks_per_usec, err);
}
int main(int argc, char **argv)
{
int i;
uint32_t c;
unsigned long long start, end, t, mx;
unsigned long long err, sum, sum_sq, cerr;
int loops;
#if 0
rdtscll(start);
sleep(2);
rdtscll(end);
ticks_per_usec = (double) (end - start) / 2000000.;
#endif
printf("Function clocks mean(us) max(us) std(us) total error\n");
#define DOTEST(func) \
if (func(27) != 3) printf("ouch\n"); \
sum = sum_sq = mx = 0; \
for (err = 0, i = 0; i < NCASES; i++) { \
do { \
rdtscll_start(start); \
rdtscll_start(end); \
} while ((end - start) < 1000); \
c = func(cases[i].in); \
loops = 667; \
rdtscll_start(start); \
while (--loops > 0) c = func(cases[i].in); \
rdtscll_start(end); \
t = (end - start) / 666; sum += t; sum_sq += t*t; \
if (t > mx) mx = t; \
cerr = abs((long) cases[i].result - c); \
err += cerr; \
} \
show(#func, sum, sum_sq, mx, err);
//DOTEST(ocubic);
DOTEST(ncubic);
//DOTEST(acbrt);
//DOTEST(hcbrt);
return 0;
}