Re: [RFC] div64_64 support

From: Stephen Hemminger
Date: Tue Mar 06 2007 - 13:34:39 EST


Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.

Slightly gross test program (with original cubic wraparound bug fixed).

---
/* Test and measure perf of cube root algorithms. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#ifdef __x86_64

#define rdtscll(val) do { \
unsigned int __a,__d; \
asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({ \
uint32_t __base = (base); \
uint32_t __rem; \
__rem = ((uint64_t)(n)) % __base; \
(n) = ((uint64_t)(n)) / __base; \
__rem; \
})


/**
* __ffs - find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __inline__ unsigned long __ffs(unsigned long word)
{
__asm__("bsfq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}

/*
* __fls: find last bit set.
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long __fls(unsigned long word)
{
__asm__("bsrq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}

/**
* ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
static __inline__ int ffs(int x)
{
int r;

__asm__("bsfl %1,%0\n\t"
"cmovzl %2,%0"
: "=r" (r) : "rm" (x), "r" (-1));
return r+1;
}

/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs.
*/
static inline int fls(int x)
{
int r;

__asm__("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
return r+1;
}

/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
* This is defined the same way as fls.
*/
static inline int fls64(uint64_t x)
{
if (x == 0)
return 0;
return __fls(x) + 1;
}

static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
__asm__ __volatile__("rdtsc" : "=A" (val))

/**
* ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz() (man ffs).
*/
static inline int ffs(int x)
{
int r;

__asm__("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r+1;
}

/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs().
*/
static inline int fls(int x)
{
int r;

__asm__("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r+1;
}

static inline int fls64(uint64_t x)
{
uint32_t h = x >> 32;
if (h)
return fls(h) + 32;
return fls(x);
}


#define do_div(n,base) ({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
asm("":"=a" (__low), "=d" (__high):"A" (n)); \
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
asm("":"=A" (n):"a" (__low),"d" (__high)); \
__mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
uint32_t d = divisor;

if (divisor > 0xffffffffULL) {
unsigned int shift = fls(divisor >> 32);

d = divisor >> shift;
dividend >>= shift;
}

/* avoid 64 bit division if possible */
if (dividend >> 32)
do_div(dividend, d);
else
dividend = (uint32_t) dividend / d;

return dividend;
}
#endif

/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
uint32_t y = 0;
int s;

for (s = 63; s >= 0; s -= 3) {
uint64_t b, bs;

y = 2 * y;
b = 3 * y * (y+1) + 1;
bs = b << s;
if (x >= bs && (b == (bs>>s))) { /* avoid overflow */
x -= bs;
y++;
}
}
return y;
}

/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
int s = 60;
uint32_t y = 0;

do {
uint64_t b;
y = 2*y;
b = (uint64_t)(3*y*(y + 1) + 1) << s;
s = s - 3;
if (x >= b) {
x = x - b;
y = y + 1;
}
} while(s >= 0);

return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
uint32_t x, x1;

/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
*/
x = 1u << (fls64(a)/3);

/*
* Iteration based on:
* 2
* x = ( 2 * x + a / x ) / 3
* k+1 k k
*/
do {
uint64_t x2;

x2 = x;
x2 *= x;
x1 = x;

x = (2 * x + div64_64(a, x2)) / 3;
} while (abs(x1 - x) > 1);

return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
uint64_t x;

/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
*/
x = 1u << (fls64(a)/3);

/* Converges in 3 iterations to > 32 bits */

x = (2 * x + div64_64(a, x*x)) / 3;
x = (2 * x + div64_64(a, x*x)) / 3;
x = (2 * x + div64_64(a, x*x)) / 3;

return x;
}

static const struct cbrt {
uint64_t in;
uint32_t result;
} cases[] = {
{1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1},
{8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
{15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
{23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
{31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
{39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4},
{ 125ull, 5 }, { 216ull, 6 }, { 343ull, 7 }, { 512ull, 8 },
{ 1000ull, 10 }, { 1331ull, 11 },
{ 8000ull, 20 }, { 9261ull, 21 },
{32767, 31}, {32768, 32}, {32769, 32},
{ 64000ull, 40 }, { 68921ull, 41 },
{ 512000ull, 80 }, { 531441ull, 81 },
{ 1000000ull, 100 }, { 1030301ull, 101 },
{ 4096000ull, 160 }, { 4173281ull, 161 },
{ 16387064ull, 254 }, { 16581375ull, 255 },
{ 16777216ull, 256 }, { 16974593ull, 257 },
{ 131096512ull, 508 }, { 131872229ull, 509 },
{ 132651000ull, 510 }, { 133432831ull, 511 },
{ 134217728ull, 512 }, { 135005697ull, 513 },
{ 1000000000ull, 1000 }, { 1003003001ull, 1001 },
{ 1006012008ull, 1002 }, { 1009027027ull, 1003 },
{ 1061208000ull, 1020 }, { 1064332261ull, 1021 },
{ 1067462648ull, 1022 }, { 1070599167ull, 1023 },

{1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
{~0, 2097151},
/* 100 random values */
{ 7749363893351949254ull, 1978891}, { 7222815480849057907ull, 1933016},
{ 8408462745175416063ull, 2033475}, { 3091884191388096748ull, 1456826},
{ 2562019500164152525ull, 1368340}, { 4403210617922443179ull, 1639041},
{ 3364542905362882299ull, 1498449}, { 8782769017716072774ull, 2063211},
{ 5863405773976003266ull, 1803225}, { 1306053050111174648ull, 1093084},
{ 150346236956174824ull, 531737}, { 1265737889039205261ull, 1081719},
{ 1445109530774087002ull, 1130577}, { 1197105577171186275ull, 1061803},
{ 9213452462461015967ull, 2096399}, { 4730966302945445786ull, 1678739},
{ 5650605098630667570ull, 1781141}, { 5880381756353009591ull, 1804963},
{ 4552499520046621784ull, 1657359}, { 2697991130065918298ull, 1392131},
{ 4858364911220984157ull, 1693674}, { 3691457481531040535ull, 1545489},
{ 2613117305472506601ull, 1377377}, { 7449943749836318932ull, 1953069},
{ 643378865959570610ull, 863287}, { 4851450802679832774ull, 1692871},
{ 1772859812839988916ull, 1210295}, { 8210946489571640849ull, 2017426},
{ 591875965497384322ull, 839608}, { 4221553402965100097ull, 1616183},
{ 2197744667347238205ull, 1300146}, { 8321400714356781191ull, 2026432},
{ 2459557415995497961ull, 1349850}, { 3460673533926954145ull, 1512586},
{ 4727304344741345505ull, 1678306}, { 4903203917250634599ull, 1698869},
{ 4036494370831490817ull, 1592214}, { 8585205035691420311ull, 2047624},
{ 2622143824319236828ull, 1378961}, { 5902762718897731478ull, 1807250},
{ 6344401509618197560ull, 1851243}, { 4059247793194552874ull, 1595200},
{ 7648030174294342832ull, 1970228}, { 2111858627070002939ull, 1282985},
{ 3231502273651985583ull, 1478432}, { 8821862535190318932ull, 2066268},
{ 6062559696943389464ull, 1823414}, { 4054224670122353756ull, 1594541},
{ 3674929609692563482ull, 1543179}, { 6310802012126231363ull, 1847969},
{ 4450190829039920890ull, 1644849}, { 8764531173541462842ull, 2061782},
{ 1361923252301505833ull, 1108453}, { 5912924843615600614ull, 1808287},
{ 5714768882048811324ull, 1787857}, { 7249589769047033748ull, 1935401},
{ 4123157012528828376ull, 1603528}, { 1729687638268160097ull, 1200390},
{ 5132287771298228729ull, 1724925}, { 1564349257200314043ull, 1160854},
{ 951586254223522969ull, 983594}, { 4569664949094662293ull, 1659439},
{ 9082730968228181483ull, 2086437}, { 6312891027251024051ull, 1848173},
{ 6915415788559031791ull, 1905194}, { 2713150456497618688ull, 1394733},
{ 5390954890749602465ull, 1753430}, { 1405547745908296421ull, 1120164},
{ 1157301728707637259ull, 1049902}, { 1513573187112042448ull, 1148156},
{ 687416080475161159ull, 882551}, { 484496930861389501ull, 785411},
{ 1625256440396143907ull, 1175729}, { 7358388240824901288ull, 1945035},
{ 6055730836615196283ull, 1822729}, { 5897962221937294789ull, 1806760},
{ 862205218853780339ull, 951780}, { 4798091009445823173ull, 1686641},
{ 644772714391937867ull, 863910}, { 4255852691293155171ull, 1620549},
{ 5287931004512034672ull, 1742188}, { 479051048987854372ull, 782457},
{ 9223312736680112286ull, 2097147}, { 8208392001457969628ull, 2017217},
{ 9203071384420047828ull, 2095612}, { 8029313043584389618ull, 2002439},
{ 38384068872053008ull, 337326}, { 5477688516749455419ull, 1762784},
{ 1504622508868036557ull, 1145888}, { 8421184723110053200ull, 2034500},
{ 3312070181890020423ull, 1490618}, { 5344298403762143580ull, 1748357},
{ 6340030040222269807ull, 1850818}, { 4895839553118470425ull, 1698018},
{ 2806627376195262363ull, 1410570}, { 5321619225005368821ull, 1745880},
{ 6897323351052656353ull, 1903532}, { 326700202259382556ull, 688731},
{ 7685269066741890339ull, 1973420}, { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))


static double ticks_per_usec;

static void show(const char *func, uint64_t sum, uint64_t sq,
unsigned long long mx, unsigned long err)
{
double mean, std;

mean = (double) sum / ticks_per_usec / NCASES ;
std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean);

printf("%-10s %8llu %8.2f %8.2f %8.2f %lu\n", func,
(unsigned long long) sum / NCASES, mean, std,
(double) mx / ticks_per_usec, err);
}


int main(int argc, char **argv)
{
int i;
uint32_t c;
unsigned long long start, end, t, mx;
unsigned long long err, sum, sum_sq;

rdtscll(start);
sleep(2);
rdtscll(end);
ticks_per_usec = (double) (end - start) / 2000000.;
printf("Function clocks mean(us) max(us) std(us) total error\n");

#define DOTEST(func) \
if (func(27) != 3) printf("ouch\n"); \
sum = sum_sq = mx = 0; \
for (err = 0, i = 0; i < NCASES; i++) { \
rdtscll(start); \
c = func(cases[i].in); \
rdtscll(end); \
t = end - start; sum += t; sum_sq += t*t; \
if (t > mx) mx = t; \
err += abs((long) cases[i].result - c); \
} \
show(#func, sum, sum_sq, mx, err);

DOTEST(ocubic);
DOTEST(ncubic);
DOTEST(acbrt);
DOTEST(hcbrt);
return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/