[PATCH] Re: sqrt 10x slower in 2.0.7-19 than 2.0.6-9 ???

John Salmon (johns@cacr.caltech.edu)
Fri, 2 Oct 1998 16:45:54 -0700


To whom it may concern,

Here is a patch to glibc-2.0.7/sysdeps/alpha/fpu/e_sqrt.c that
recovers most of the performance lost when the 'fast' versions were
switched off due to erroneous values at or near DBL_MIN. Accuracy is
NOT compromised. Also included is a 'torture' test that verifies
results by testing the value returned by sqrt for several million
values throughout the entire range of double precision values. Of
course, this test is not exhaustive. Proving the correctness of
floating point functions is notoriously hard, but I believe this patch
is reliable.

I understand that there are 'fast libraries' out there, but I believe
that a tiny patch that can recover a factor of five in performance is
worth making part of glibc even if somebody else has a faster library.

The underlying cause of the problem is that the alpha does not really
fully support the IEEE-754 standard in hardware. In particular, the
'gradual underflow' provisions of the standard are meant to be
implemented by software which does not appear to be part of glibc yet.
This means that arithmetic on values at or near DBL_MIN can be
counter-intuitive.

In particular, the expression in e_sqrt.c:

y = y*(one_and_a_half - half*x*y*y);

first computes half*x, which results in 0.0 for x in the
problem-range! From then on, the calculation is garbage.

The problem is easy to repair. Simply reorder the multiplications.
The product x*y is properly normalized (it's nearly 1./sqrt(x)), so
rewrite the expression as:

y = y*((one_and_a_half - two_to_minus_30) - half*y*(x*y));

After fixing the above, I encountered an additional problem with a
floating point exception (underflow?) being thrown from the evaluation
of:
high = high - x;
for very small values of x. Another simple reorganization seems to
fix it (by comparing high with x, rather than high-x with 0).

I did not repair the assembly version of the code. I added additional
remarks to the descriptive comment instead.

Tests:

1 - sqrttorture.c (see below) tests millions of logarithmically
spaced arguments to sqrt(). The worst case appears to be 1ulb. The
mean error appears to be 0, and the mean absolute error appears to be
about 1/3 ulb when I run this test. These are exactly what one
expects. For example:

[johns@paranal glibc]$ sqrttorture 64
evaluated 185884154 sqrts in range 2.22507e-308 to 1.79769e+308
worst ratio: 1.00 ulb for x=1.1162e-103
mean error: -6.47361e-05 ulb
mean absolute error: 0.34 ulb
[johns@paranal glibc]$ sqrttorture 31.415926535
evaluated 378679352 sqrts in range 2.22507e-308 to 1.79769e+308
worst ratio: 1.00 ulb for x=1.45519e-11
mean error: 8.52193e-06 ulb
mean absolute error: 0.34 ulb

2 - sqrtloop.c calls sqrt in a timing loop. The performance is about
5 times the original 2.0.7 version. It's about half of the buggy 2.0.6
version:

[johns@paranal glibc]$ ./sqrtloop
1000000 sqrts in 0.4392 sec, 2.27687e+06 per sec

----------------------------patch--------------------
[johns@paranal BUILD]$ diff -u glibc-2.0.7/sysdeps/alpha/fpu/e_sqrt.c.orig glibc-2.0.7/sysdeps/alpha/fpu/e_sqrt.c
--- glibc-2.0.7/sysdeps/alpha/fpu/e_sqrt.c.orig Fri Oct 2 11:48:57 1998
+++ glibc-2.0.7/sysdeps/alpha/fpu/e_sqrt.c Fri Oct 2 15:25:16 1998
@@ -22,9 +22,7 @@
* We have three versions, depending on how exact we need the results.
*/

-/* Alternative versions are disabled because they currently don't work
- properly with and near DBL_MIN. */
-#if 1 || defined(_IEEE_FP) && defined(_IEEE_FP_INEXACT)
+#if defined(_IEEE_FP) && defined(_IEEE_FP_INEXACT)

/* Most demanding: go to the original source. */
#include <sysdeps/libm-ieee754/e_sqrt.c>
@@ -56,7 +54,9 @@
0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd }
};

-#ifdef _IEEE_FP
+/* Alternative versions are disabled because they currently don't work
+ properly with and near DBL_MIN. */
+#if 1 || defined(_IEEE_FP)
/*
* This version is much faster than the standard one included above,
* but it doesn't maintain the inexact flag.
@@ -110,10 +110,10 @@
y = initial_guess(x, k >> 32, ptr);
half = Double(__half);
one_and_a_half = Double(__one_and_a_half);
- y = y*(one_and_a_half - half*x*y*y);
+ y = y*(one_and_a_half - half*y*(x*y));
dn = Double(__dn);
two_to_minus_30 = Double(__two_to_minus_30);
- y = y*((one_and_a_half - two_to_minus_30) - half*x*y*y);
+ y = y*((one_and_a_half - two_to_minus_30) - half*y*(x*y));
up = Double(__up);
z = x*y;
one = Double(__one);
@@ -122,6 +122,9 @@
choppedmul(z,dn,zp);
choppedmul(z,up,zn);

+#if 0
+ /* This looks ok, but with x near DBL_MIN, the computation
+ of high-x seems to underflow and generate a SIGFPE. */
choppedmul(z,zp,low);
low = low - x;
choppedmul(z,zn,high);
@@ -134,6 +137,12 @@
__asm__("fcmovlt %2,%3,%0"
:"=f" (z)
:"0" (z), "f" (high), "f" (zn));
+#else
+ choppedmul(z,zp,low);
+ choppedmul(z,zn,high);
+ if( low >= x ) z = zp;
+ if( high < x ) z = zn;
+#endif
return z; /* Argh! gcc jumps to end here */

special:
@@ -159,7 +168,13 @@
#else
/*
* This version is much faster than generic sqrt implementation, but
- * it doesn't handle exceptional values or the inexact flag.
+ * it doesn't handle exceptional values or the inexact flag, or
+ * values in the range [DBL_MIN, 2.0*DBL_MIN). The last problem is
+ * due to the alpha not performing gradual underflow as specified by IEEE,
+ * so when 0.5*x is computed and stored in %f11, the result is 0, which
+ * sends the rest of the calculation completely awry! This
+ * could be repaired in several ways, but what would be the point unless
+ * the other problems are resolved too.
*/

asm ("\
[johns@paranal BUILD]$
-------------------------------

---------------sqrttorture.c-------------
/* Usage:
sqrttorture fpvalue
ratio of successive tests will be 1.0 + fpvalue*FLT_EPSILON
fpvalue defaults to 64.0, which takes a couple of minutes on a 21164
*/
#include <math.h>
#include <float.h>
#include <limits.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define FACTOR (1.0 + 64.*(FLT_EPSILON))
#define XMIN (DBL_MIN)
#define XMAX (DBL_MAX) /* DBL_MAX */

int main(int argc, char **argv){
double x, resid, sqrtx;
double factor, xlast;
double ratio, worstratio, worstx;
/* Assume longs are 8 bytes. */
unsigned long int ix;
unsigned long int ir;
double sum, sumabs;
int nsamples;

if( argc > 1 ){
factor = 1.0 + (atof(argv[1]) * FLT_EPSILON);
}else{
factor = FACTOR;
}

worstratio = -1.;
sumabs = sum = 0.;
nsamples = 0;
xlast = XMAX/(factor*1.001);
for(x = XMIN; x<xlast; x *= factor){
sqrtx = sqrt(x);
resid = x - sqrtx*sqrtx;
ratio = (resid/x)/DBL_EPSILON;
sum += ratio;
sumabs += fabs(ratio);
nsamples++;
if( fabs(ratio) > worstratio ){
worstratio = fabs(ratio);
worstx = x;
}
}
/* Finish off by checking XMAX, just for fun. */
x = XMAX;
sqrtx = sqrt(x);
resid = x - sqrtx*sqrtx;
ratio = (resid/x)/DBL_EPSILON;
sum += ratio;
sumabs += fabs(ratio);
nsamples++;
if( fabs(ratio) > worstratio ){
worstratio = fabs(ratio);
worstx = x;
}

fprintf(stdout, "evaluated %d sqrts in range %g to %g\n", nsamples, XMIN, XMAX);
fprintf(stdout, "worst ratio: %.2f ulb for x=%g\n", worstratio, worstx);
fprintf(stdout, "mean error: %g ulb\n", sum/nsamples);
fprintf(stdout, "mean absolute error: %.2f ulb\n", sumabs/nsamples);
return 0;
}
-----------------------------
---------------sqrtloop.c--------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define NTIMES 1000000

int main(int argc, char **argv){
int i = NTIMES;
double x = 0.7;
clock_t tstart, tend;
double seconds;

tstart = clock();
while(--i){
x = sqrt(x) + 0.5;
}
tend = clock();

seconds = ((double)(tend-tstart))/CLOCKS_PER_SEC;
printf("%d sqrts in %g sec, %g per sec\n",
NTIMES, seconds, NTIMES/seconds);

return 0;
}