You are quite right - I ran my small tests on an IBM RS/6K-580 with
all floats turned into doubles and the performance did improve quite
a bit - not so on the P6, though, for tests 1 and 2; test 3 is unaffected.
As you suggested, I am mailing the code for general consumption - the test
that barfs is test 2.
Arno
/*
Test 2 is the one that would not finish on a 21164-300 under linux.
Run on PentiumPro 200 MHz under linux 2.0.0
Test 1: td 10
1000 10 0.52 153.85
Test 2: tr 10
Transf Benchmark: loop length : 10 * 10^3
Transf Benchmark: Time/MegaFlops: 1.44 125.00
Test 3: tv 10
Size Loops Time MFlops
500 40000 2.30 86.96
1000 20000 2.26 88.50
5000 4000 2.28 87.72
10000 2000 2.31 86.58
50000 400 3.64 54.95
100000 200 3.94 50.76
500000 40 4.09 48.90
======
Run on IBM RS/6000-580 with xlc -O3, no code changes:
Test 1: td 10
1000 10 2.89 27.68
Test 2: tr 10
Transf Benchmark: loop length : 10 * 10^3
Transf Benchmark: Time/MegaFlops: 7.71 23.35
Test 3: tv 10
Size Loops Time MFlops
500 40000 9.37 21.34
1000 20000 9.41 21.25
5000 4000 9.37 21.34
10000 2000 9.47 21.12
50000 400 9.43 21.21
100000 200 9.56 20.92
500000 40 9.89 20.22
*/
/* Same as above, but all float converted to double:
P6-200 linux 2.0.0 : gcc -O
1000 10 1.91 41.88
Transf Benchmark: loop length : 10 * 10^3
Transf Benchmark: Time/MegaFlops: 9.97 18.05
Size Loops Time MFlops
500 40000 2.33 85.84
1000 20000 2.54 78.74
5000 4000 3.28 60.98
10000 2000 3.07 65.15
50000 400 5.75 34.78
100000 200 5.75 34.78
500000 40 5.57 35.91
================
IBM RS/6000-580: xlc -O3
1000 10 1.33 60.15
Transf Benchmark: loop length : 10 * 10^3
Transf Benchmark: Time/MegaFlops: 3.28 54.88
Size Loops Time MFlops
500 40000 5.65 35.40
1000 20000 5.64 35.46
5000 4000 5.82 34.36
10000 2000 6.01 33.28
50000 400 6.55 30.53
100000 200 6.23 32.10
500000 40 6.13 32.63
*/
/* This is Test1 */
#include <stdio.h>
#ifdef linux
#define Hz 1.e-2
#else
#define Hz 1.e-6
#endif
extern long int clock (void);
/* Return the value of clock() scaled to seconds.
   Hz, chosen by the preprocessor conditional earlier in this file, is the
   assumed length of one clock() tick in seconds.
   NOTE(review): the tick length is hard-coded per platform rather than
   taken from the C library -- confirm it matches the target system. */
float
second ()
{
  long int ticks = clock ();

  return (Hz) * ticks;
}
/* Print one result row: nx, lx, the elapsed time tm and the rate op / tm.
   When tm is not strictly positive the rate is reported as -1 so that no
   division by a zero or negative time is attempted. */
void
report (int nx, int lx, float op, float tm)
{
  float rate;

  if (tm > 0.)
    rate = op / tm;
  else
    rate = -1.;

  printf ("%10d%10d%10.2f%10.2f\n", nx, lx, tm, rate);
}
extern void dist (float *, float *, float *, int);
#define NM 1000
/* Test 1 driver: time dist() over NM points.

   Usage: td [loops]
     loops  number of repetitions of the full sweep (default 1; also used
            when the argument cannot be parsed as an integer).

   Each repetition calls dist() once per point of v (NM calls), and each
   call evaluates NM distances of 8 floating-point operations each
   (3 subtractions, 3 multiplications, 2 additions).  The operation count
   handed to report() is therefore 8 * NM * NM * loops, scaled by 1e-6.

   Returns 0 so the exit status is well defined. */
int
main (int argc, char *argv[])
{
  int i, j, lx;
  float op, tm;
  float u[3 * NM], v[3 * NM], d[NM];

  /* i advances in steps of 3, so (i % 3) is always 0 here: every point of
     u is (1, 2, 3) and every point of v is (3, 2, 1).  */
  for (i = 0; i < 3 * NM; i += 3)
    {
      u[i + 0] = (i % 3) + 1;
      v[i + 0] = 4 - u[i + 0];
      u[i + 1] = (i % 3) + 2;
      v[i + 1] = 4 - u[i + 1];
      u[i + 2] = (i % 3) + 3;
      v[i + 2] = 4 - u[i + 2];
    }

  lx = 1;
  if (argc > 1 && sscanf (argv[1], "%d", &lx) != 1)
    lx = 1;			/* unparsable argument: keep the default */

  tm = second ();
  for (i = 0; i < lx; i++)
    for (j = 0; j < 3 * NM; j += 3)
      dist (u, v + j, d, NM);
  tm = second () - tm;

  op = (1.e-3 * NM) * (8 * NM) * (1.e-3 * lx);
  report (NM, lx, op, tm);
  return 0;
}
/* For each of the n points stored as consecutive (x, y, z) triples in u,
   store in d[i] the sum of the squared component differences between
   that point and the single point v[0..2].

   u  input, 3 * n floats
   v  input, the first 3 floats are read
   d  output, n floats
   n  number of points in u

   The main loop is unrolled by hand to process four points per pass,
   with the loads, multiplies, adds and stores of neighbouring points
   interleaved; a second loop handles the remaining n % 4 points. */
void
dist (float *u, float *v, float *d, int n)
{
int i, i3, m;
float v0, v1, v2, t0, t1, t2, t3, t4, t5;
/* Read the reference point once, outside the loops. */
v0 = v[0];
v1 = v[1];
v2 = v[2];
/* m is n rounded down to a multiple of 4. */
m = n & ~3L;
/* i counts points, i3 is the matching float index into u (3 per point). */
for (i = i3 = 0; i < m; i += 4, i3 += 12)
{
/* Point i uses t0..t2. */
t0 = u[i3 + 0] - v0;
t1 = u[i3 + 1] - v1;
t2 = u[i3 + 2] - v2;
t0 *= t0;
t1 *= t1;
t0 += t1;
/* Point i + 1 uses t3..t5; its work starts before point i is finished. */
t3 = u[i3 + 3] - v0;
t2 *= t2;
t0 += t2;
t4 = u[i3 + 4] - v1;
t5 = u[i3 + 5] - v2;
t3 *= t3;
t4 *= t4;
t3 += t4;
t5 *= t5;
t3 += t5;
d[i] = t0;
/* Point i + 2 reuses t0..t2 now that d[i] has been stored. */
t0 = u[i3 + 6] - v0;
t1 = u[i3 + 7] - v1;
t2 = u[i3 + 8] - v2;
d[i + 1] = t3;
t0 *= t0;
t1 *= t1;
t0 += t1;
t2 *= t2;
t0 += t2;
/* Point i + 3 reuses t3..t5 now that d[i + 1] has been stored. */
t3 = u[i3 + 9] - v0;
t4 = u[i3 + 10] - v1;
t5 = u[i3 + 11] - v2;
t3 *= t3;
t4 *= t4;
t3 += t4;
t5 *= t5;
t3 += t5;
d[i + 2] = t0;
d[i + 3] = t3;
}
/* Remaining points, one per pass. */
for (i = m, i3 = 3 * m; i < n; i++, i3 += 3)
{
t0 = u[i3 + 0] - v0;
t1 = u[i3 + 1] - v1;
t0 *= t0;
t2 = u[i3 + 2] - v2;
t1 *= t1;
t2 *= t2;
t0 += t1;
t0 += t2;
d[i] = t0;
}
}
/* This is Test2 */
# include <stdio.h>
#define LOOP 1000
#ifdef linux
#define Hz 1.e-2
#else
#define Hz 1.e-6
#endif
extern long int clock (void);
extern void rot (float *, float *, float *, int);
/* Return the value of clock() scaled to seconds.
   Hz, chosen by the preprocessor conditional earlier in this file, is the
   assumed length of one clock() tick in seconds.
   NOTE(review): the tick length is hard-coded per platform rather than
   taken from the C library -- confirm it matches the target system. */
float
Timer (void)
{
  long int ticks = clock ();

  return (Hz) * ticks;
}
/* Test 2 driver: time repeated calls to rot() on 1000 points.

   Usage: tr [count]
     count  loop length in units of LOOP iterations (default 10; also used
            when the argument cannot be parsed).

   n is a long, so it is scanned and printed with %ld.  Using %d stores
   only an int-sized part of n where long is wider than int, leaving the
   rest of the loop count indeterminate -- the loop then may effectively
   never finish.

   The rate is computed as 18e-3 * n / time: 18 floating-point operations
   per point times 1000 points per rot() call, scaled by 1e-6.

   Returns 0 so the exit status is well defined. */
int
main (int argc, char *argv[])
{
  float f1, t1;
  long i, n;
  /* Three rows of four values each, as consumed by rot().  */
  float r[12] = {
    0., 0., 1., 1.,
    2., 0., 0., 2.,
    0., 3., 0., -1. };
  float x[3000], y[3000];

  for (i = 0; i < 3000; i++)
    x[i] = (i % 3) + 1;

  n = 10;
  if (argc > 1 && sscanf (argv[1], "%ld", &n) != 1)
    n = 10;			/* unparsable argument: keep the default */

  printf ("Transf Benchmark: loop length : %5ld * 10^3\n", n);
  n *= LOOP;

  t1 = Timer ();
  for (i = 0; i < n; i++)
    rot (r, x, y, 1000);
#ifdef DEBUG
  printf ("%f %f %f %f %f %f\n",
	  x[0], x[1], x[2],
	  y[0], y[1], y[2]);
  printf ("%f %f %f %f %f %f\n",
	  x[2997], x[2998], x[2999],
	  y[2997], y[2998], y[2999]);
#endif
  t1 = Timer () - t1;

  if (t1 > 0.)
    f1 = 18.e-3 * n / t1;
  else
    f1 = -1.;
  printf ("Transf Benchmark: Time/MegaFlops: %10.2f %10.2f\n", t1, f1);
  return 0;
}
/* Apply a 3x4 transform to n points.

   r  12 floats read as three rows of four: r[4k + 0..2] are the
      coefficients of row k and r[4k + 3] is its additive constant
   x  input, n points as consecutive triples (3 * n floats)
   y  output, 3 * n floats:
        y[3i + k] = r[4k + 3] + r[4k + 0] * x[3i + 0]
                              + r[4k + 1] * x[3i + 1]
                              + r[4k + 2] * x[3i + 2]
   n  number of points

   Row 0 previously started its sum from r[4], which is a coefficient of
   row 1; it now uses r[3], matching rows 1 and 2 (r[7] and r[11]).  The
   operation count per point (9 multiplies, 9 adds) is unchanged.

   The statements of the three rows are interleaved by hand; t0..t2 serve
   rows 0 and 2, t3..t5 serve row 1. */
void
rot (float *r, float *x, float *y, int n)
{
  int i, i3;
  float t0, t1, t2, t3, t4, t5;

  /* i counts points, i3 is the matching float index (3 per point).  */
  for (i = i3 = 0; i < n; i++, i3 += 3)
    {
      /* Row 0.  */
      t0 = r[3];
      t1 = r[0] * x[i3 + 0];
      t2 = r[1] * x[i3 + 1];
      t0 += t1;
      t1 = r[2] * x[i3 + 2];
      t0 += t2;
      t0 += t1;
      /* Row 1.  */
      t3 = r[7];
      t4 = r[4] * x[i3 + 0];
      t5 = r[5] * x[i3 + 1];
      t3 += t4;
      t4 = r[6] * x[i3 + 2];
      t3 += t5;
      y[i3 + 0] = t0;
      t3 += t4;
      /* Row 2 reuses t0..t2 now that y[i3 + 0] has been stored.  */
      t0 = r[11];
      t1 = r[8] * x[i3 + 0];
      t2 = r[9] * x[i3 + 1];
      t0 += t1;
      t1 = r[10] * x[i3 + 2];
      t0 += t2;
      y[i3 + 1] = t3;
      t0 += t1;
      y[i3 + 2] = t0;
    }
}
/* This is Test3 */
#include <stdio.h>
#include <stdlib.h>
#ifdef linux
#define Hz 1.e-2
#else
#define Hz 1.e-6
#endif
extern long int clock (void);
/* Return the value of clock() scaled to seconds.
   Hz, chosen by the preprocessor conditional earlier in this file, is the
   assumed length of one clock() tick in seconds.
   NOTE(review): the tick length is hard-coded per platform rather than
   taken from the C library -- confirm it matches the target system. */
float
second ()
{
  long int ticks;

  ticks = clock ();
  return (Hz) * ticks;
}
/* Print one table row: nx, lx, the elapsed time tm and the rate op / tm.
   A rate of -1 is printed whenever tm is not strictly positive, which
   avoids dividing by a zero or negative time. */
void
report (int nx, int lx, float op, float tm)
{
  float rate;

  if (tm > 0.)
    rate = op / tm;
  else
    rate = -1.;

  printf ("%10d%10d%10.2f%10.2f\n", nx, lx, tm, rate);
}
extern void svm (float, float *, float *, int);
extern void svmva (float, float *, float *, float *, int);
extern void svmsa (float, float *, float, float *, int);
extern void vvm (float *, float *, float *, int);
extern void vva (float *, float *, float *, int);
extern float vsum (float *, int);
extern float vdot (float *, float *, int);
/* Test 3 driver: time a fixed mix of vector kernels over several sizes.

   Usage: tv loop-multiplier [stride]
     loop-multiplier  scales the total work; multiplied by LOOP and then
                      divided by each vector size to get the loop count
     stride           largest chunk handed to the kernels in one call
                      (default NNX); a value <= 0 selects the largest
                      vector size, so every vector is one chunk

   For each size in NX[] the vectors are allocated, initialised, and then
   swept lx times in chunks of at most `stride` elements through
   svm, svmva, svmsa, vvm, vva, vsum and vdot.  OP = 10 is the number of
   floating-point operations those seven kernels perform per element
   (1 + 2 + 2 + 1 + 1 + 1 + 2).

   LX and str are long and are therefore scanned with %ld; %d would store
   only an int-sized part of them where long is wider than int.

   Exits with status 1 on a usage error, -1 on allocation failure, and
   returns 0 otherwise. */
int
main (int argc, char *argv[])
{
#define LOOP 2000000
#define NM 7
#define NNX 500
  int OP, i, j, k, nx, nnx, lx, mmx;
  float a = .1, d, *x, *y, op, tm;
  long str, LX, NX[NM] =
    {500, 1000, 5000, 10000, 50000, 100000, 500000};

  if (argc < 2 || sscanf (argv[1], "%ld", &LX) != 1)
    {
      printf ("Usage : %s loop-multiplier [stride]\n", argv[0]);
      exit (1);
    }

  str = NNX;
  OP = 10;
  if (argc > 2 && sscanf (argv[2], "%ld", &str) != 1)
    str = NNX;			/* unparsable stride: keep the default */
  if (str <= 0)
    str = NX[NM - 1];		/* last valid entry; NX[NM] is out of bounds */

  LX *= LOOP;
  printf (" Size Loops Time MFlops\n");

  for (j = 0; j < NM; j++)
    {
      nx = NX[j];
      lx = LX / nx;

      x = malloc (nx * sizeof *x);
      y = malloc (nx * sizeof *y);
      if (x == NULL)
	{
	  puts ("Cannot allocate memory for x[]\n");
	  exit (-1);
	}
      if (y == NULL)
	{
	  puts ("Cannot allocate memory for y[]\n");
	  exit (-1);
	}

      for (i = 0; i < nx; i++)
	{
	  x[i] = 1.;
	  y[i] = 0.;
	}

      op = (1.e-3 * (OP * nx)) * (1.e-3 * lx);
      d = 0.;
      tm = second ();
      nnx = (nx < str ? nx : str);
      for (k = 0; k < lx; k++)
	for (i = 0; i < nx; i += nnx)
	  {
	    /* The last chunk may be shorter than nnx.  */
	    mmx = nx - i;
	    mmx = (mmx < nnx ? mmx : nnx);
	    svm (a, &x[i], &y[i], mmx);
	    svmva (a, &x[i], &y[i], &y[i], mmx);
	    svmsa (a, &x[i], a, &y[i], mmx);
	    vvm (&x[i], &y[i], &y[i], mmx);
	    vva (&x[i], &y[i], &y[i], mmx);
	    d += vsum (&x[i], mmx);
	    d -= vdot (&x[i], &x[i], mmx);
	  }
      tm = second () - tm;

      report (nx, lx, op, tm);
      free (x);
      free (y);
    }
  return 0;
}
/* Scalar-vector multiply: y[i] = a * x[i] for 0 <= i < n.

   The bulk of the vector is handled eight elements per pass with two
   temporaries, so that each store is issued after the multiply for a
   later element has been started; a plain loop finishes the last n % 8
   elements. */
void
svm (float a, float *x, float *y, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float even, odd;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *src = x + k;
      float *dst = y + k;

      even = a * src[0];
      odd = a * src[1];
      dst[0] = even;
      even = a * src[2];
      dst[1] = odd;
      odd = a * src[3];
      dst[2] = even;
      even = a * src[4];
      dst[3] = odd;
      odd = a * src[5];
      dst[4] = even;
      even = a * src[6];
      dst[5] = odd;
      odd = a * src[7];
      dst[6] = even;
      dst[7] = odd;
    }

  for (k = blocked; k < n; k++)
    y[k] = a * x[k];
}
/* Scaled vector add: z[i] = a * x[i] + y[i] for 0 <= i < n.

   Eight elements are processed per pass with two temporaries; each
   element's y value is read before the matching z value is written, so
   the caller may pass the same array for y and z.  A plain loop finishes
   the last n % 8 elements. */
void
svmva (float a, float *x, float *y, float *z, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float even, odd;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *xs = x + k;
      const float *ys = y + k;
      float *zs = z + k;

      even = a * xs[0];
      odd = a * xs[1];
      even += ys[0];
      odd += ys[1];
      zs[0] = even;
      even = a * xs[2];
      zs[1] = odd;
      even += ys[2];
      odd = a * xs[3];
      zs[2] = even;
      odd += ys[3];
      even = a * xs[4];
      zs[3] = odd;
      even += ys[4];
      odd = a * xs[5];
      zs[4] = even;
      odd += ys[5];
      even = a * xs[6];
      zs[5] = odd;
      odd = a * xs[7];
      even += ys[6];
      odd += ys[7];
      zs[6] = even;
      zs[7] = odd;
    }

  for (k = blocked; k < n; k++)
    z[k] = y[k] + a * x[k];
}
/* Scaled vector plus scalar: y[i] = a * x[i] + b for 0 <= i < n.

   Eight elements are processed per pass with two temporaries, the stores
   trailing the arithmetic of later elements; a plain loop finishes the
   last n % 8 elements. */
void
svmsa (float a, float *x, float b, float *y, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float even, odd;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *src = x + k;
      float *dst = y + k;

      even = a * src[0];
      odd = a * src[1];
      even += b;
      odd += b;
      dst[0] = even;
      even = a * src[2];
      dst[1] = odd;
      even += b;
      odd = a * src[3];
      dst[2] = even;
      odd += b;
      even = a * src[4];
      dst[3] = odd;
      even += b;
      odd = a * src[5];
      dst[4] = even;
      odd += b;
      even = a * src[6];
      dst[5] = odd;
      odd = a * src[7];
      even += b;
      odd += b;
      dst[6] = even;
      dst[7] = odd;
    }

  for (k = blocked; k < n; k++)
    y[k] = a * x[k] + b;
}
/* Element-wise product: z[i] = x[i] * y[i] for 0 <= i < n.

   Eight elements are processed per pass with two temporaries; each
   element's inputs are read before its result is written, so z may be
   the same array as x or y.  A plain loop finishes the last n % 8
   elements. */
void
vvm (float *x, float *y, float *z, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float even, odd;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *xs = x + k;
      const float *ys = y + k;
      float *zs = z + k;

      even = xs[0] * ys[0];
      odd = xs[1] * ys[1];
      zs[0] = even;
      even = xs[2] * ys[2];
      zs[1] = odd;
      odd = xs[3] * ys[3];
      zs[2] = even;
      even = xs[4] * ys[4];
      zs[3] = odd;
      odd = xs[5] * ys[5];
      zs[4] = even;
      even = xs[6] * ys[6];
      zs[5] = odd;
      odd = xs[7] * ys[7];
      zs[6] = even;
      zs[7] = odd;
    }

  for (k = blocked; k < n; k++)
    z[k] = x[k] * y[k];
}
/* Element-wise sum: z[i] = x[i] + y[i] for 0 <= i < n.

   Eight elements are processed per pass with two temporaries; each
   element's inputs are read before its result is written, so z may be
   the same array as x or y.  A plain loop finishes the last n % 8
   elements. */
void
vva (float *x, float *y, float *z, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float even, odd;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *xs = x + k;
      const float *ys = y + k;
      float *zs = z + k;

      even = xs[0] + ys[0];
      odd = xs[1] + ys[1];
      zs[0] = even;
      even = xs[2] + ys[2];
      zs[1] = odd;
      odd = xs[3] + ys[3];
      zs[2] = even;
      even = xs[4] + ys[4];
      zs[3] = odd;
      odd = xs[5] + ys[5];
      zs[4] = even;
      even = xs[6] + ys[6];
      zs[5] = odd;
      odd = xs[7] + ys[7];
      zs[6] = even;
      zs[7] = odd;
    }

  for (k = blocked; k < n; k++)
    z[k] = x[k] + y[k];
}
/* Return the sum of x[0] .. x[n - 1].

   The first n rounded down to a multiple of 8 elements are accumulated
   into two independent partial sums, four elements each per pass, in the
   order 0, 4, 2, 6 and 1, 5, 3, 7.  The partial sums are then combined
   and the remaining n % 8 elements are added one by one. */
float
vsum (float *x, int n)
{
  const int blocked = n & ~7;	/* n rounded down to a multiple of 8 */
  float lane_a = 0.f;
  float lane_b = 0.f;
  float total;
  int k;

  for (k = 0; k < blocked; k += 8)
    {
      const float *p = x + k;

      lane_a += p[0];
      lane_b += p[1];
      lane_a += p[4];
      lane_b += p[5];
      lane_a += p[2];
      lane_b += p[3];
      lane_a += p[6];
      lane_b += p[7];
    }

  total = lane_a + lane_b;
  for (k = blocked; k < n; k++)
    total += x[k];

  return total;
}
/* Return the dot product of x[0 .. n-1] and y[0 .. n-1].

   The first n rounded down to a multiple of 8 elements are handled eight
   per pass.  The loop is software-pipelined: each product is added to
   its partial sum only after the next product has been started, with
   even-indexed products going to s1 and odd-indexed ones to s2.  As a
   consequence the product of the last odd element of a pass is still
   held in t2 when the pass ends; it is picked up at the top of the next
   pass, and after the final pass it must be added explicitly.  Without
   that final addition the term x[m-1] * y[m-1] was silently dropped for
   every n >= 8.

   The remaining n % 8 elements are added one by one. */
float
vdot (float *x, float *y, int n)
{
  int i, m;
  float vs, t1, t2, s1, s2;

  m = n & ~7L;			/* n rounded down to a multiple of 8 */
  s1 = s2 = t1 = t2 = 0.f;
  for (i = 0; i < m; i += 8)
    {
      t1 = x[i + 0] * y[i + 0];
      s2 += t2;			/* pending product from the previous pass */
      t2 = x[i + 1] * y[i + 1];
      s1 += t1;
      t1 = x[i + 2] * y[i + 2];
      s2 += t2;
      t2 = x[i + 3] * y[i + 3];
      s1 += t1;
      t1 = x[i + 4] * y[i + 4];
      s2 += t2;
      t2 = x[i + 5] * y[i + 5];
      s1 += t1;
      t1 = x[i + 6] * y[i + 6];
      s2 += t2;
      t2 = x[i + 7] * y[i + 7];
      s1 += t1;
    }
  /* Drain the pipeline: t2 holds the last product of the final pass, or
     0 when the loop body never ran.  */
  s2 += t2;

  vs = s1 + s2;
  for (i = m; i < n; i++)
    vs += x[i] * y[i];
  return vs;
}
--A41C680A4.834507056=_/vger.rutgers.edu--