Re: [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+

From: linux
Date: Tue Jun 12 2007 - 01:05:59 EST


> I got this code from Nettle, originally, and I never looked at the SHA-1
> round structure very closely. I'll give that approach a try.

Attached is some (tested, working, and public domain) assembly code for
three different sha_transform implementations. Compared to C code, the
timings to hash 10 MiB on a 600 MHz PIII are:

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 564819 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 391086 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 399134 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 345986 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 301152 us

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 558652 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 390980 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 407661 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 412434 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 266809 us

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 559053 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 396506 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 401661 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 349668 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 265861 us

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 556082 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 392967 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 406381 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 338959 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 274712 us

Um.. some more runs, nice --19, that come out a bit more stable:
One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 552971 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 388167 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 398721 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 337220 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 259790 us

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 551240 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 387812 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 398519 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 336903 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 260161 us

One: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 551934 us
Four: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 387639 us
Two: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 398094 us
Three: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 335860 us
Five: e02e77e3 bb7af36d bbf79d4a 46956044 2aaea172 -- 259805 us

This is hot-cache testing; I haven't got around to writing macro
tricks that exapnd to a megabyte of object code. The challenge is to
purge not only the I- and D-caches, but also the branch predictor!


The names are the order they were written in. "One" is the lib/sha1.c
code (547 bytes with -Os). "Four" is a 5x unrolled C version (1106 bytes).

"Two" is a space-optimized ASM version, 266 bytes long. "Three" is 5x
unrolled, 722 bytes long. "Five" is a fully unrolled version, 3558
bytes long.

(Further space savings are possible, but it doesn't seem worth it.)

I have noticed that every caller of sha_transform in the kernel tree
allocates the W[] array on the stack, so we might as well do that inside
sha_transform. The point of passing in the buffer is to amortize the
wiping afterwards, but see sha_stackwipe for ideas on how to do that.
(It can even be done mostly portably in C, given a good guess about the
C function's stack usage.)


I also noticed a glaring BUG in the folding at the end of extract_buf at
drivers/char/random.c:797. That should be:

/*
* In case the hash function has some recognizable
* output pattern, we fold it in half.
*/

buf[0] ^= buf[4];
buf[1] ^= buf[3];
buf[2] ^= rol32(buf[2], 16); // <--- Bug was here
memcpy(out, buf, EXTRACT_SIZE);
memset(buf, 0, sizeof(buf));

if the code is to match the comment.



=== sha1asm.S ===
#define A %eax
#define B %ebx
#define C %ecx
#define D %edx
#define E %ebp
#define I %esi
#define T %edi

# f1(x,y,z) = bitwise x ? y : z = (z ^ (x & (y ^ z)))
#define F1(x,y,z,dest) \
movl z,T; \
xorl y,T; \
andl x,T; \
xorl z,T

# f2(x,y,z) = x ^ y ^ z
#define F2(x,y,z,dest) \
movl z,T; \
xorl x,T; \
xorl y,T

# f3(x,y,z) = majority(x,y,z) = ((x & z) + (y & (x ^ z)))
#define F3(x,y,z,dest) \
movl z,T; \
andl x,T; \
addl T,dest; \
movl z,T; \
xorl x,T; \
andl y,T

#define K1 0x5A827999 /* Rounds 0-19: sqrt(2) * 2^30 */
#define K2 0x6ED9EBA1 /* Rounds 20-39: sqrt(3) * 2^30 */
#define K3 0x8F1BBCDC /* Rounds 40-59: sqrt(5) * 2^30 */
#define K4 0xCA62C1D6 /* Rounds 60-79: sqrt(10) * 2^30 */

# e += W[i] + K + f(b, c, d) + rol32(a, 5); b = rol32(b, 30); i++;
#define ROUND(a,b,c,d,e,f,k) \
addl (%esp,I,4),e; \
incl I; \
f(b,c,d,e); \
leal k(T,e),e; \
movl a,T; \
roll $5,T; \
rorl $2,b; \
addl T,e

.text
.globl sha_transform3
.type sha_transform3, @function
# void sha_transform3(__u32 digest[5], const char in[64])
sha_transform3:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
# Args start at 20(%esp)
xorl I,I
movl 24(%esp),B # B = in
movl 20(%esp),T # T = digest
subl $320,%esp
1:
movl (B,I,4),A
bswap A
movl A,(%esp,I,4)
incl I
cmpl $16,I
jne 1b

2:
movl -64(%esp,I,4),A
xorl -56(%esp,I,4),A
xorl -32(%esp,I,4),A
xorl -12(%esp,I,4),A
roll $1,A
movl A,(%esp,I,4)
incl I
cmpl $80,I
jne 2b

movl (T),A
movl 4(T),B
movl 8(T),C
movl 12(T),D
movl 16(T),E

xorl I,I
3:
ROUND(A,B,C,D,E,F1,K1)
ROUND(E,A,B,C,D,F1,K1)
ROUND(D,E,A,B,C,F1,K1)
ROUND(C,D,E,A,B,F1,K1)
ROUND(B,C,D,E,A,F1,K1)
cmp $20,I
jne 3b
4:
ROUND(A,B,C,D,E,F2,K2)
ROUND(E,A,B,C,D,F2,K2)
ROUND(D,E,A,B,C,F2,K2)
ROUND(C,D,E,A,B,F2,K2)
ROUND(B,C,D,E,A,F2,K2)
cmp $40,I
jne 4b
5:
ROUND(A,B,C,D,E,F3,K3)
ROUND(E,A,B,C,D,F3,K3)
ROUND(D,E,A,B,C,F3,K3)
ROUND(C,D,E,A,B,F3,K3)
ROUND(B,C,D,E,A,F3,K3)
cmp $60,I
jne 5b
6:
ROUND(A,B,C,D,E,F2,K4)
ROUND(E,A,B,C,D,F2,K4)
ROUND(D,E,A,B,C,F2,K4)
ROUND(C,D,E,A,B,F2,K4)
ROUND(B,C,D,E,A,F2,K4)
cmp $80,I
jne 6b

addl $320,%esp
movl 20(%esp),T

addl A,(T)
addl B,4(T)
addl C,8(T)
addl D,12(T)
addl E,16(T)

popl %ebx
popl %esi
popl %edi
popl %ebp

ret
.size sha_transform3, .-sha_transform3
# Size is 0x2D2 = 722 bytes

# A smaller variant
#define ROUND2(a,b,c,d,e,f,k) \
addl (%esp,I,4),e; \
incl I; \
f(b,c,d,e); \
leal k(T,e),T; \
movl d,e; \
movl c,d; \
movl b,c; \
rorl $2,c; \
movl a,b; \
roll $5,a; \
addl T,a

.globl sha_transform2
.type sha_transform2, @function
# void sha_transform2(__u32 digest[5], const char in[64])
sha_transform2:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
# Args start at 20(%esp)
xorl I,I
movl 24(%esp),B # B = in
movl 20(%esp),T # T = digest
subl $320,%esp
1:
movl (B,I,4),A
bswap A
movl A,(%esp,I,4)
incl I
cmpl $16,I
jne 1b

2:
movl -64(%esp,I,4),A
xorl -56(%esp,I,4),A
xorl -32(%esp,I,4),A
xorl -12(%esp,I,4),A
roll $1,A
movl A,(%esp,I,4)
incl I
cmpl $80,I
jne 2b

movl (T),A
movl 4(T),B
movl 8(T),C
movl 12(T),D
movl 16(T),E

xorl I,I
3:
ROUND2(A,B,C,D,E,F1,K1)
cmp $20,I
jne 3b
4:
ROUND2(A,B,C,D,E,F2,K2)
cmp $40,I
jne 4b
5:
ROUND2(A,B,C,D,E,F3,K3)
cmp $60,I
jne 5b
6:
ROUND2(A,B,C,D,E,F2,K4)
cmp $80,I
jne 6b

addl $320,%esp
movl 20(%esp),T
addl A,(T)
addl B,4(T)
addl C,8(T)
addl D,12(T)
addl E,16(T)

popl %ebx
popl %esi
popl %edi
popl %ebp

ret
.size sha_transform2, .-sha_transform2
# Size is 0x10A = 266 bytes

# The three cases of the next input word...
# Fetch big-endian (first 16 rounds)
#define FETCH(i) \
movl 4*i(I),T; \
bswap T; \
movl T,4*i(%esp)

# Calculate but don't store (last 3 rounds)
#define CALCX(i) \
movl 4*(i&15)(%esp),T; \
xorl 4*((i+2)&15)(%esp),T; \
xorl 4*((i+8)&15)(%esp),T; \
xorl 4*((i+13)&15)(%esp),T; \
roll $1,T

# Calculate and store on stack (middle 61 rounds)
#define CALC(i) \
CALCX(i); \
movl T,4*(i&15)(%esp)

# e += W[i] + K + f(b, c, d) + rol32(a, 5); b = rol32(b, 30); i++;
#define ROUND5a(a,b,c,d,e,f,k) \
leal k(T,e),e; \
f(b,c,d,e); \
addl T,e; \
movl a,T; \
roll $5,T; \
rorl $2,b; \
addl T,e

# A variant that assumes that k is stored in I
#define ROUND5b(a,b,c,d,e,f) \
addl I,e; \
addl T,e; \
f(b,c,d,e); \
addl T,e; \
movl a,T; \
roll $5,T; \
rorl $2,b; \
addl T,e

.globl sha_transform5
.type sha_transform5, @function
# void sha_transform5(__u32 digest[5], const char in[64])
sha_transform5:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
# Args start at 20(%esp)
movl 24(%esp),I # I = in
movl 20(%esp),T # T = digest
subl $64,%esp

movl (T),A
movl 4(T),B
movl 8(T),C
movl 12(T),D
movl 16(T),E

FETCH(0); ROUND5a(A,B,C,D,E,F1,K1)
FETCH(1); ROUND5a(E,A,B,C,D,F1,K1)
FETCH(2); ROUND5a(D,E,A,B,C,F1,K1)
FETCH(3); ROUND5a(C,D,E,A,B,F1,K1)
FETCH(4); ROUND5a(B,C,D,E,A,F1,K1)

FETCH(5); ROUND5a(A,B,C,D,E,F1,K1)
FETCH(6); ROUND5a(E,A,B,C,D,F1,K1)
FETCH(7); ROUND5a(D,E,A,B,C,F1,K1)
FETCH(8); ROUND5a(C,D,E,A,B,F1,K1)
FETCH(9); ROUND5a(B,C,D,E,A,F1,K1)

FETCH(10); ROUND5a(A,B,C,D,E,F1,K1)
FETCH(11); ROUND5a(E,A,B,C,D,F1,K1)
FETCH(12); ROUND5a(D,E,A,B,C,F1,K1)
FETCH(13); ROUND5a(C,D,E,A,B,F1,K1)
FETCH(14); ROUND5a(B,C,D,E,A,F1,K1)

FETCH(15); movl $K1,I; ROUND5b(A,B,C,D,E,F1)
CALC(16); ROUND5b(E,A,B,C,D,F1)
CALC(17); ROUND5b(D,E,A,B,C,F1)
CALC(18); ROUND5b(C,D,E,A,B,F1)
CALC(19); ROUND5b(B,C,D,E,A,F1)

movl $K2,I

CALC(20); ROUND5b(A,B,C,D,E,F2)
CALC(21); ROUND5b(E,A,B,C,D,F2)
CALC(22); ROUND5b(D,E,A,B,C,F2)
CALC(23); ROUND5b(C,D,E,A,B,F2)
CALC(24); ROUND5b(B,C,D,E,A,F2)

CALC(25); ROUND5b(A,B,C,D,E,F2)
CALC(26); ROUND5b(E,A,B,C,D,F2)
CALC(27); ROUND5b(D,E,A,B,C,F2)
CALC(28); ROUND5b(C,D,E,A,B,F2)
CALC(29); ROUND5b(B,C,D,E,A,F2)

CALC(30); ROUND5b(A,B,C,D,E,F2)
CALC(31); ROUND5b(E,A,B,C,D,F2)
CALC(32); ROUND5b(D,E,A,B,C,F2)
CALC(33); ROUND5b(C,D,E,A,B,F2)
CALC(34); ROUND5b(B,C,D,E,A,F2)

CALC(35); ROUND5b(A,B,C,D,E,F2)
CALC(36); ROUND5b(E,A,B,C,D,F2)
CALC(37); ROUND5b(D,E,A,B,C,F2)
CALC(38); ROUND5b(C,D,E,A,B,F2)
CALC(39); ROUND5b(B,C,D,E,A,F2)

movl $K3,I

CALC(40); ROUND5b(A,B,C,D,E,F3)
CALC(41); ROUND5b(E,A,B,C,D,F3)
CALC(42); ROUND5b(D,E,A,B,C,F3)
CALC(43); ROUND5b(C,D,E,A,B,F3)
CALC(44); ROUND5b(B,C,D,E,A,F3)

CALC(45); ROUND5b(A,B,C,D,E,F3)
CALC(46); ROUND5b(E,A,B,C,D,F3)
CALC(47); ROUND5b(D,E,A,B,C,F3)
CALC(48); ROUND5b(C,D,E,A,B,F3)
CALC(49); ROUND5b(B,C,D,E,A,F3)

CALC(50); ROUND5b(A,B,C,D,E,F3)
CALC(51); ROUND5b(E,A,B,C,D,F3)
CALC(52); ROUND5b(D,E,A,B,C,F3)
CALC(53); ROUND5b(C,D,E,A,B,F3)
CALC(54); ROUND5b(B,C,D,E,A,F3)

CALC(55); ROUND5b(A,B,C,D,E,F3)
CALC(56); ROUND5b(E,A,B,C,D,F3)
CALC(57); ROUND5b(D,E,A,B,C,F3)
CALC(58); ROUND5b(C,D,E,A,B,F3)
CALC(59); ROUND5b(B,C,D,E,A,F3)

movl $K4,I

CALC(60); ROUND5b(A,B,C,D,E,F2)
CALC(61); ROUND5b(E,A,B,C,D,F2)
CALC(62); ROUND5b(D,E,A,B,C,F2)
CALC(63); ROUND5b(C,D,E,A,B,F2)
CALC(64); ROUND5b(B,C,D,E,A,F2)

CALC(65); ROUND5b(A,B,C,D,E,F2)
CALC(66); ROUND5b(E,A,B,C,D,F2)
CALC(67); ROUND5b(D,E,A,B,C,F2)
CALC(68); ROUND5b(C,D,E,A,B,F2)
CALC(69); ROUND5b(B,C,D,E,A,F2)

CALC(70); ROUND5b(A,B,C,D,E,F2)
CALC(71); ROUND5b(E,A,B,C,D,F2)
CALC(72); ROUND5b(D,E,A,B,C,F2)
CALC(73); ROUND5b(C,D,E,A,B,F2)
CALC(74); ROUND5b(B,C,D,E,A,F2)

CALC(75); ROUND5b(A,B,C,D,E,F2)
CALC(76); ROUND5b(E,A,B,C,D,F2)
CALCX(77); ROUND5b(D,E,A,B,C,F2)
CALCX(78); ROUND5b(C,D,E,A,B,F2)
CALCX(79); ROUND5b(B,C,D,E,A,F2)

addl $64,%esp
movl 20(%esp),I

#if 1
addl A,(I)
addl B,4(I)
addl C,8(I)
addl D,12(I)
addl E,16(I)
#else
movl A,(I)
movl B,4(I)
movl C,8(I)
movl D,12(I)
movl E,16(I)
#endif

popl %ebx
popl %esi
popl %edi
popl %ebp

ret
.size sha_transform5, .-sha_transform5
# Size is 0xDE6 = 3558 bytes


.globl sha_stackwipe
.type sha_stackwipe, @function
# void sha_stackwipe(void)
# After one or more sha_transform calls, we have left the contents of W[]
# on the stack, and from any 16 of those 80 words, the entire input
# can be reconstructed. If the caller cares, this function obliterates
# the relevant portion of the stack.
# 2 words of argument + 4 woirds of saved registers + 80 words of W[]
sha_stackwipe:
xorl %eax,%eax
movl $86,%ecx
# Damn, I had hoped that loop; pushl %eax would work..
1:
decl %ecx
pushl %eax
jne 1b

addl $4*86,%esp
ret
.size sha_stackwipe, .-sha_stackwipe
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/