[PATCH 4.14 46/54] crypto: x86/salsa20 - remove x86 salsa20 implementations

From: Greg Kroah-Hartman
Date: Mon Jul 16 2018 - 03:42:10 EST


4.14-stable review patch. If anyone has any objections, please let me know.

------------------

From: Eric Biggers <ebiggers@xxxxxxxxxx>

commit b7b73cd5d74694ed59abcdb4974dacb4ff8b2a2a upstream.

The x86 assembly implementations of Salsa20 use the frame base pointer
register (%ebp or %rbp), which breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.
Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]
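
For context, the convention at issue is the standard frame-pointer
prologue/epilogue that the unwinder relies on when CONFIG_FRAME_POINTER
is enabled. A minimal illustrative sketch (not taken from this patch):

	push %rbp		# save the caller's frame pointer
	mov  %rsp,%rbp		# anchor this frame for the unwinder
	# ... function body must leave %rbp alone ...
	pop  %rbp		# restore before returning
	ret

The salsa20 routines removed below instead spill %ebp/%rbp to a stack
slot (e.g. "movl %ebp,96(%esp)") and reuse it as a scratch register in
their main loops, so an interrupt landing there sees a bogus 'bp'
value; that is exactly what the warning above reports.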

But after looking into it, I believe there's very little reason to still
retain the x86 Salsa20 code. First, these are *not* vectorized
(SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere
close to the best Salsa20 performance on any remotely modern x86
processor; they're just regular x86 assembly. Second, it's still
unclear whether anyone is actually using the kernel's Salsa20 at all,
especially given that ChaCha20 is now supported too, with much more
efficient SSSE3 and AVX2 implementations. Finally, in benchmarks I did
on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the
x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic
(~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm
is only slightly faster than salsa20-generic (~15% faster on Skylake,
~20% faster on Zen). The gcc version made little difference.

So, the x86_64 salsa20-asm is pretty clearly useless. That leaves just
the i686 salsa20-asm, which based on my tests provides a 15-20% speed
boost. But that's without updating the code to not use %ebp. And given
the maintenance cost, the small speed difference vs. salsa20-generic,
the fact that few people still use i686 kernels, the doubt that anyone
is even using the kernel's Salsa20 at all, and the fact that an SSE2
implementation would almost certainly be much faster on any remotely
modern x86 processor yet no one has cared enough to add one, I don't
think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.

Reported-by: syzbot+ffa3a158337bbc01ff09@xxxxxxxxxxxxxxxxxxxxxxxxx
Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
arch/x86/crypto/Makefile | 4
arch/x86/crypto/salsa20-i586-asm_32.S | 1114 --------------------------------
arch/x86/crypto/salsa20-x86_64-asm_64.S | 919 --------------------------
arch/x86/crypto/salsa20_glue.c | 116 ---
crypto/Kconfig | 26
5 files changed, 2179 deletions(-)

--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) +=

obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o

obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) +=
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -59,7 +57,6 @@ endif

aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o

aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -68,7 +65,6 @@ camellia-x86_64-y := camellia-x86_64-asm
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,1114 +0,0 @@
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
-
-#include <linux/linkage.h>
-
-.text
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,80(%esp)
- # ebx_stack = ebx
- movl %ebx,84(%esp)
- # esi_stack = esi
- movl %esi,88(%esp)
- # edi_stack = edi
- movl %edi,92(%esp)
- # ebp_stack = ebp
- movl %ebp,96(%esp)
- # x = arg1
- movl 4(%esp,%eax),%edx
- # m = arg2
- movl 8(%esp,%eax),%esi
- # out = arg3
- movl 12(%esp,%eax),%edi
- # bytes = arg4
- movl 16(%esp,%eax),%ebx
- # bytes -= 0
- sub $0,%ebx
- # goto done if unsigned<=
- jbe ._done
-._start:
- # in0 = *(uint32 *) (x + 0)
- movl 0(%edx),%eax
- # in1 = *(uint32 *) (x + 4)
- movl 4(%edx),%ecx
- # in2 = *(uint32 *) (x + 8)
- movl 8(%edx),%ebp
- # j0 = in0
- movl %eax,164(%esp)
- # in3 = *(uint32 *) (x + 12)
- movl 12(%edx),%eax
- # j1 = in1
- movl %ecx,168(%esp)
- # in4 = *(uint32 *) (x + 16)
- movl 16(%edx),%ecx
- # j2 = in2
- movl %ebp,172(%esp)
- # in5 = *(uint32 *) (x + 20)
- movl 20(%edx),%ebp
- # j3 = in3
- movl %eax,176(%esp)
- # in6 = *(uint32 *) (x + 24)
- movl 24(%edx),%eax
- # j4 = in4
- movl %ecx,180(%esp)
- # in7 = *(uint32 *) (x + 28)
- movl 28(%edx),%ecx
- # j5 = in5
- movl %ebp,184(%esp)
- # in8 = *(uint32 *) (x + 32)
- movl 32(%edx),%ebp
- # j6 = in6
- movl %eax,188(%esp)
- # in9 = *(uint32 *) (x + 36)
- movl 36(%edx),%eax
- # j7 = in7
- movl %ecx,192(%esp)
- # in10 = *(uint32 *) (x + 40)
- movl 40(%edx),%ecx
- # j8 = in8
- movl %ebp,196(%esp)
- # in11 = *(uint32 *) (x + 44)
- movl 44(%edx),%ebp
- # j9 = in9
- movl %eax,200(%esp)
- # in12 = *(uint32 *) (x + 48)
- movl 48(%edx),%eax
- # j10 = in10
- movl %ecx,204(%esp)
- # in13 = *(uint32 *) (x + 52)
- movl 52(%edx),%ecx
- # j11 = in11
- movl %ebp,208(%esp)
- # in14 = *(uint32 *) (x + 56)
- movl 56(%edx),%ebp
- # j12 = in12
- movl %eax,212(%esp)
- # in15 = *(uint32 *) (x + 60)
- movl 60(%edx),%eax
- # j13 = in13
- movl %ecx,216(%esp)
- # j14 = in14
- movl %ebp,220(%esp)
- # j15 = in15
- movl %eax,224(%esp)
- # x_backup = x
- movl %edx,64(%esp)
-._bytesatleast1:
- # bytes - 64
- cmp $64,%ebx
- # goto nocopy if unsigned>=
- jae ._nocopy
- # ctarget = out
- movl %edi,228(%esp)
- # out = &tmp
- leal 0(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leal 0(%esp),%edi
- # m = &tmp
- leal 0(%esp),%esi
-._nocopy:
- # out_backup = out
- movl %edi,72(%esp)
- # m_backup = m
- movl %esi,68(%esp)
- # bytes_backup = bytes
- movl %ebx,76(%esp)
- # in0 = j0
- movl 164(%esp),%eax
- # in1 = j1
- movl 168(%esp),%ecx
- # in2 = j2
- movl 172(%esp),%edx
- # in3 = j3
- movl 176(%esp),%ebx
- # x0 = in0
- movl %eax,100(%esp)
- # x1 = in1
- movl %ecx,104(%esp)
- # x2 = in2
- movl %edx,108(%esp)
- # x3 = in3
- movl %ebx,112(%esp)
- # in4 = j4
- movl 180(%esp),%eax
- # in5 = j5
- movl 184(%esp),%ecx
- # in6 = j6
- movl 188(%esp),%edx
- # in7 = j7
- movl 192(%esp),%ebx
- # x4 = in4
- movl %eax,116(%esp)
- # x5 = in5
- movl %ecx,120(%esp)
- # x6 = in6
- movl %edx,124(%esp)
- # x7 = in7
- movl %ebx,128(%esp)
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in10 = j10
- movl 204(%esp),%edx
- # in11 = j11
- movl 208(%esp),%ebx
- # x8 = in8
- movl %eax,132(%esp)
- # x9 = in9
- movl %ecx,136(%esp)
- # x10 = in10
- movl %edx,140(%esp)
- # x11 = in11
- movl %ebx,144(%esp)
- # in12 = j12
- movl 212(%esp),%eax
- # in13 = j13
- movl 216(%esp),%ecx
- # in14 = j14
- movl 220(%esp),%edx
- # in15 = j15
- movl 224(%esp),%ebx
- # x12 = in12
- movl %eax,148(%esp)
- # x13 = in13
- movl %ecx,152(%esp)
- # x14 = in14
- movl %edx,156(%esp)
- # x15 = in15
- movl %ebx,160(%esp)
- # i = 20
- mov $20,%ebp
- # p = x0
- movl 100(%esp),%eax
- # s = x5
- movl 120(%esp),%ecx
- # t = x10
- movl 140(%esp),%edx
- # w = x15
- movl 160(%esp),%ebx
-._mainloop:
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # i -= 4
- sub $4,%ebp
- # goto mainloop if unsigned >
- ja ._mainloop
- # x0 = p
- movl %eax,100(%esp)
- # x5 = s
- movl %ecx,120(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # x15 = w
- movl %ebx,160(%esp)
- # out = out_backup
- movl 72(%esp),%edi
- # m = m_backup
- movl 68(%esp),%esi
- # in0 = x0
- movl 100(%esp),%eax
- # in1 = x1
- movl 104(%esp),%ecx
- # in0 += j0
- addl 164(%esp),%eax
- # in1 += j1
- addl 168(%esp),%ecx
- # in0 ^= *(uint32 *) (m + 0)
- xorl 0(%esi),%eax
- # in1 ^= *(uint32 *) (m + 4)
- xorl 4(%esi),%ecx
- # *(uint32 *) (out + 0) = in0
- movl %eax,0(%edi)
- # *(uint32 *) (out + 4) = in1
- movl %ecx,4(%edi)
- # in2 = x2
- movl 108(%esp),%eax
- # in3 = x3
- movl 112(%esp),%ecx
- # in2 += j2
- addl 172(%esp),%eax
- # in3 += j3
- addl 176(%esp),%ecx
- # in2 ^= *(uint32 *) (m + 8)
- xorl 8(%esi),%eax
- # in3 ^= *(uint32 *) (m + 12)
- xorl 12(%esi),%ecx
- # *(uint32 *) (out + 8) = in2
- movl %eax,8(%edi)
- # *(uint32 *) (out + 12) = in3
- movl %ecx,12(%edi)
- # in4 = x4
- movl 116(%esp),%eax
- # in5 = x5
- movl 120(%esp),%ecx
- # in4 += j4
- addl 180(%esp),%eax
- # in5 += j5
- addl 184(%esp),%ecx
- # in4 ^= *(uint32 *) (m + 16)
- xorl 16(%esi),%eax
- # in5 ^= *(uint32 *) (m + 20)
- xorl 20(%esi),%ecx
- # *(uint32 *) (out + 16) = in4
- movl %eax,16(%edi)
- # *(uint32 *) (out + 20) = in5
- movl %ecx,20(%edi)
- # in6 = x6
- movl 124(%esp),%eax
- # in7 = x7
- movl 128(%esp),%ecx
- # in6 += j6
- addl 188(%esp),%eax
- # in7 += j7
- addl 192(%esp),%ecx
- # in6 ^= *(uint32 *) (m + 24)
- xorl 24(%esi),%eax
- # in7 ^= *(uint32 *) (m + 28)
- xorl 28(%esi),%ecx
- # *(uint32 *) (out + 24) = in6
- movl %eax,24(%edi)
- # *(uint32 *) (out + 28) = in7
- movl %ecx,28(%edi)
- # in8 = x8
- movl 132(%esp),%eax
- # in9 = x9
- movl 136(%esp),%ecx
- # in8 += j8
- addl 196(%esp),%eax
- # in9 += j9
- addl 200(%esp),%ecx
- # in8 ^= *(uint32 *) (m + 32)
- xorl 32(%esi),%eax
- # in9 ^= *(uint32 *) (m + 36)
- xorl 36(%esi),%ecx
- # *(uint32 *) (out + 32) = in8
- movl %eax,32(%edi)
- # *(uint32 *) (out + 36) = in9
- movl %ecx,36(%edi)
- # in10 = x10
- movl 140(%esp),%eax
- # in11 = x11
- movl 144(%esp),%ecx
- # in10 += j10
- addl 204(%esp),%eax
- # in11 += j11
- addl 208(%esp),%ecx
- # in10 ^= *(uint32 *) (m + 40)
- xorl 40(%esi),%eax
- # in11 ^= *(uint32 *) (m + 44)
- xorl 44(%esi),%ecx
- # *(uint32 *) (out + 40) = in10
- movl %eax,40(%edi)
- # *(uint32 *) (out + 44) = in11
- movl %ecx,44(%edi)
- # in12 = x12
- movl 148(%esp),%eax
- # in13 = x13
- movl 152(%esp),%ecx
- # in12 += j12
- addl 212(%esp),%eax
- # in13 += j13
- addl 216(%esp),%ecx
- # in12 ^= *(uint32 *) (m + 48)
- xorl 48(%esi),%eax
- # in13 ^= *(uint32 *) (m + 52)
- xorl 52(%esi),%ecx
- # *(uint32 *) (out + 48) = in12
- movl %eax,48(%edi)
- # *(uint32 *) (out + 52) = in13
- movl %ecx,52(%edi)
- # in14 = x14
- movl 156(%esp),%eax
- # in15 = x15
- movl 160(%esp),%ecx
- # in14 += j14
- addl 220(%esp),%eax
- # in15 += j15
- addl 224(%esp),%ecx
- # in14 ^= *(uint32 *) (m + 56)
- xorl 56(%esi),%eax
- # in15 ^= *(uint32 *) (m + 60)
- xorl 60(%esi),%ecx
- # *(uint32 *) (out + 56) = in14
- movl %eax,56(%edi)
- # *(uint32 *) (out + 60) = in15
- movl %ecx,60(%edi)
- # bytes = bytes_backup
- movl 76(%esp),%ebx
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in8 += 1
- add $1,%eax
- # in9 += 0 + carry
- adc $0,%ecx
- # j8 = in8
- movl %eax,196(%esp)
- # j9 = in9
- movl %ecx,200(%esp)
- # bytes - 64
- cmp $64,%ebx
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # goto bytesatleast64 if unsigned>=
- jae ._bytesatleast64
- # m = out
- mov %edi,%esi
- # out = ctarget
- movl 228(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
-._bytesatleast64:
- # x = x_backup
- movl 64(%esp),%eax
- # in8 = j8
- movl 196(%esp),%ecx
- # in9 = j9
- movl 200(%esp),%edx
- # *(uint32 *) (x + 32) = in8
- movl %ecx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %edx,36(%eax)
-._done:
- # eax = eax_stack
- movl 80(%esp),%eax
- # ebx = ebx_stack
- movl 84(%esp),%ebx
- # esi = esi_stack
- movl 88(%esp),%esi
- # edi = edi_stack
- movl 92(%esp),%edi
- # ebp = ebp_stack
- movl 96(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-._bytesatleast65:
- # bytes -= 64
- sub $64,%ebx
- # out += 64
- add $64,%edi
- # m += 64
- add $64,%esi
- # goto bytesatleast1
- jmp ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # k = arg2
- movl 8(%esp,%eax),%ecx
- # kbits = arg3
- movl 12(%esp,%eax),%edx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in1 = *(uint32 *) (k + 0)
- movl 0(%ecx),%ebx
- # in2 = *(uint32 *) (k + 4)
- movl 4(%ecx),%esi
- # in3 = *(uint32 *) (k + 8)
- movl 8(%ecx),%edi
- # in4 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ebp
- # *(uint32 *) (x + 4) = in1
- movl %ebx,4(%eax)
- # *(uint32 *) (x + 8) = in2
- movl %esi,8(%eax)
- # *(uint32 *) (x + 12) = in3
- movl %edi,12(%eax)
- # *(uint32 *) (x + 16) = in4
- movl %ebp,16(%eax)
- # kbits - 256
- cmp $256,%edx
- # goto kbits128 if unsigned<
- jb ._kbits128
-._kbits256:
- # in11 = *(uint32 *) (k + 16)
- movl 16(%ecx),%edx
- # in12 = *(uint32 *) (k + 20)
- movl 20(%ecx),%ebx
- # in13 = *(uint32 *) (k + 24)
- movl 24(%ecx),%esi
- # in14 = *(uint32 *) (k + 28)
- movl 28(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 857760878
- mov $857760878,%edx
- # in10 = 2036477234
- mov $2036477234,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
- # goto keysetupdone
- jmp ._keysetupdone
-._kbits128:
- # in11 = *(uint32 *) (k + 0)
- movl 0(%ecx),%edx
- # in12 = *(uint32 *) (k + 4)
- movl 4(%ecx),%ebx
- # in13 = *(uint32 *) (k + 8)
- movl 8(%ecx),%esi
- # in14 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 824206446
- mov $824206446,%edx
- # in10 = 2036477238
- mov $2036477238,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
-._keysetupdone:
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # iv = arg2
- movl 8(%esp,%eax),%ecx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in6 = *(uint32 *) (iv + 0)
- movl 0(%ecx),%edx
- # in7 = *(uint32 *) (iv + 4)
- movl 4(%ecx),%ecx
- # in8 = 0
- mov $0,%ebx
- # in9 = 0
- mov $0,%esi
- # *(uint32 *) (x + 24) = in6
- movl %edx,24(%eax)
- # *(uint32 *) (x + 28) = in7
- movl %ecx,28(%eax)
- # *(uint32 *) (x + 32) = in8
- movl %ebx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %esi,36(%eax)
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-ENDPROC(salsa20_ivsetup)
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,919 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # x = arg1
- mov %rdi,%r8
- # m = arg2
- mov %rsi,%rsi
- # out = arg3
- mov %rdx,%rdi
- # bytes = arg4
- mov %rcx,%rdx
- # unsigned>? bytes - 0
- cmp $0,%rdx
- # comment:fp stack unchanged by jump
- # goto done if !unsigned>
- jbe ._done
- # comment:fp stack unchanged by fallthrough
-# start:
-._start:
- # r11_stack = r11
- movq %r11,0(%rsp)
- # r12_stack = r12
- movq %r12,8(%rsp)
- # r13_stack = r13
- movq %r13,16(%rsp)
- # r14_stack = r14
- movq %r14,24(%rsp)
- # r15_stack = r15
- movq %r15,32(%rsp)
- # rbx_stack = rbx
- movq %rbx,40(%rsp)
- # rbp_stack = rbp
- movq %rbp,48(%rsp)
- # in0 = *(uint64 *) (x + 0)
- movq 0(%r8),%rcx
- # in2 = *(uint64 *) (x + 8)
- movq 8(%r8),%r9
- # in4 = *(uint64 *) (x + 16)
- movq 16(%r8),%rax
- # in6 = *(uint64 *) (x + 24)
- movq 24(%r8),%r10
- # in8 = *(uint64 *) (x + 32)
- movq 32(%r8),%r11
- # in10 = *(uint64 *) (x + 40)
- movq 40(%r8),%r12
- # in12 = *(uint64 *) (x + 48)
- movq 48(%r8),%r13
- # in14 = *(uint64 *) (x + 56)
- movq 56(%r8),%r14
- # j0 = in0
- movq %rcx,56(%rsp)
- # j2 = in2
- movq %r9,64(%rsp)
- # j4 = in4
- movq %rax,72(%rsp)
- # j6 = in6
- movq %r10,80(%rsp)
- # j8 = in8
- movq %r11,88(%rsp)
- # j10 = in10
- movq %r12,96(%rsp)
- # j12 = in12
- movq %r13,104(%rsp)
- # j14 = in14
- movq %r14,112(%rsp)
- # x_backup = x
- movq %r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
- # unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto nocopy if !unsigned<
- jae ._nocopy
- # ctarget = out
- movq %rdi,128(%rsp)
- # out = &tmp
- leaq 192(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leaq 192(%rsp),%rdi
- # m = &tmp
- leaq 192(%rsp),%rsi
- # comment:fp stack unchanged by fallthrough
-# nocopy:
-._nocopy:
- # out_backup = out
- movq %rdi,136(%rsp)
- # m_backup = m
- movq %rsi,144(%rsp)
- # bytes_backup = bytes
- movq %rdx,152(%rsp)
- # x1 = j0
- movq 56(%rsp),%rdi
- # x0 = x1
- mov %rdi,%rdx
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x3 = j2
- movq 64(%rsp),%rsi
- # x2 = x3
- mov %rsi,%rcx
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x5 = j4
- movq 72(%rsp),%r8
- # x4 = x5
- mov %r8,%r9
- # (uint64) x5 >>= 32
- shr $32,%r8
- # x5_stack = x5
- movq %r8,160(%rsp)
- # x7 = j6
- movq 80(%rsp),%r8
- # x6 = x7
- mov %r8,%rax
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x9 = j8
- movq 88(%rsp),%r10
- # x8 = x9
- mov %r10,%r11
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x11 = j10
- movq 96(%rsp),%r12
- # x10 = x11
- mov %r12,%r13
- # x10_stack = x10
- movq %r13,168(%rsp)
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x13 = j12
- movq 104(%rsp),%r13
- # x12 = x13
- mov %r13,%r14
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x15 = j14
- movq 112(%rsp),%r15
- # x14 = x15
- mov %r15,%rbx
- # (uint64) x15 >>= 32
- shr $32,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = 20
- mov $20,%r15
-# mainloop:
-._mainloop:
- # i_backup = i
- movq %r15,184(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = i_backup
- movq 184(%rsp),%r15
- # unsigned>? i -= 4
- sub $4,%r15
- # comment:fp stack unchanged by jump
- # goto mainloop if unsigned>
- ja ._mainloop
- # (uint32) x2 += j2
- addl 64(%rsp),%ecx
- # x3 <<= 32
- shl $32,%rsi
- # x3 += j2
- addq 64(%rsp),%rsi
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x3 <<= 32
- shl $32,%rsi
- # x2 += x3
- add %rsi,%rcx
- # (uint32) x6 += j6
- addl 80(%rsp),%eax
- # x7 <<= 32
- shl $32,%r8
- # x7 += j6
- addq 80(%rsp),%r8
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x7 <<= 32
- shl $32,%r8
- # x6 += x7
- add %r8,%rax
- # (uint32) x8 += j8
- addl 88(%rsp),%r11d
- # x9 <<= 32
- shl $32,%r10
- # x9 += j8
- addq 88(%rsp),%r10
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x9 <<= 32
- shl $32,%r10
- # x8 += x9
- add %r10,%r11
- # (uint32) x12 += j12
- addl 104(%rsp),%r14d
- # x13 <<= 32
- shl $32,%r13
- # x13 += j12
- addq 104(%rsp),%r13
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x13 <<= 32
- shl $32,%r13
- # x12 += x13
- add %r13,%r14
- # (uint32) x0 += j0
- addl 56(%rsp),%edx
- # x1 <<= 32
- shl $32,%rdi
- # x1 += j0
- addq 56(%rsp),%rdi
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x1 <<= 32
- shl $32,%rdi
- # x0 += x1
- add %rdi,%rdx
- # x5 = x5_stack
- movq 160(%rsp),%rdi
- # (uint32) x4 += j4
- addl 72(%rsp),%r9d
- # x5 <<= 32
- shl $32,%rdi
- # x5 += j4
- addq 72(%rsp),%rdi
- # (uint64) x5 >>= 32
- shr $32,%rdi
- # x5 <<= 32
- shl $32,%rdi
- # x4 += x5
- add %rdi,%r9
- # x10 = x10_stack
- movq 168(%rsp),%r8
- # (uint32) x10 += j10
- addl 96(%rsp),%r8d
- # x11 <<= 32
- shl $32,%r12
- # x11 += j10
- addq 96(%rsp),%r12
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x11 <<= 32
- shl $32,%r12
- # x10 += x11
- add %r12,%r8
- # x15 = x15_stack
- movq 176(%rsp),%rdi
- # (uint32) x14 += j14
- addl 112(%rsp),%ebx
- # x15 <<= 32
- shl $32,%rdi
- # x15 += j14
- addq 112(%rsp),%rdi
- # (uint64) x15 >>= 32
- shr $32,%rdi
- # x15 <<= 32
- shl $32,%rdi
- # x14 += x15
- add %rdi,%rbx
- # out = out_backup
- movq 136(%rsp),%rdi
- # m = m_backup
- movq 144(%rsp),%rsi
- # x0 ^= *(uint64 *) (m + 0)
- xorq 0(%rsi),%rdx
- # *(uint64 *) (out + 0) = x0
- movq %rdx,0(%rdi)
- # x2 ^= *(uint64 *) (m + 8)
- xorq 8(%rsi),%rcx
- # *(uint64 *) (out + 8) = x2
- movq %rcx,8(%rdi)
- # x4 ^= *(uint64 *) (m + 16)
- xorq 16(%rsi),%r9
- # *(uint64 *) (out + 16) = x4
- movq %r9,16(%rdi)
- # x6 ^= *(uint64 *) (m + 24)
- xorq 24(%rsi),%rax
- # *(uint64 *) (out + 24) = x6
- movq %rax,24(%rdi)
- # x8 ^= *(uint64 *) (m + 32)
- xorq 32(%rsi),%r11
- # *(uint64 *) (out + 32) = x8
- movq %r11,32(%rdi)
- # x10 ^= *(uint64 *) (m + 40)
- xorq 40(%rsi),%r8
- # *(uint64 *) (out + 40) = x10
- movq %r8,40(%rdi)
- # x12 ^= *(uint64 *) (m + 48)
- xorq 48(%rsi),%r14
- # *(uint64 *) (out + 48) = x12
- movq %r14,48(%rdi)
- # x14 ^= *(uint64 *) (m + 56)
- xorq 56(%rsi),%rbx
- # *(uint64 *) (out + 56) = x14
- movq %rbx,56(%rdi)
- # bytes = bytes_backup
- movq 152(%rsp),%rdx
- # in8 = j8
- movq 88(%rsp),%rcx
- # in8 += 1
- add $1,%rcx
- # j8 = in8
- movq %rcx,88(%rsp)
- # unsigned>? unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # comment:fp stack unchanged by jump
- # goto bytesatleast64 if !unsigned<
- jae ._bytesatleast64
- # m = out
- mov %rdi,%rsi
- # out = ctarget
- movq 128(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # comment:fp stack unchanged by fallthrough
-# bytesatleast64:
-._bytesatleast64:
- # x = x_backup
- movq 120(%rsp),%rdi
- # in8 = j8
- movq 88(%rsp),%rsi
- # *(uint64 *) (x + 32) = in8
- movq %rsi,32(%rdi)
- # r11 = r11_stack
- movq 0(%rsp),%r11
- # r12 = r12_stack
- movq 8(%rsp),%r12
- # r13 = r13_stack
- movq 16(%rsp),%r13
- # r14 = r14_stack
- movq 24(%rsp),%r14
- # r15 = r15_stack
- movq 32(%rsp),%r15
- # rbx = rbx_stack
- movq 40(%rsp),%rbx
- # rbp = rbp_stack
- movq 48(%rsp),%rbp
- # comment:fp stack unchanged by fallthrough
-# done:
-._done:
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-# bytesatleast65:
-._bytesatleast65:
- # bytes -= 64
- sub $64,%rdx
- # out += 64
- add $64,%rdi
- # m += 64
- add $64,%rsi
- # comment:fp stack unchanged by jump
- # goto bytesatleast1
- jmp ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # k = arg2
- mov %rsi,%rsi
- # kbits = arg3
- mov %rdx,%rdx
- # x = arg1
- mov %rdi,%rdi
- # in0 = *(uint64 *) (k + 0)
- movq 0(%rsi),%r8
- # in2 = *(uint64 *) (k + 8)
- movq 8(%rsi),%r9
- # *(uint64 *) (x + 4) = in0
- movq %r8,4(%rdi)
- # *(uint64 *) (x + 12) = in2
- movq %r9,12(%rdi)
- # unsigned<? kbits - 256
- cmp $256,%rdx
- # comment:fp stack unchanged by jump
- # goto kbits128 if unsigned<
- jb ._kbits128
-# kbits256:
-._kbits256:
- # in10 = *(uint64 *) (k + 16)
- movq 16(%rsi),%rdx
- # in12 = *(uint64 *) (k + 24)
- movq 24(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 857760878
- mov $857760878,%rdx
- # in10 = 2036477234
- mov $2036477234,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
- # comment:fp stack unchanged by jump
- # goto keysetupdone
- jmp ._keysetupdone
-# kbits128:
-._kbits128:
- # in10 = *(uint64 *) (k + 0)
- movq 0(%rsi),%rdx
- # in12 = *(uint64 *) (k + 8)
- movq 8(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 824206446
- mov $824206446,%rdx
- # in10 = 2036477238
- mov $2036477238,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
-# keysetupdone:
-._keysetupdone:
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # iv = arg2
- mov %rsi,%rsi
- # x = arg1
- mov %rdi,%rdi
- # in6 = *(uint64 *) (iv + 0)
- movq 0(%rsi),%rsi
- # in8 = 0
- mov $0,%r8
- # *(uint64 *) (x + 24) = in6
- movq %rsi,24(%rdi)
- # *(uint64 *) (x + 32) = in8
- movq %r8,32(%rdi)
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-ENDPROC(salsa20_ivsetup)
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Glue code for optimized assembly version of Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@xxxxxxxxx>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@xxxxxxxx>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/algapi.h>
-#include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE 8U
-#define SALSA20_MIN_KEY_SIZE 16U
-#define SALSA20_MAX_KEY_SIZE 32U
-
-struct salsa20_ctx
-{
- u32 input[16];
-};
-
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
- u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
- const u8 *src, u8 *dst, u32 bytes);
-
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keysize)
-{
- struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
- salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
- return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
-{
- struct blkcipher_walk walk;
- struct crypto_blkcipher *tfm = desc->tfm;
- struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
- int err;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, 64);
-
- salsa20_ivsetup(ctx, walk.iv);
-
- while (walk.nbytes >= 64) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes - (walk.nbytes % 64));
- err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
- }
-
- if (walk.nbytes) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr, walk.nbytes);
- err = blkcipher_walk_done(desc, &walk, 0);
- }
-
- return err;
-}
-
-static struct crypto_alg alg = {
- .cra_name = "salsa20",
- .cra_driver_name = "salsa20-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_type = &crypto_blkcipher_type,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct salsa20_ctx),
- .cra_alignmask = 3,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .setkey = setkey,
- .encrypt = encrypt,
- .decrypt = encrypt,
- .min_keysize = SALSA20_MIN_KEY_SIZE,
- .max_keysize = SALSA20_MAX_KEY_SIZE,
- .ivsize = SALSA20_IV_SIZE,
- }
- }
-};
-
-static int __init init(void)
-{
- return crypto_register_alg(&alg);
-}
-
-static void __exit fini(void)
-{
- crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1324,32 +1324,6 @@ config CRYPTO_SALSA20
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@xxxxxxxx>. See <http://cr.yp.to/snuffle.html>

-config CRYPTO_SALSA20_586
- tristate "Salsa20 stream cipher algorithm (i586)"
- depends on (X86 || UML_X86) && !64BIT
- select CRYPTO_BLKCIPHER
- help
- Salsa20 stream cipher algorithm.
-
- Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
- Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
- The Salsa20 stream cipher algorithm is designed by Daniel J.
- Bernstein <djb@xxxxxxxx>. See <http://cr.yp.to/snuffle.html>
-
-config CRYPTO_SALSA20_X86_64
- tristate "Salsa20 stream cipher algorithm (x86_64)"
- depends on (X86 || UML_X86) && 64BIT
- select CRYPTO_BLKCIPHER
- help
- Salsa20 stream cipher algorithm.
-
- Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
- Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
- The Salsa20 stream cipher algorithm is designed by Daniel J.
- Bernstein <djb@xxxxxxxx>. See <http://cr.yp.to/snuffle.html>
-
config CRYPTO_CHACHA20
tristate "ChaCha20 cipher algorithm"
select CRYPTO_BLKCIPHER