[PATCH net-next v8 05/28] zinc: import Andy Polyakov's ChaCha20 x86_64 implementation

From: Jason A. Donenfeld
Date: Thu Oct 18 2018 - 10:57:57 EST


These x86_64 vectorized implementations come from Andy Polyakov's
implementation, and are included here in raw form without modification,
so that subsequent commits that fix these up for the kernel can see how
it has changed.

While this is CRYPTOGAMS code, the originating code for this happens to
be the same as OpenSSL's commit cded951378069a478391843f5f8653c1eb5128da

Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
Based-on-code-from: Andy Polyakov <appro@xxxxxxxxxxx>
Cc: Andy Polyakov <appro@xxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: x86@xxxxxxxxxx
Cc: Samuel Neves <sneves@xxxxxxxxx>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: kernel-hardening@xxxxxxxxxxxxxxxxxx
Cc: linux-crypto@xxxxxxxxxxxxxxx
---
.../chacha20/chacha20-x86_64-cryptogams.S | 3433 +++++++++++++++++
1 file changed, 3433 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-x86_64-cryptogams.S

diff --git a/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S b/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S
new file mode 100644
index 000000000000..2bfc76f7e01f
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-cryptogams.S
@@ -0,0 +1,3433 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved.
+ */
+
+.text
+
+
+
+.align 64
+.Lzero:
+.long 0,0,0,0
+.Lone:
+.long 1,0,0,0
+.Linc:
+.long 0,1,2,3
+.Lfour:
+.long 4,4,4,4
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Ltwoy:
+.long 2,0,0,0, 2,0,0,0
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,@function
+.align 64
+ChaCha20_ctr32:
+.cfi_startproc
+ cmpq $0,%rdx
+ je .Lno_data
+ movq OPENSSL_ia32cap_P+4(%rip),%r10
+ btq $48,%r10
+ jc .LChaCha20_avx512
+ testq %r10,%r10
+ js .LChaCha20_avx512vl
+ testl $512,%r10d
+ jnz .LChaCha20_ssse3
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $64+24,%rsp
+.cfi_adjust_cfa_offset 64+24
+.Lctr32_body:
+
+
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lone(%rip),%xmm4
+
+
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq %rdx,%rbp
+ jmp .Loop_outer
+
+.align 32
+.Loop_outer:
+ movl $0x61707865,%eax
+ movl $0x3320646e,%ebx
+ movl $0x79622d32,%ecx
+ movl $0x6b206574,%edx
+ movl 16(%rsp),%r8d
+ movl 20(%rsp),%r9d
+ movl 24(%rsp),%r10d
+ movl 28(%rsp),%r11d
+ movd %xmm3,%r12d
+ movl 52(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl 60(%rsp),%r15d
+
+ movq %rbp,64+0(%rsp)
+ movl $10,%ebp
+ movq %rsi,64+8(%rsp)
+.byte 102,72,15,126,214
+ movq %rdi,64+16(%rsp)
+ movq %rsi,%rdi
+ shrq $32,%rdi
+ jmp .Loop
+
+.align 32
+.Loop:
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $16,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $16,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $12,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $12,%r9d
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $8,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $8,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $7,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $7,%r9d
+ movl %esi,32(%rsp)
+ movl %edi,36(%rsp)
+ movl 40(%rsp),%esi
+ movl 44(%rsp),%edi
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $16,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $16,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $12,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $12,%r11d
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $8,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $8,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $7,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $7,%r11d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $16,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $16,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $12,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $12,%r10d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $8,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $8,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $7,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $7,%r10d
+ movl %esi,40(%rsp)
+ movl %edi,44(%rsp)
+ movl 32(%rsp),%esi
+ movl 36(%rsp),%edi
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $16,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $16,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $12,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $12,%r8d
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $8,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $8,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $7,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $7,%r8d
+ decl %ebp
+ jnz .Loop
+ movl %edi,36(%rsp)
+ movl %esi,32(%rsp)
+ movq 64(%rsp),%rbp
+ movdqa %xmm2,%xmm1
+ movq 64+8(%rsp),%rsi
+ paddd %xmm4,%xmm3
+ movq 64+16(%rsp),%rdi
+
+ addl $0x61707865,%eax
+ addl $0x3320646e,%ebx
+ addl $0x79622d32,%ecx
+ addl $0x6b206574,%edx
+ addl 16(%rsp),%r8d
+ addl 20(%rsp),%r9d
+ addl 24(%rsp),%r10d
+ addl 28(%rsp),%r11d
+ addl 48(%rsp),%r12d
+ addl 52(%rsp),%r13d
+ addl 56(%rsp),%r14d
+ addl 60(%rsp),%r15d
+ paddd 32(%rsp),%xmm1
+
+ cmpq $64,%rbp
+ jb .Ltail
+
+ xorl 0(%rsi),%eax
+ xorl 4(%rsi),%ebx
+ xorl 8(%rsi),%ecx
+ xorl 12(%rsi),%edx
+ xorl 16(%rsi),%r8d
+ xorl 20(%rsi),%r9d
+ xorl 24(%rsi),%r10d
+ xorl 28(%rsi),%r11d
+ movdqu 32(%rsi),%xmm0
+ xorl 48(%rsi),%r12d
+ xorl 52(%rsi),%r13d
+ xorl 56(%rsi),%r14d
+ xorl 60(%rsi),%r15d
+ leaq 64(%rsi),%rsi
+ pxor %xmm1,%xmm0
+
+ movdqa %xmm2,32(%rsp)
+ movd %xmm3,48(%rsp)
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movl %r12d,48(%rdi)
+ movl %r13d,52(%rdi)
+ movl %r14d,56(%rdi)
+ movl %r15d,60(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rbp
+ jnz .Loop_outer
+
+ jmp .Ldone
+
+.align 16
+.Ltail:
+ movl %eax,0(%rsp)
+ movl %ebx,4(%rsp)
+ xorq %rbx,%rbx
+ movl %ecx,8(%rsp)
+ movl %edx,12(%rsp)
+ movl %r8d,16(%rsp)
+ movl %r9d,20(%rsp)
+ movl %r10d,24(%rsp)
+ movl %r11d,28(%rsp)
+ movdqa %xmm1,32(%rsp)
+ movl %r12d,48(%rsp)
+ movl %r13d,52(%rsp)
+ movl %r14d,56(%rsp)
+ movl %r15d,60(%rsp)
+
+.Loop_tail:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%edx
+ leaq 1(%rbx),%rbx
+ xorl %edx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rbp
+ jnz .Loop_tail
+
+.Ldone:
+ leaq 64+24+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lno_data:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+.type ChaCha20_ssse3,@function
+.align 32
+ChaCha20_ssse3:
+.cfi_startproc
+.LChaCha20_ssse3:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ testl $2048,%r10d
+ jnz .LChaCha20_4xop
+ cmpq $128,%rdx
+ je .LChaCha20_128
+ ja .LChaCha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.Lssse3_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_ssse3,.-ChaCha20_ssse3
+.type ChaCha20_128,@function
+.align 32
+ChaCha20_128:
+.cfi_startproc
+.LChaCha20_128:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $64+8,%rsp
+ movdqa .Lsigma(%rip),%xmm8
+ movdqu (%rcx),%xmm9
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lone(%rip),%xmm1
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,%xmm11
+ movdqa %xmm9,16(%rsp)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm2,32(%rsp)
+ paddd %xmm3,%xmm1
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_128
+
+.align 32
+.Loop_128:
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,222
+.byte 102,15,56,0,206
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $20,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $12,%xmm4
+ psrld $20,%xmm11
+ por %xmm4,%xmm9
+ pslld $12,%xmm5
+ por %xmm5,%xmm11
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,223
+.byte 102,15,56,0,207
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $25,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $7,%xmm4
+ psrld $25,%xmm11
+ por %xmm4,%xmm9
+ pslld $7,%xmm5
+ por %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm9,%xmm9
+ pshufd $147,%xmm3,%xmm3
+ pshufd $78,%xmm0,%xmm0
+ pshufd $57,%xmm11,%xmm11
+ pshufd $147,%xmm1,%xmm1
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,222
+.byte 102,15,56,0,206
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $20,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $12,%xmm4
+ psrld $20,%xmm11
+ por %xmm4,%xmm9
+ pslld $12,%xmm5
+ por %xmm5,%xmm11
+ paddd %xmm9,%xmm8
+ pxor %xmm8,%xmm3
+ paddd %xmm11,%xmm10
+ pxor %xmm10,%xmm1
+.byte 102,15,56,0,223
+.byte 102,15,56,0,207
+ paddd %xmm3,%xmm2
+ paddd %xmm1,%xmm0
+ pxor %xmm2,%xmm9
+ pxor %xmm0,%xmm11
+ movdqa %xmm9,%xmm4
+ psrld $25,%xmm9
+ movdqa %xmm11,%xmm5
+ pslld $7,%xmm4
+ psrld $25,%xmm11
+ por %xmm4,%xmm9
+ pslld $7,%xmm5
+ por %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm9,%xmm9
+ pshufd $57,%xmm3,%xmm3
+ pshufd $78,%xmm0,%xmm0
+ pshufd $147,%xmm11,%xmm11
+ pshufd $57,%xmm1,%xmm1
+ decq %r8
+ jnz .Loop_128
+ paddd 0(%rsp),%xmm8
+ paddd 16(%rsp),%xmm9
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ paddd .Lone(%rip),%xmm1
+ paddd 0(%rsp),%xmm10
+ paddd 16(%rsp),%xmm11
+ paddd 32(%rsp),%xmm0
+ paddd 48(%rsp),%xmm1
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm8
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm9
+ movdqu 48(%rsi),%xmm5
+ pxor %xmm4,%xmm2
+ movdqu 64(%rsi),%xmm4
+ pxor %xmm5,%xmm3
+ movdqu 80(%rsi),%xmm5
+ pxor %xmm4,%xmm10
+ movdqu 96(%rsi),%xmm4
+ pxor %xmm5,%xmm11
+ movdqu 112(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm1
+
+ movdqu %xmm8,0(%rdi)
+ movdqu %xmm9,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ movdqu %xmm10,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm0,96(%rdi)
+ movdqu %xmm1,112(%rdi)
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L128_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_128,.-ChaCha20_128
+.type ChaCha20_4x,@function
+.align 32
+ChaCha20_4x:
+.cfi_startproc
+.LChaCha20_4x:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ movq %r10,%r11
+ shrq $32,%r10
+ testq $32,%r10
+ jnz .LChaCha20_8x
+ cmpq $192,%rdx
+ ja .Lproceed4x
+
+ andq $71303168,%r11
+ cmpq $4194304,%r11
+ je .Ldo_sse3_after_all
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r10),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,199
+.byte 102,15,56,0,207
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,215
+.byte 102,15,56,0,223
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,223
+.byte 102,15,56,0,199
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,222
+.byte 102,15,56,0,198
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r10,%r10
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_4x,.-ChaCha20_4x
+.type ChaCha20_4xop,@function
+.align 32
+ChaCha20_4xop:
+.cfi_startproc
+.LChaCha20_4xop:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $0x140+8,%rsp
+ vzeroupper
+
+ vmovdqa .Lsigma(%rip),%xmm11
+ vmovdqu (%rcx),%xmm3
+ vmovdqu 16(%rcx),%xmm15
+ vmovdqu (%r8),%xmm7
+ leaq 256(%rsp),%rcx
+
+ vpshufd $0x00,%xmm11,%xmm8
+ vpshufd $0x55,%xmm11,%xmm9
+ vmovdqa %xmm8,64(%rsp)
+ vpshufd $0xaa,%xmm11,%xmm10
+ vmovdqa %xmm9,80(%rsp)
+ vpshufd $0xff,%xmm11,%xmm11
+ vmovdqa %xmm10,96(%rsp)
+ vmovdqa %xmm11,112(%rsp)
+
+ vpshufd $0x00,%xmm3,%xmm0
+ vpshufd $0x55,%xmm3,%xmm1
+ vmovdqa %xmm0,128-256(%rcx)
+ vpshufd $0xaa,%xmm3,%xmm2
+ vmovdqa %xmm1,144-256(%rcx)
+ vpshufd $0xff,%xmm3,%xmm3
+ vmovdqa %xmm2,160-256(%rcx)
+ vmovdqa %xmm3,176-256(%rcx)
+
+ vpshufd $0x00,%xmm15,%xmm12
+ vpshufd $0x55,%xmm15,%xmm13
+ vmovdqa %xmm12,192-256(%rcx)
+ vpshufd $0xaa,%xmm15,%xmm14
+ vmovdqa %xmm13,208-256(%rcx)
+ vpshufd $0xff,%xmm15,%xmm15
+ vmovdqa %xmm14,224-256(%rcx)
+ vmovdqa %xmm15,240-256(%rcx)
+
+ vpshufd $0x00,%xmm7,%xmm4
+ vpshufd $0x55,%xmm7,%xmm5
+ vpaddd .Linc(%rip),%xmm4,%xmm4
+ vpshufd $0xaa,%xmm7,%xmm6
+ vmovdqa %xmm5,272-256(%rcx)
+ vpshufd $0xff,%xmm7,%xmm7
+ vmovdqa %xmm6,288-256(%rcx)
+ vmovdqa %xmm7,304-256(%rcx)
+
+ jmp .Loop_enter4xop
+
+.align 32
+.Loop_outer4xop:
+ vmovdqa 64(%rsp),%xmm8
+ vmovdqa 80(%rsp),%xmm9
+ vmovdqa 96(%rsp),%xmm10
+ vmovdqa 112(%rsp),%xmm11
+ vmovdqa 128-256(%rcx),%xmm0
+ vmovdqa 144-256(%rcx),%xmm1
+ vmovdqa 160-256(%rcx),%xmm2
+ vmovdqa 176-256(%rcx),%xmm3
+ vmovdqa 192-256(%rcx),%xmm12
+ vmovdqa 208-256(%rcx),%xmm13
+ vmovdqa 224-256(%rcx),%xmm14
+ vmovdqa 240-256(%rcx),%xmm15
+ vmovdqa 256-256(%rcx),%xmm4
+ vmovdqa 272-256(%rcx),%xmm5
+ vmovdqa 288-256(%rcx),%xmm6
+ vmovdqa 304-256(%rcx),%xmm7
+ vpaddd .Lfour(%rip),%xmm4,%xmm4
+
+.Loop_enter4xop:
+ movl $10,%eax
+ vmovdqa %xmm4,256-256(%rcx)
+ jmp .Loop4xop
+
+.align 32
+.Loop4xop:
+ vpaddd %xmm0,%xmm8,%xmm8
+ vpaddd %xmm1,%xmm9,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,16
+.byte 143,232,120,194,237,16
+.byte 143,232,120,194,246,16
+.byte 143,232,120,194,255,16
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,12
+.byte 143,232,120,194,201,12
+.byte 143,232,120,194,210,12
+.byte 143,232,120,194,219,12
+ vpaddd %xmm8,%xmm0,%xmm8
+ vpaddd %xmm9,%xmm1,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,8
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+.byte 143,232,120,194,255,8
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,7
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+ vpaddd %xmm1,%xmm8,%xmm8
+ vpaddd %xmm2,%xmm9,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,16
+.byte 143,232,120,194,228,16
+.byte 143,232,120,194,237,16
+.byte 143,232,120,194,246,16
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,12
+.byte 143,232,120,194,210,12
+.byte 143,232,120,194,219,12
+.byte 143,232,120,194,192,12
+ vpaddd %xmm8,%xmm1,%xmm8
+ vpaddd %xmm9,%xmm2,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,8
+.byte 143,232,120,194,228,8
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,192,7
+ decl %eax
+ jnz .Loop4xop
+
+ vpaddd 64(%rsp),%xmm8,%xmm8
+ vpaddd 80(%rsp),%xmm9,%xmm9
+ vpaddd 96(%rsp),%xmm10,%xmm10
+ vpaddd 112(%rsp),%xmm11,%xmm11
+
+ vmovdqa %xmm14,32(%rsp)
+ vmovdqa %xmm15,48(%rsp)
+
+ vpunpckldq %xmm9,%xmm8,%xmm14
+ vpunpckldq %xmm11,%xmm10,%xmm15
+ vpunpckhdq %xmm9,%xmm8,%xmm8
+ vpunpckhdq %xmm11,%xmm10,%xmm10
+ vpunpcklqdq %xmm15,%xmm14,%xmm9
+ vpunpckhqdq %xmm15,%xmm14,%xmm14
+ vpunpcklqdq %xmm10,%xmm8,%xmm11
+ vpunpckhqdq %xmm10,%xmm8,%xmm8
+ vpaddd 128-256(%rcx),%xmm0,%xmm0
+ vpaddd 144-256(%rcx),%xmm1,%xmm1
+ vpaddd 160-256(%rcx),%xmm2,%xmm2
+ vpaddd 176-256(%rcx),%xmm3,%xmm3
+
+ vmovdqa %xmm9,0(%rsp)
+ vmovdqa %xmm14,16(%rsp)
+ vmovdqa 32(%rsp),%xmm9
+ vmovdqa 48(%rsp),%xmm14
+
+ vpunpckldq %xmm1,%xmm0,%xmm10
+ vpunpckldq %xmm3,%xmm2,%xmm15
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm15,%xmm10,%xmm1
+ vpunpckhqdq %xmm15,%xmm10,%xmm10
+ vpunpcklqdq %xmm2,%xmm0,%xmm3
+ vpunpckhqdq %xmm2,%xmm0,%xmm0
+ vpaddd 192-256(%rcx),%xmm12,%xmm12
+ vpaddd 208-256(%rcx),%xmm13,%xmm13
+ vpaddd 224-256(%rcx),%xmm9,%xmm9
+ vpaddd 240-256(%rcx),%xmm14,%xmm14
+
+ vpunpckldq %xmm13,%xmm12,%xmm2
+ vpunpckldq %xmm14,%xmm9,%xmm15
+ vpunpckhdq %xmm13,%xmm12,%xmm12
+ vpunpckhdq %xmm14,%xmm9,%xmm9
+ vpunpcklqdq %xmm15,%xmm2,%xmm13
+ vpunpckhqdq %xmm15,%xmm2,%xmm2
+ vpunpcklqdq %xmm9,%xmm12,%xmm14
+ vpunpckhqdq %xmm9,%xmm12,%xmm12
+ vpaddd 256-256(%rcx),%xmm4,%xmm4
+ vpaddd 272-256(%rcx),%xmm5,%xmm5
+ vpaddd 288-256(%rcx),%xmm6,%xmm6
+ vpaddd 304-256(%rcx),%xmm7,%xmm7
+
+ vpunpckldq %xmm5,%xmm4,%xmm9
+ vpunpckldq %xmm7,%xmm6,%xmm15
+ vpunpckhdq %xmm5,%xmm4,%xmm4
+ vpunpckhdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm15,%xmm9,%xmm5
+ vpunpckhqdq %xmm15,%xmm9,%xmm9
+ vpunpcklqdq %xmm6,%xmm4,%xmm7
+ vpunpckhqdq %xmm6,%xmm4,%xmm4
+ vmovdqa 0(%rsp),%xmm6
+ vmovdqa 16(%rsp),%xmm15
+
+ cmpq $256,%rdx
+ jb .Ltail4xop
+
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+ vpxor 64(%rsi),%xmm8,%xmm8
+ vpxor 80(%rsi),%xmm0,%xmm0
+ vpxor 96(%rsi),%xmm12,%xmm12
+ vpxor 112(%rsi),%xmm4,%xmm4
+ leaq 128(%rsi),%rsi
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ vmovdqu %xmm8,64(%rdi)
+ vmovdqu %xmm0,80(%rdi)
+ vmovdqu %xmm12,96(%rdi)
+ vmovdqu %xmm4,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4xop
+
+ jmp .Ldone4xop
+
+.align 32
+.Ltail4xop:
+ cmpq $192,%rdx
+ jae .L192_or_more4xop
+ cmpq $128,%rdx
+ jae .L128_or_more4xop
+ cmpq $64,%rdx
+ jae .L64_or_more4xop
+
+ xorq %r10,%r10
+ vmovdqa %xmm6,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm13,32(%rsp)
+ vmovdqa %xmm5,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L64_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ je .Ldone4xop
+
+ leaq 64(%rsi),%rsi
+ vmovdqa %xmm15,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm10,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm2,32(%rsp)
+ subq $64,%rdx
+ vmovdqa %xmm9,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L128_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ je .Ldone4xop
+
+ leaq 128(%rsi),%rsi
+ vmovdqa %xmm11,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm3,16(%rsp)
+ leaq 128(%rdi),%rdi
+ vmovdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ vmovdqa %xmm7,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L192_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ je .Ldone4xop
+
+ leaq 64(%rsi),%rsi
+ vmovdqa %xmm8,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm0,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm12,32(%rsp)
+ subq $192,%rdx
+ vmovdqa %xmm4,48(%rsp)
+
+.Loop_tail4xop:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4xop
+
+.Ldone4xop:
+ vzeroupper
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L4xop_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_4xop,.-ChaCha20_4xop
+.type ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:
+.cfi_startproc
+.LChaCha20_8x:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_8x,.-ChaCha20_8x
+.type ChaCha20_avx512,@function
+.align 32
+ChaCha20_avx512:
+.cfi_startproc
+.LChaCha20_avx512:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ cmpq $512,%rdx
+ ja .LChaCha20_16x
+
+ subq $64+8,%rsp
+ vbroadcasti32x4 .Lsigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd .Lzeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 .Lfourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz .Loop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512
+
+ vmovdqu32 %zmm16,0(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.Lavx512_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_avx512,.-ChaCha20_avx512
+.type ChaCha20_avx512vl,@function
+.align 32
+ChaCha20_avx512vl:
+.cfi_startproc
+.LChaCha20_avx512vl:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ cmpq $128,%rdx
+ ja .LChaCha20_8xvl
+
+ subq $64+8,%rsp
+ vbroadcasti128 .Lsigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd .Lzeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 .Ltwoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz .Loop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz .Loop_outer_avx512vl
+
+ jmp .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512vl
+
+ vmovdqu32 %ymm16,0(%rsp)
+ vmovdqu32 %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.Lavx512vl_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_avx512vl,.-ChaCha20_avx512vl
+.type ChaCha20_16x,@function
+.align 32
+ChaCha20_16x:
+.cfi_startproc
+.LChaCha20_16x:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r10
+ vbroadcasti32x4 (%r10),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd .Lincz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r10),%zmm0
+ vpbroadcastd 4(%r10),%zmm1
+ vpbroadcastd 8(%r10),%zmm2
+ vpbroadcastd 12(%r10),%zmm3
+ vpaddd .Lsixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz .Loop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb .Ltail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xorq %r10,%r10
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail16x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L16x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_16x,.-ChaCha20_16x
+.type ChaCha20_8xvl,@function
+.align 32
+ChaCha20_8xvl:
+.cfi_startproc
+.LChaCha20_8xvl:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r10
+ vbroadcasti128 (%r10),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd .Lincy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+ vpbroadcastd 8(%r10),%ymm2
+ vpbroadcastd 12(%r10),%ymm3
+ vpaddd .Leight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz .Loop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb .Ltail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r10),%ymm0
+ vpbroadcastd 4(%r10),%ymm1
+
+ subq $512,%rdx
+ jnz .Loop_outer8xvl
+
+ jmp .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r10,%r10
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail8xvl:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L8xvl_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_8xvl,.-ChaCha20_8xvl
--
2.19.1