[PATCH net-next v8 13/28] zinc: Poly1305 x86_64 implementation

From: Jason A. Donenfeld
Date: Thu Oct 18 2018 - 10:58:20 EST


This ports AVX, AVX-2, and AVX-512F implementations for Poly1305.
The AVX-512F implementation is disabled on Skylake, due to throttling.
These come from Andy Polyakov's implementation, with the following
modifications from Samuel Neves:

- Some cosmetic changes, such as renaming labels to .Lname and adjusting
constants and other details to follow Linux conventions.

- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.

- poly1305_blocks_avx512 jumped to the middle of the poly1305_blocks_avx2
for the final blocks. To appease objtool, the relevant tail avx2 code
was duplicated for the avx512 function.

- The original uses %rbp as a scratch register. However, the kernel
expects %rbp to be a valid frame pointer at any given time in order
to do proper unwinding. Thus we need to alter the code in order to
preserve it. The most straightforward manner in which this was
accomplished was by replacing $d3, formerly %r10, by %rdi, and
replacing %rbp by %r10. Because %rdi, a pointer to the context
structure, does not change and is not used by poly1305_iteration,
it is safe to use it here, and the overhead of saving and restoring
it should be minimal.

- The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
We replace this by "ret". "rep ret" was meant to help with AMD K8
chips, cf. http://repzret.org/p/repzret. It makes no sense to
continue to use this kludge for code that won't even run on ancient
AMD chips.

The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
the unfortunate situation of using AVX and then having to go back to scalar
-- because the user is silly and has called the update function from two
separate contexts -- then we need to convert back to the original base before
proceeding. It is possible to reason that the initial reduction below is
sufficient given the implementation invariants. However, for an avoidance of
doubt and because this is not performance critical, we do the full reduction
anyway. This conversion is found in the glue code, and a proof of
correctness may be easily obtained from Z3: <https://xn--4db.cc/ltPtHCKN/py>.

Cycle counts on a Core i7 6700HQ using the AVX-2 codepath, comparing
this implementation ("new") to the implementation in the current crypto
api ("old"):

size old new
---- ---- ----
0 70 68
16 92 90
32 134 104
48 172 120
64 218 136
80 254 158
96 298 174
112 342 192
128 388 212
144 428 228
160 466 246
176 510 264
192 550 282
208 594 302
224 628 316
240 676 334
256 716 354
272 764 374
288 802 352
304 420 366
320 428 360
336 484 378
352 426 384
368 478 400
384 488 394
400 542 408
416 486 416
432 534 430
448 544 422
464 600 438
480 540 448
496 594 464
512 602 456
528 656 476
544 600 480
560 650 494
576 664 490
592 714 508
608 656 514
624 708 532
640 716 524
656 770 536
672 716 548
688 770 562
704 774 552
720 826 568
736 768 574
752 822 592
768 830 584
784 884 602
800 828 610
816 884 628
832 888 618
848 942 632
864 884 644
880 936 660
896 948 652
912 1000 664
928 942 676
944 994 690
960 1002 680
976 1054 694
992 1002 706
1008 1052 720

Cycle counts on a Xeon Gold 5120 using the AVX-512 codepath:

size old new
---- ---- ----
0 74 70
16 96 92
32 136 106
48 184 124
64 218 138
80 260 160
96 300 176
112 342 194
128 384 212
144 420 226
160 464 248
176 504 264
192 544 282
208 582 300
224 624 318
240 662 338
256 708 358
272 748 372
288 788 358
304 422 370
320 432 364
336 486 380
352 434 390
368 480 408
384 490 398
400 542 412
416 492 426
432 538 436
448 546 432
464 600 448
480 548 456
496 594 476
512 606 470
528 656 480
544 606 498
560 652 512
576 662 508
592 716 522
608 664 538
624 710 552
640 720 516
656 772 526
672 722 544
688 768 556
704 778 556
720 832 568
736 780 584
752 826 600
768 836 560
784 888 572
800 838 588
816 884 604
832 894 598
848 946 612
864 896 628
880 942 644
896 952 608
912 1004 616
928 954 634
944 1000 646
960 1008 646
976 1062 658
992 1012 674
1008 1058 690

Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
Co-developed-by: Samuel Neves <sneves@xxxxxxxxx>
Signed-off-by: Samuel Neves <sneves@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: x86@xxxxxxxxxx
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: kernel-hardening@xxxxxxxxxxxxxxxxxx
Cc: linux-crypto@xxxxxxxxxxxxxxx
---
lib/zinc/Makefile | 1 +
lib/zinc/poly1305/poly1305-x86_64-glue.c | 154 ++
...-x86_64-cryptogams.S => poly1305-x86_64.S} | 2459 ++++++-----------
lib/zinc/poly1305/poly1305.c | 4 +
4 files changed, 1002 insertions(+), 1616 deletions(-)
create mode 100644 lib/zinc/poly1305/poly1305-x86_64-glue.c
rename lib/zinc/poly1305/{poly1305-x86_64-cryptogams.S => poly1305-x86_64.S} (58%)

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 6fc9626c55fa..a8943d960b6a 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -11,4 +11,5 @@ AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o

zinc_poly1305-y := poly1305/poly1305.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
diff --git a/lib/zinc/poly1305/poly1305-x86_64-glue.c b/lib/zinc/poly1305/poly1305-x86_64-glue.c
new file mode 100644
index 000000000000..ccf5f1952503
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-glue.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+asmlinkage void poly1305_init_x86_64(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+static bool *const poly1305_nobs[] __initconst = {
+ &poly1305_use_avx, &poly1305_use_avx2, &poly1305_use_avx512 };
+
+static void __init poly1305_fpu_init(void)
+{
+ poly1305_use_avx =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ poly1305_use_avx2 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ poly1305_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_x86_64(ctx, key);
+ return true;
+}
+
+struct poly1305_arch_internal {
+ union {
+ struct {
+ u32 h[5];
+ u32 is_base2_26;
+ };
+ u64 hs[3];
+ };
+ u64 r[2];
+ u64 pad;
+ struct { u32 r2, r1, r4, r3; } rn[9];
+};
+
+/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway.
+ */
+static void convert_to_base2_64(void *ctx)
+{
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ size_t len, const u32 padbit,
+ simd_context_t *simd_context)
+{
+ struct poly1305_arch_internal *state = ctx;
+
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+ PAGE_SIZE % POLY1305_BLOCK_SIZE);
+
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !poly1305_use_avx ||
+ (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+ !simd_use(simd_context)) {
+ convert_to_base2_64(ctx);
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return true;
+ }
+
+ for (;;) {
+ const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+
+ if (IS_ENABLED(CONFIG_AS_AVX512) && poly1305_use_avx512)
+ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ else if (IS_ENABLED(CONFIG_AS_AVX2) && poly1305_use_avx2)
+ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ else
+ poly1305_blocks_avx(ctx, inp, bytes, padbit);
+ len -= bytes;
+ if (!len)
+ break;
+ inp += bytes;
+ simd_relax(simd_context);
+ }
+
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t *simd_context)
+{
+ struct poly1305_arch_internal *state = ctx;
+
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !poly1305_use_avx ||
+ !state->is_base2_26 || !simd_use(simd_context)) {
+ convert_to_base2_64(ctx);
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ } else
+ poly1305_emit_avx(ctx, mac, nonce);
+ return true;
+}
diff --git a/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S b/lib/zinc/poly1305/poly1305-x86_64.S
similarity index 58%
rename from lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
rename to lib/zinc/poly1305/poly1305-x86_64.S
index ed634757354b..3c3f2b4d880b 100644
--- a/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-x86_64.S
@@ -1,22 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2017 Samuel Neves <sneves@xxxxxxxxx>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/

-.text
-
+#include <linux/linkage.h>

+.section .rodata.cst192.Lconst, "aM", @progbits, 192
+.align 64
+.Lconst:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.long 2,2,2,3,2,0,2,1
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

-.globl poly1305_init
-.hidden poly1305_init
-.globl poly1305_blocks
-.hidden poly1305_blocks
-.globl poly1305_emit
-.hidden poly1305_emit
+.text

-.type poly1305_init,@function
.align 32
-poly1305_init:
+ENTRY(poly1305_init_x86_64)
xorq %rax,%rax
movq %rax,0(%rdi)
movq %rax,8(%rdi)
@@ -25,61 +30,30 @@ poly1305_init:
cmpq $0,%rsi
je .Lno_key

- leaq poly1305_blocks(%rip),%r10
- leaq poly1305_emit(%rip),%r11
- movq OPENSSL_ia32cap_P+4(%rip),%r9
- leaq poly1305_blocks_avx(%rip),%rax
- leaq poly1305_emit_avx(%rip),%rcx
- btq $28,%r9
- cmovcq %rax,%r10
- cmovcq %rcx,%r11
- leaq poly1305_blocks_avx2(%rip),%rax
- btq $37,%r9
- cmovcq %rax,%r10
- movq $2149646336,%rax
- shrq $32,%r9
- andq %rax,%r9
- cmpq %rax,%r9
- je .Linit_base2_44
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
andq 8(%rsi),%rcx
movq %rax,24(%rdi)
movq %rcx,32(%rdi)
- movq %r10,0(%rdx)
- movq %r11,8(%rdx)
movl $1,%eax
.Lno_key:
- .byte 0xf3,0xc3
-.size poly1305_init,.-poly1305_init
+ ret
+ENDPROC(poly1305_init_x86_64)

-.type poly1305_blocks,@function
.align 32
-poly1305_blocks:
-.cfi_startproc
+ENTRY(poly1305_blocks_x86_64)
.Lblocks:
shrq $4,%rdx
jz .Lno_data

pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_body:

movq %rdx,%r15
@@ -89,7 +63,7 @@ poly1305_blocks:

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movq 16(%rdi),%rbp
+ movq 16(%rdi),%r10

movq %r13,%r12
shrq $2,%r13
@@ -99,14 +73,15 @@ poly1305_blocks:

.align 32
.Loop:
+
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
mulq %r14
movq %rax,%r9
movq %r11,%rax
- movq %rdx,%r10
+ movq %rdx,%rdi

mulq %r14
movq %rax,%r14
@@ -116,62 +91,55 @@ poly1305_blocks:
mulq %rbx
addq %rax,%r9
movq %r13,%rax
- adcq %rdx,%r10
+ adcq %rdx,%rdi

mulq %rbx
- movq %rbp,%rbx
+ movq %r10,%rbx
addq %rax,%r14
adcq %rdx,%r8

imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
- adcq $0,%r10
+ adcq $0,%rdi

- imulq %r11,%rbp
+ imulq %r11,%r10
addq %r9,%rbx
movq $-4,%rax
- adcq %rbp,%r10
+ adcq %r10,%rdi

- andq %r10,%rax
- movq %r10,%rbp
- shrq $2,%r10
- andq $3,%rbp
- addq %r10,%rax
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
addq %rax,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
+
movq %r12,%rax
decq %r15
jnz .Loop

+ movq 0(%rsp),%rdi
+
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq %r10,16(%rdi)
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks,.-poly1305_blocks
+ ret
+ENDPROC(poly1305_blocks_x86_64)

-.type poly1305_emit,@function
.align 32
-poly1305_emit:
+ENTRY(poly1305_emit_x86_64)
.Lemit:
movq 0(%rdi),%r8
movq 8(%rdi),%r9
@@ -191,15 +159,14 @@ poly1305_emit:
movq %rax,0(%rsi)
movq %rcx,8(%rsi)

- .byte 0xf3,0xc3
-.size poly1305_emit,.-poly1305_emit
-.type __poly1305_block,@function
-.align 32
-__poly1305_block:
+ ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block
mulq %r14
movq %rax,%r9
movq %r11,%rax
- movq %rdx,%r10
+ movq %rdx,%rdi

mulq %r14
movq %rax,%r14
@@ -209,45 +176,44 @@ __poly1305_block:
mulq %rbx
addq %rax,%r9
movq %r13,%rax
- adcq %rdx,%r10
+ adcq %rdx,%rdi

mulq %rbx
- movq %rbp,%rbx
+ movq %r10,%rbx
addq %rax,%r14
adcq %rdx,%r8

imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
- adcq $0,%r10
+ adcq $0,%rdi

- imulq %r11,%rbp
+ imulq %r11,%r10
addq %r9,%rbx
movq $-4,%rax
- adcq %rbp,%r10
+ adcq %r10,%rdi

- andq %r10,%rax
- movq %r10,%rbp
- shrq $2,%r10
- andq $3,%rbp
- addq %r10,%rax
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
addq %rax,%r14
adcq $0,%rbx
- adcq $0,%rbp
- .byte 0xf3,0xc3
-.size __poly1305_block,.-__poly1305_block
+ adcq $0,%r10
+.endm

-.type __poly1305_init_avx,@function
-.align 32
-__poly1305_init_avx:
+.macro __poly1305_init_avx
movq %r11,%r14
movq %r12,%rbx
- xorq %rbp,%rbp
+ xorq %r10,%r10

leaq 48+64(%rdi),%rdi

movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi

movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
@@ -305,7 +271,7 @@ __poly1305_init_avx:
movl %edx,36(%rdi)
shrq $26,%r9

- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,48(%rdi)
@@ -316,7 +282,9 @@ __poly1305_init_avx:
movl %r9d,68(%rdi)

movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi

movl $0x3ffffff,%eax
movq %r14,%r8
@@ -348,7 +316,7 @@ __poly1305_init_avx:
shrq $26,%r8
movl %edx,44(%rdi)

- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,60(%rdi)
@@ -356,7 +324,9 @@ __poly1305_init_avx:
movl %r8d,76(%rdi)

movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi

movl $0x3ffffff,%eax
movq %r14,%r8
@@ -388,7 +358,7 @@ __poly1305_init_avx:
shrq $26,%r8
movl %edx,40(%rdi)

- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,56(%rdi)
@@ -396,13 +366,12 @@ __poly1305_init_avx:
movl %r8d,72(%rdi)

leaq -48-64(%rdi),%rdi
- .byte 0xf3,0xc3
-.size __poly1305_init_avx,.-__poly1305_init_avx
+.endm

-.type poly1305_blocks_avx,@function
+#ifdef CONFIG_AS_AVX
.align 32
-poly1305_blocks_avx:
-.cfi_startproc
+ENTRY(poly1305_blocks_avx)
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx
@@ -422,30 +391,19 @@ poly1305_blocks_avx:
jz .Leven_avx

pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_avx_body:

movq %rdx,%r15

movq 0(%rdi),%r8
movq 8(%rdi),%r9
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d

movq 24(%rdi),%r11
movq 32(%rdi),%r13
@@ -465,21 +423,21 @@ poly1305_blocks_avx:
addq %r12,%r14
adcq %r9,%rbx

- movq %rbp,%r8
+ movq %r10,%r8
shlq $40,%r8
- shrq $24,%rbp
+ shrq $24,%r10
addq %r8,%rbx
- adcq $0,%rbp
+ adcq $0,%r10

movq $-4,%r9
- movq %rbp,%r8
- andq %rbp,%r9
+ movq %r10,%r8
+ andq %r10,%r9
shrq $2,%r8
- andq $3,%rbp
+ andq $3,%r10
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10

movq %r13,%r12
movq %r13,%rax
@@ -489,9 +447,11 @@ poly1305_blocks_avx:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10

- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi

testq %rcx,%rcx
jz .Lstore_base2_64_avx
@@ -508,11 +468,11 @@ poly1305_blocks_avx:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
- orq %r12,%rbp
+ orq %r12,%r10

subq $16,%r15
jz .Lstore_base2_26_avx
@@ -521,14 +481,14 @@ poly1305_blocks_avx:
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
jmp .Lproceed_avx

.align 32
.Lstore_base2_64_avx:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
+ movq %r10,16(%rdi)
jmp .Ldone_avx

.align 16
@@ -537,49 +497,30 @@ poly1305_blocks_avx:
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
- movl %ebp,16(%rdi)
+ movl %r10d,16(%rdi)
.align 16
.Ldone_avx:
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lno_data_avx:
.Lblocks_avx_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
+ ret

.align 32
.Lbase2_64_avx:
-.cfi_startproc
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lbase2_64_avx_body:

movq %rdx,%r15
@@ -589,7 +530,7 @@ poly1305_blocks_avx:

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d

movq %r13,%r12
movq %r13,%rax
@@ -602,10 +543,12 @@ poly1305_blocks_avx:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15

- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi

.Linit_avx:

@@ -620,46 +563,38 @@ poly1305_blocks_avx:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
- orq %r9,%rbp
+ orq %r9,%r10

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
movl $1,20(%rdi)

- call __poly1305_init_avx
+ __poly1305_init_avx

.Lproceed_avx:
movq %r15,%rdx

- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
-.cfi_endproc
+

.align 32
.Leven_avx:
-.cfi_startproc
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -667,8 +602,10 @@ poly1305_blocks_avx:
vmovd 16(%rdi),%xmm4

.Ldo_avx:
+ leaq 8(%rsp),%r10
+ andq $-32,%rsp
+ subq $8,%rsp
leaq -88(%rsp),%r11
-.cfi_def_cfa %r11,0x60
subq $0x178,%rsp
subq $64,%rdx
leaq -32(%rsi),%rax
@@ -678,8 +615,6 @@ poly1305_blocks_avx:
leaq 112(%rdi),%rdi
leaq .Lconst(%rip),%rcx

-
-
vmovdqu 32(%rsi),%xmm5
vmovdqu 48(%rsi),%xmm6
vmovdqa 64(%rcx),%xmm15
@@ -754,25 +689,6 @@ poly1305_blocks_avx:
.align 32
.Loop_avx:

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %xmm5,%xmm14,%xmm10
vpmuludq %xmm6,%xmm14,%xmm11
vmovdqa %xmm2,32(%r11)
@@ -866,15 +782,6 @@ poly1305_blocks_avx:
subq $64,%rdx
cmovcq %rax,%rsi

-
-
-
-
-
-
-
-
-
vpmuludq %xmm0,%xmm9,%xmm5
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm5,%xmm10,%xmm10
@@ -957,10 +864,6 @@ poly1305_blocks_avx:
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9

-
-
-
-
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4
@@ -995,9 +898,6 @@ poly1305_blocks_avx:
ja .Loop_avx

.Lskip_loop_avx:
-
-
-
vpshufd $0x10,%xmm14,%xmm14
addq $32,%rdx
jnz .Long_tail_avx
@@ -1015,12 +915,6 @@ poly1305_blocks_avx:
vmovdqa %xmm3,48(%r11)
vmovdqa %xmm4,64(%r11)

-
-
-
-
-
-
vpmuludq %xmm7,%xmm14,%xmm12
vpmuludq %xmm5,%xmm14,%xmm10
vpshufd $0x10,-48(%rdi),%xmm2
@@ -1107,9 +1001,6 @@ poly1305_blocks_avx:
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4

-
-
-
vpmuludq %xmm0,%xmm9,%xmm5
vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm1,%xmm9,%xmm6
@@ -1175,8 +1066,6 @@ poly1305_blocks_avx:

.Lshort_tail_avx:

-
-
vpsrldq $8,%xmm14,%xmm9
vpsrldq $8,%xmm13,%xmm8
vpsrldq $8,%xmm11,%xmm6
@@ -1188,9 +1077,6 @@ poly1305_blocks_avx:
vpaddq %xmm6,%xmm11,%xmm11
vpaddq %xmm7,%xmm12,%xmm12

-
-
-
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14
@@ -1227,16 +1113,14 @@ poly1305_blocks_avx:
vmovd %xmm12,-104(%rdi)
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
- leaq 88(%r11),%rsp
-.cfi_def_cfa %rsp,8
+ leaq -8(%r10),%rsp
+
vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx,.-poly1305_blocks_avx
+ ret
+ENDPROC(poly1305_blocks_avx)

-.type poly1305_emit_avx,@function
.align 32
-poly1305_emit_avx:
+ENTRY(poly1305_emit_avx)
cmpl $0,20(%rdi)
je .Lemit

@@ -1286,12 +1170,14 @@ poly1305_emit_avx:
movq %rax,0(%rsi)
movq %rcx,8(%rsi)

- .byte 0xf3,0xc3
-.size poly1305_emit_avx,.-poly1305_emit_avx
-.type poly1305_blocks_avx2,@function
+ ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
.align 32
-poly1305_blocks_avx2:
-.cfi_startproc
+ENTRY(poly1305_blocks_avx2)
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx2
@@ -1311,30 +1197,19 @@ poly1305_blocks_avx2:
jz .Leven_avx2

pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_avx2_body:

movq %rdx,%r15

movq 0(%rdi),%r8
movq 8(%rdi),%r9
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d

movq 24(%rdi),%r11
movq 32(%rdi),%r13
@@ -1354,21 +1229,21 @@ poly1305_blocks_avx2:
addq %r12,%r14
adcq %r9,%rbx

- movq %rbp,%r8
+ movq %r10,%r8
shlq $40,%r8
- shrq $24,%rbp
+ shrq $24,%r10
addq %r8,%rbx
- adcq $0,%rbp
+ adcq $0,%r10

movq $-4,%r9
- movq %rbp,%r8
- andq %rbp,%r9
+ movq %r10,%r8
+ andq %r10,%r9
shrq $2,%r8
- andq $3,%rbp
+ andq $3,%r10
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10

movq %r13,%r12
movq %r13,%rax
@@ -1379,10 +1254,12 @@ poly1305_blocks_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15

- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movq %r12,%rax

testq $63,%r15
@@ -1403,11 +1280,11 @@ poly1305_blocks_avx2:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
- orq %r12,%rbp
+ orq %r12,%r10

testq %r15,%r15
jz .Lstore_base2_26_avx2
@@ -1416,14 +1293,14 @@ poly1305_blocks_avx2:
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
jmp .Lproceed_avx2

.align 32
.Lstore_base2_64_avx2:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
+ movq %r10,16(%rdi)
jmp .Ldone_avx2

.align 16
@@ -1432,49 +1309,32 @@ poly1305_blocks_avx2:
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
- movl %ebp,16(%rdi)
+ movl %r10d,16(%rdi)
.align 16
.Ldone_avx2:
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
+ ret
+

.align 32
.Lbase2_64_avx2:
-.cfi_startproc
+
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lbase2_64_avx2_body:

movq %rdx,%r15
@@ -1484,7 +1344,7 @@ poly1305_blocks_avx2:

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d

movq %r13,%r12
movq %r13,%rax
@@ -1498,10 +1358,12 @@ poly1305_blocks_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15

- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movq %r12,%rax

testq $63,%r15
@@ -1520,49 +1382,39 @@ poly1305_blocks_avx2:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
- orq %r9,%rbp
+ orq %r9,%r10

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
movl $1,20(%rdi)

- call __poly1305_init_avx
+ __poly1305_init_avx

.Lproceed_avx2:
movq %r15,%rdx
- movl OPENSSL_ia32cap_P+8(%rip),%r10d
- movl $3221291008,%r11d
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
-.cfi_endproc
+

.align 32
.Leven_avx2:
-.cfi_startproc
- movl OPENSSL_ia32cap_P+8(%rip),%r10d
+
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -1570,14 +1422,7 @@ poly1305_blocks_avx2:
vmovd 16(%rdi),%xmm4

.Ldo_avx2:
- cmpq $512,%rdx
- jb .Lskip_avx512
- andl %r11d,%r10d
- testl $65536,%r10d
- jnz .Lblocks_avx512
-.Lskip_avx512:
- leaq -8(%rsp),%r11
-.cfi_def_cfa %r11,16
+ leaq 8(%rsp),%r10
subq $0x128,%rsp
leaq .Lconst(%rip),%rcx
leaq 48+64(%rdi),%rdi
@@ -1647,13 +1492,6 @@ poly1305_blocks_avx2:
.align 32
.Loop_avx2:

-
-
-
-
-
-
-
vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
@@ -1664,21 +1502,6 @@ poly1305_blocks_avx2:
vmovdqa 48(%rax),%ymm10
vmovdqa 112(%rax),%ymm5

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
@@ -1743,9 +1566,6 @@ poly1305_blocks_avx2:
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0

-
-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
@@ -1798,12 +1618,6 @@ poly1305_blocks_avx2:
.byte 0x66,0x90
.Ltail_avx2:

-
-
-
-
-
-
vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
@@ -1868,9 +1682,6 @@ poly1305_blocks_avx2:
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0

-
-
-
vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
@@ -1893,9 +1704,6 @@ poly1305_blocks_avx2:
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2

-
-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
@@ -1932,110 +1740,673 @@ poly1305_blocks_avx2:
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
- leaq 8(%r11),%rsp
-.cfi_def_cfa %rsp,8
+ leaq -8(%r10),%rsp
+
vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
-.type poly1305_blocks_avx512,@function
+ ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
.align 32
-poly1305_blocks_avx512:
-.cfi_startproc
-.Lblocks_avx512:
- movl $15,%eax
- kmovw %eax,%k2
- leaq -8(%rsp),%r11
-.cfi_def_cfa %r11,16
- subq $0x128,%rsp
- leaq .Lconst(%rip),%rcx
- leaq 48+64(%rdi),%rdi
- vmovdqa 96(%rcx),%ymm9
+ENTRY(poly1305_blocks_avx512)

+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2_512
+ testl %r8d,%r8d
+ jz .Lblocks

- vmovdqu -64(%rdi),%xmm11
- andq $-512,%rsp
- vmovdqu -48(%rdi),%xmm12
- movq $0x20,%rax
- vmovdqu -32(%rdi),%xmm7
- vmovdqu -16(%rdi),%xmm13
- vmovdqu 0(%rdi),%xmm8
- vmovdqu 16(%rdi),%xmm14
- vmovdqu 32(%rdi),%xmm10
- vmovdqu 48(%rdi),%xmm15
- vmovdqu 64(%rdi),%xmm6
- vpermd %zmm11,%zmm9,%zmm16
- vpbroadcastq 64(%rcx),%zmm5
- vpermd %zmm12,%zmm9,%zmm17
- vpermd %zmm7,%zmm9,%zmm21
- vpermd %zmm13,%zmm9,%zmm18
- vmovdqa64 %zmm16,0(%rsp){%k2}
- vpsrlq $32,%zmm16,%zmm7
- vpermd %zmm8,%zmm9,%zmm22
- vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
- vpsrlq $32,%zmm17,%zmm8
- vpermd %zmm14,%zmm9,%zmm19
- vmovdqa64 %zmm21,64(%rsp){%k2}
- vpermd %zmm10,%zmm9,%zmm23
- vpermd %zmm15,%zmm9,%zmm20
- vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
- vpermd %zmm6,%zmm9,%zmm24
- vmovdqa64 %zmm22,128(%rsp){%k2}
- vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm23,192(%rsp){%k2}
- vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm24,256(%rsp){%k2}
+.Lblocks_avx2_512:
+ andq $-16,%rdx
+ jz .Lno_data_avx2_512

+ vzeroupper

+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2_512

+ testq $63,%rdx
+ jz .Leven_avx2_512

+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi

+.Lblocks_avx2_body_512:

+ movq %rdx,%r15

+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d

+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13


- vpmuludq %zmm7,%zmm16,%zmm11
- vpmuludq %zmm7,%zmm17,%zmm12
- vpmuludq %zmm7,%zmm18,%zmm13
- vpmuludq %zmm7,%zmm19,%zmm14
- vpmuludq %zmm7,%zmm20,%zmm15
- vpsrlq $32,%zmm18,%zmm9
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9

- vpmuludq %zmm8,%zmm24,%zmm25
- vpmuludq %zmm8,%zmm16,%zmm26
- vpmuludq %zmm8,%zmm17,%zmm27
- vpmuludq %zmm8,%zmm18,%zmm28
- vpmuludq %zmm8,%zmm19,%zmm29
- vpsrlq $32,%zmm19,%zmm10
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx

- vpmuludq %zmm9,%zmm23,%zmm25
- vpmuludq %zmm9,%zmm24,%zmm26
- vpmuludq %zmm9,%zmm17,%zmm28
- vpmuludq %zmm9,%zmm18,%zmm29
- vpmuludq %zmm9,%zmm16,%zmm27
- vpsrlq $32,%zmm20,%zmm6
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm27,%zmm13,%zmm13
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10

- vpmuludq %zmm10,%zmm22,%zmm25
- vpmuludq %zmm10,%zmm16,%zmm28
- vpmuludq %zmm10,%zmm17,%zmm29
- vpmuludq %zmm10,%zmm23,%zmm26
- vpmuludq %zmm10,%zmm24,%zmm27
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2_512
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2_512
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2_512
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2_512
+
+.align 32
+.Lstore_base2_64_avx2_512:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2_512
+
+.align 16
+.Lstore_base2_26_avx2_512:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2_512:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2_512:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2_512
+
+.Lbase2_64_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2_512:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+ jmp .Ldo_avx2_512
+
+
+.align 32
+.Leven_avx2_512:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+ cmpq $512,%rdx
+ jae .Lblocks_avx512
+.Lskip_avx512:
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2_512
+ jmp .Loop_avx2_512
+
+.align 32
+.Loop_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2_512
+
+.byte 0x66,0x90
+.Ltail_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+
+.Lblocks_avx512:
+
+ movl $15,%eax
+ kmovw %eax,%k2
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm9
+
+ vmovdqu32 -64(%rdi),%zmm16{%k2}{z}
+ andq $-512,%rsp
+ vmovdqu32 -48(%rdi),%zmm17{%k2}{z}
+ movq $0x20,%rax
+ vmovdqu32 -32(%rdi),%zmm21{%k2}{z}
+ vmovdqu32 -16(%rdi),%zmm18{%k2}{z}
+ vmovdqu32 0(%rdi),%zmm22{%k2}{z}
+ vmovdqu32 16(%rdi),%zmm19{%k2}{z}
+ vmovdqu32 32(%rdi),%zmm23{%k2}{z}
+ vmovdqu32 48(%rdi),%zmm20{%k2}{z}
+ vmovdqu32 64(%rdi),%zmm24{%k2}{z}
+ vpermd %zmm16,%zmm9,%zmm16
+ vpbroadcastq 64(%rcx),%zmm5
+ vpermd %zmm17,%zmm9,%zmm17
+ vpermd %zmm21,%zmm9,%zmm21
+ vpermd %zmm18,%zmm9,%zmm18
+ vmovdqa64 %zmm16,0(%rsp){%k2}
+ vpsrlq $32,%zmm16,%zmm7
+ vpermd %zmm22,%zmm9,%zmm22
+ vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
+ vpsrlq $32,%zmm17,%zmm8
+ vpermd %zmm19,%zmm9,%zmm19
+ vmovdqa64 %zmm21,64(%rsp){%k2}
+ vpermd %zmm23,%zmm9,%zmm23
+ vpermd %zmm20,%zmm9,%zmm20
+ vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
+ vpermd %zmm24,%zmm9,%zmm24
+ vmovdqa64 %zmm22,128(%rsp){%k2}
+ vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm23,192(%rsp){%k2}
+ vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm24,256(%rsp){%k2}
+
+ vpmuludq %zmm7,%zmm16,%zmm11
+ vpmuludq %zmm7,%zmm17,%zmm12
+ vpmuludq %zmm7,%zmm18,%zmm13
+ vpmuludq %zmm7,%zmm19,%zmm14
+ vpmuludq %zmm7,%zmm20,%zmm15
+ vpsrlq $32,%zmm18,%zmm9
+
+ vpmuludq %zmm8,%zmm24,%zmm25
+ vpmuludq %zmm8,%zmm16,%zmm26
+ vpmuludq %zmm8,%zmm17,%zmm27
+ vpmuludq %zmm8,%zmm18,%zmm28
+ vpmuludq %zmm8,%zmm19,%zmm29
+ vpsrlq $32,%zmm19,%zmm10
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+
+ vpmuludq %zmm9,%zmm23,%zmm25
+ vpmuludq %zmm9,%zmm24,%zmm26
+ vpmuludq %zmm9,%zmm17,%zmm28
+ vpmuludq %zmm9,%zmm18,%zmm29
+ vpmuludq %zmm9,%zmm16,%zmm27
+ vpsrlq $32,%zmm20,%zmm6
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm10,%zmm22,%zmm25
+ vpmuludq %zmm10,%zmm16,%zmm28
+ vpmuludq %zmm10,%zmm17,%zmm29
+ vpmuludq %zmm10,%zmm23,%zmm26
+ vpmuludq %zmm10,%zmm24,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13

vpmuludq %zmm6,%zmm24,%zmm28
vpmuludq %zmm6,%zmm16,%zmm29
@@ -2048,15 +2419,10 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13

-
-
vmovdqu64 0(%rsi),%zmm10
vmovdqu64 64(%rsi),%zmm6
leaq 128(%rsi),%rsi

-
-
-
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15
@@ -2088,18 +2454,9 @@ poly1305_blocks_avx512:
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15

-
-
-
-
vpunpcklqdq %zmm6,%zmm10,%zmm7
vpunpckhqdq %zmm6,%zmm10,%zmm6

-
-
-
-
-
vmovdqa32 128(%rcx),%zmm25
movl $0x7777,%eax
kmovw %eax,%k1
@@ -2136,9 +2493,6 @@ poly1305_blocks_avx512:
vpandq %zmm5,%zmm9,%zmm9
vpandq %zmm5,%zmm7,%zmm7

-
-
-
vpaddq %zmm2,%zmm9,%zmm2
subq $192,%rdx
jbe .Ltail_avx512
@@ -2147,33 +2501,6 @@ poly1305_blocks_avx512:
.align 32
.Loop_avx512:

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %zmm2,%zmm17,%zmm14
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm18,%zmm15
@@ -2238,9 +2565,6 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2

-
-
-
vpsrlq $52,%zmm7,%zmm9
vpsllq $12,%zmm6,%zmm10

@@ -2288,18 +2612,11 @@ poly1305_blocks_avx512:

vpandq %zmm5,%zmm7,%zmm7

-
-
-
subq $128,%rdx
ja .Loop_avx512

.Ltail_avx512:

-
-
-
-
vpsrlq $32,%zmm16,%zmm16
vpsrlq $32,%zmm17,%zmm17
vpsrlq $32,%zmm18,%zmm18
@@ -2310,11 +2627,8 @@ poly1305_blocks_avx512:
vpsrlq $32,%zmm21,%zmm21
vpsrlq $32,%zmm22,%zmm22

-
-
leaq (%rsi,%rdx,1),%rsi

-
vpaddq %zmm0,%zmm7,%zmm0

vpmuludq %zmm2,%zmm17,%zmm14
@@ -2378,9 +2692,6 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2

-
-
-
movl $1,%eax
vpermq $0xb1,%zmm3,%zmm14
vpermq $0xb1,%zmm15,%zmm4
@@ -2416,8 +2727,6 @@ poly1305_blocks_avx512:
vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}

-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpsrldq $6,%ymm7,%ymm9
@@ -2466,7 +2775,7 @@ poly1305_blocks_avx512:

leaq 144(%rsp),%rax
addq $64,%rdx
- jnz .Ltail_avx2
+ jnz .Ltail_avx2_512

vpsubq %ymm9,%ymm2,%ymm2
vmovd %xmm0,-112(%rdi)
@@ -2475,1091 +2784,9 @@ poly1305_blocks_avx512:
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
vzeroall
- leaq 8(%r11),%rsp
-.cfi_def_cfa %rsp,8
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
-.type poly1305_init_base2_44,@function
-.align 32
-poly1305_init_base2_44:
- xorq %rax,%rax
- movq %rax,0(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
-
-.Linit_base2_44:
- leaq poly1305_blocks_vpmadd52(%rip),%r10
- leaq poly1305_emit_base2_44(%rip),%r11
-
- movq $0x0ffffffc0fffffff,%rax
- movq $0x0ffffffc0ffffffc,%rcx
- andq 0(%rsi),%rax
- movq $0x00000fffffffffff,%r8
- andq 8(%rsi),%rcx
- movq $0x00000fffffffffff,%r9
- andq %rax,%r8
- shrdq $44,%rcx,%rax
- movq %r8,40(%rdi)
- andq %r9,%rax
- shrq $24,%rcx
- movq %rax,48(%rdi)
- leaq (%rax,%rax,4),%rax
- movq %rcx,56(%rdi)
- shlq $2,%rax
- leaq (%rcx,%rcx,4),%rcx
- shlq $2,%rcx
- movq %rax,24(%rdi)
- movq %rcx,32(%rdi)
- movq $-1,64(%rdi)
- movq %r10,0(%rdx)
- movq %r11,8(%rdx)
- movl $1,%eax
- .byte 0xf3,0xc3
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-.type poly1305_blocks_vpmadd52,@function
-.align 32
-poly1305_blocks_vpmadd52:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
-
-
-
-
-
- movq $3,%rax
- movq $1,%r10
- cmpq $4,%rdx
- cmovaeq %r10,%rax
- testq %r8,%r8
- cmovnsq %r10,%rax
-
- andq %rdx,%rax
- jz .Lblocks_vpmadd52_4x
-
- subq %rax,%rdx
- movl $7,%r10d
- movl $1,%r11d
- kmovw %r10d,%k7
- leaq .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq %rcx,%xmm21
- vmovdqa64 0(%r10),%ymm19
- vmovdqa64 32(%r10),%ymm20
- vpermq $0xcf,%ymm21,%ymm21
- vmovdqa64 64(%r10),%ymm22
-
- vmovdqu64 0(%rdi),%ymm16{%k7}{z}
- vmovdqu64 40(%rdi),%ymm3{%k7}{z}
- vmovdqu64 32(%rdi),%ymm4{%k7}{z}
- vmovdqu64 24(%rdi),%ymm5{%k7}{z}
-
- vmovdqa64 96(%r10),%ymm23
- vmovdqa64 128(%r10),%ymm24
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0(%rsi),%xmm18
- leaq 16(%rsi),%rsi
-
- vpermd %ymm18,%ymm19,%ymm18
- vpsrlvq %ymm20,%ymm18,%ymm18
- vpandq %ymm22,%ymm18,%ymm18
- vporq %ymm21,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- vpermq $0,%ymm16,%ymm0{%k7}{z}
- vpermq $85,%ymm16,%ymm1{%k7}{z}
- vpermq $170,%ymm16,%ymm2{%k7}{z}
-
- vpxord %ymm16,%ymm16,%ymm16
- vpxord %ymm17,%ymm17,%ymm17
-
- vpmadd52luq %ymm3,%ymm0,%ymm16
- vpmadd52huq %ymm3,%ymm0,%ymm17
-
- vpmadd52luq %ymm4,%ymm1,%ymm16
- vpmadd52huq %ymm4,%ymm1,%ymm17
-
- vpmadd52luq %ymm5,%ymm2,%ymm16
- vpmadd52huq %ymm5,%ymm2,%ymm17
-
- vpsrlvq %ymm23,%ymm16,%ymm18
- vpsllvq %ymm24,%ymm17,%ymm17
- vpandq %ymm22,%ymm16,%ymm16
-
- vpaddq %ymm18,%ymm17,%ymm17
-
- vpermq $147,%ymm17,%ymm17
-
- vpaddq %ymm17,%ymm16,%ymm16
-
- vpsrlvq %ymm23,%ymm16,%ymm18
- vpandq %ymm22,%ymm16,%ymm16
-
- vpermq $147,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- vpermq $147,%ymm16,%ymm18{%k1}{z}
-
- vpaddq %ymm18,%ymm16,%ymm16
- vpsllq $2,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- decq %rax
- jnz .Loop_vpmadd52
-
- vmovdqu64 %ymm16,0(%rdi){%k7}
-
- testq %rdx,%rdx
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-.type poly1305_blocks_vpmadd52_4x,@function
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52_4x
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq %rcx,%ymm31
-
- vmovdqa64 .Lx_mask44(%rip),%ymm28
- movl $5,%eax
- vmovdqa64 .Lx_mask42(%rip),%ymm29
- kmovw %eax,%k1
-
- testq %r8,%r8
- js .Linit_vpmadd52
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
- testq $3,%rdx
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64(%rdi),%ymm3
- vpbroadcastq 96(%rdi),%ymm4
- vpbroadcastq 128(%rdi),%ymm5
- vpbroadcastq 160(%rdi),%ymm16
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
- testq $7,%rdx
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 0(%rsi),%ymm26
- vmovdqu64 32(%rsi),%ymm27
- leaq 64(%rsi),%rsi
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
-
-
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm26,%ymm2,%ymm2
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- subq $4,%rdx
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24(%rdi),%xmm16
- vmovq 56(%rdi),%xmm2
- vmovq 32(%rdi),%xmm17
- vmovq 40(%rdi),%xmm3
- vmovq 48(%rdi),%xmm4
-
- vmovdqa %ymm3,%ymm0
- vmovdqa %ymm4,%ymm1
- vmovdqa %ymm2,%ymm5
-
- movl $2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
- decl %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq %ymm4,%ymm1,%ymm4
- vpbroadcastq %xmm1,%xmm1
- vpunpcklqdq %ymm5,%ymm2,%ymm5
- vpbroadcastq %xmm2,%xmm2
- vpunpcklqdq %ymm3,%ymm0,%ymm3
- vpbroadcastq %xmm0,%xmm0
-
- vpsllq $2,%ymm4,%ymm16
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm4,%ymm16,%ymm16
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm16,%ymm16
- vpsllq $2,%ymm17,%ymm17
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 $1,%xmm4,%ymm1,%ymm4
- vinserti128 $1,%xmm5,%ymm2,%ymm5
- vinserti128 $1,%xmm3,%ymm0,%ymm3
-
- vpermq $216,%ymm4,%ymm4
- vpermq $216,%ymm5,%ymm5
- vpermq $216,%ymm3,%ymm3
-
- vpsllq $2,%ymm4,%ymm16
- vpaddq %ymm4,%ymm16,%ymm16
- vpsllq $2,%ymm16,%ymm16
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
- testq $3,%rdx
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 %ymm3,64(%rdi)
- vpbroadcastq %xmm3,%ymm3
- vmovdqu64 %ymm4,96(%rdi)
- vpbroadcastq %xmm4,%ymm4
- vmovdqu64 %ymm5,128(%rdi)
- vpbroadcastq %xmm5,%ymm5
- vmovdqu64 %ymm16,160(%rdi)
- vpbroadcastq %xmm16,%ymm16
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 %ymm3,64(%rdi)
- vpsrldq $8,%ymm3,%ymm3
- vmovdqu64 %ymm4,96(%rdi)
- vpsrldq $8,%ymm4,%ymm4
- vmovdqu64 %ymm5,128(%rdi)
- vpsrldq $8,%ymm5,%ymm5
- vmovdqu64 %ymm16,160(%rdi)
- vpsrldq $8,%ymm16,%ymm16
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8(%rdi),%ymm5{%k1}{z}
- vmovdqu64 160+8(%rdi),%ymm16{%k1}{z}
- vmovdqu64 64+8(%rdi),%ymm3{%k1}{z}
- vmovdqu64 96+8(%rdi),%ymm4{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 0(%rsi),%ymm26
- vpxorq %ymm27,%ymm27,%ymm27
- leaq 32(%rsi),%rsi
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
-
-
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm26,%ymm2,%ymm2
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
-
- vpaddq %ymm24,%ymm0,%ymm0
- vpaddq %ymm25,%ymm1,%ymm1
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vmovdqu64 0(%rsi),%ymm26
- vmovdqu64 32(%rsi),%ymm27
- leaq 64(%rsi),%rsi
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm26,%ymm2,%ymm2
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
- subq $4,%rdx
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128(%rdi),%ymm5
- vmovdqu64 160(%rdi),%ymm16
- vmovdqu64 64(%rdi),%ymm3
- vmovdqu64 96(%rdi),%ymm4
-
-.Ltail_vpmadd52_2x:
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
-
- vpaddq %ymm24,%ymm0,%ymm0
- vpaddq %ymm25,%ymm1,%ymm1
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
-
- movl $1,%eax
- kmovw %eax,%k1
- vpsrldq $8,%ymm18,%ymm24
- vpsrldq $8,%ymm19,%ymm0
- vpsrldq $8,%ymm20,%ymm25
- vpsrldq $8,%ymm21,%ymm1
- vpaddq %ymm24,%ymm18,%ymm18
- vpaddq %ymm0,%ymm19,%ymm19
- vpsrldq $8,%ymm22,%ymm26
- vpsrldq $8,%ymm23,%ymm2
- vpaddq %ymm25,%ymm20,%ymm20
- vpaddq %ymm1,%ymm21,%ymm21
- vpermq $0x2,%ymm18,%ymm24
- vpermq $0x2,%ymm19,%ymm0
- vpaddq %ymm26,%ymm22,%ymm22
- vpaddq %ymm2,%ymm23,%ymm23
-
- vpermq $0x2,%ymm20,%ymm25
- vpermq $0x2,%ymm21,%ymm1
- vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
- vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
- vpermq $0x2,%ymm22,%ymm26
- vpermq $0x2,%ymm23,%ymm2
- vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
- vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
- vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
- vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
-
- subq $2,%rdx
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %xmm0,0(%rdi)
- vmovq %xmm1,8(%rdi)
- vmovq %xmm2,16(%rdi)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-.type poly1305_blocks_vpmadd52_8x,@function
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52_8x
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
- vmovdqa64 .Lx_mask44(%rip),%ymm28
- vmovdqa64 .Lx_mask42(%rip),%ymm29
-
- testq %r8,%r8
- js .Linit_vpmadd52
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
-.Lblocks_vpmadd52_8x:
-
-
-
- vmovdqu64 128(%rdi),%ymm5
- vmovdqu64 160(%rdi),%ymm16
- vmovdqu64 64(%rdi),%ymm3
- vmovdqu64 96(%rdi),%ymm4
-
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
- vpbroadcastq %xmm5,%ymm8
- vpbroadcastq %xmm3,%ymm6
- vpbroadcastq %xmm4,%ymm7
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm8,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm8,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm8,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm8,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm8,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm8,%ymm3,%ymm23
-
- vpmadd52luq %ymm6,%ymm3,%ymm18
- vpmadd52huq %ymm6,%ymm3,%ymm19
- vpmadd52luq %ymm6,%ymm4,%ymm20
- vpmadd52huq %ymm6,%ymm4,%ymm21
- vpmadd52luq %ymm6,%ymm5,%ymm22
- vpmadd52huq %ymm6,%ymm5,%ymm23
-
- vpmadd52luq %ymm7,%ymm17,%ymm18
- vpmadd52huq %ymm7,%ymm17,%ymm19
- vpmadd52luq %ymm7,%ymm3,%ymm20
- vpmadd52huq %ymm7,%ymm3,%ymm21
- vpmadd52luq %ymm7,%ymm4,%ymm22
- vpmadd52huq %ymm7,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm6
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm7
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm8
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm6,%ymm6
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm6,%ymm6
-
- vpsrlq $44,%ymm6,%ymm30
- vpandq %ymm28,%ymm6,%ymm6
-
- vpaddq %ymm30,%ymm7,%ymm7
-
-
-
-
-
- vpunpcklqdq %ymm5,%ymm8,%ymm26
- vpunpckhqdq %ymm5,%ymm8,%ymm5
- vpunpcklqdq %ymm3,%ymm6,%ymm24
- vpunpckhqdq %ymm3,%ymm6,%ymm3
- vpunpcklqdq %ymm4,%ymm7,%ymm25
- vpunpckhqdq %ymm4,%ymm7,%ymm4
- vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8
- vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6
- vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7
-
- vmovdqu64 0(%rsi),%zmm26
- vmovdqu64 64(%rsi),%zmm27
- leaq 128(%rsi),%rsi
-
- vpsllq $2,%zmm8,%zmm10
- vpsllq $2,%zmm7,%zmm9
- vpaddq %zmm8,%zmm10,%zmm10
- vpaddq %zmm7,%zmm9,%zmm9
- vpsllq $2,%zmm10,%zmm10
- vpsllq $2,%zmm9,%zmm9
-
- vpbroadcastq %rcx,%zmm31
- vpbroadcastq %xmm28,%zmm28
- vpbroadcastq %xmm29,%zmm29
-
- vpbroadcastq %xmm9,%zmm16
- vpbroadcastq %xmm10,%zmm17
- vpbroadcastq %xmm6,%zmm3
- vpbroadcastq %xmm7,%zmm4
- vpbroadcastq %xmm8,%zmm5
-
- vpunpcklqdq %zmm27,%zmm26,%zmm25
- vpunpckhqdq %zmm27,%zmm26,%zmm27
-
-
-
- vpsrlq $24,%zmm27,%zmm26
- vporq %zmm31,%zmm26,%zmm26
- vpaddq %zmm26,%zmm2,%zmm2
- vpandq %zmm28,%zmm25,%zmm24
- vpsrlq $44,%zmm25,%zmm25
- vpsllq $20,%zmm27,%zmm27
- vporq %zmm27,%zmm25,%zmm25
- vpandq %zmm28,%zmm25,%zmm25
-
- subq $8,%rdx
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
-
- vpaddq %zmm24,%zmm0,%zmm0
- vpaddq %zmm25,%zmm1,%zmm1
-
- vpxorq %zmm18,%zmm18,%zmm18
- vpmadd52luq %zmm2,%zmm16,%zmm18
- vpxorq %zmm19,%zmm19,%zmm19
- vpmadd52huq %zmm2,%zmm16,%zmm19
- vpxorq %zmm20,%zmm20,%zmm20
- vpmadd52luq %zmm2,%zmm17,%zmm20
- vpxorq %zmm21,%zmm21,%zmm21
- vpmadd52huq %zmm2,%zmm17,%zmm21
- vpxorq %zmm22,%zmm22,%zmm22
- vpmadd52luq %zmm2,%zmm3,%zmm22
- vpxorq %zmm23,%zmm23,%zmm23
- vpmadd52huq %zmm2,%zmm3,%zmm23
-
- vmovdqu64 0(%rsi),%zmm26
- vmovdqu64 64(%rsi),%zmm27
- leaq 128(%rsi),%rsi
- vpmadd52luq %zmm0,%zmm3,%zmm18
- vpmadd52huq %zmm0,%zmm3,%zmm19
- vpmadd52luq %zmm0,%zmm4,%zmm20
- vpmadd52huq %zmm0,%zmm4,%zmm21
- vpmadd52luq %zmm0,%zmm5,%zmm22
- vpmadd52huq %zmm0,%zmm5,%zmm23
-
- vpunpcklqdq %zmm27,%zmm26,%zmm25
- vpunpckhqdq %zmm27,%zmm26,%zmm27
- vpmadd52luq %zmm1,%zmm17,%zmm18
- vpmadd52huq %zmm1,%zmm17,%zmm19
- vpmadd52luq %zmm1,%zmm3,%zmm20
- vpmadd52huq %zmm1,%zmm3,%zmm21
- vpmadd52luq %zmm1,%zmm4,%zmm22
- vpmadd52huq %zmm1,%zmm4,%zmm23
-
-
-
- vpsrlq $44,%zmm18,%zmm30
- vpsllq $8,%zmm19,%zmm19
- vpandq %zmm28,%zmm18,%zmm0
- vpaddq %zmm30,%zmm19,%zmm19
-
- vpsrlq $24,%zmm27,%zmm26
- vporq %zmm31,%zmm26,%zmm26
- vpaddq %zmm19,%zmm20,%zmm20
-
- vpsrlq $44,%zmm20,%zmm30
- vpsllq $8,%zmm21,%zmm21
- vpandq %zmm28,%zmm20,%zmm1
- vpaddq %zmm30,%zmm21,%zmm21
-
- vpandq %zmm28,%zmm25,%zmm24
- vpsrlq $44,%zmm25,%zmm25
- vpsllq $20,%zmm27,%zmm27
- vpaddq %zmm21,%zmm22,%zmm22
-
- vpsrlq $42,%zmm22,%zmm30
- vpsllq $10,%zmm23,%zmm23
- vpandq %zmm29,%zmm22,%zmm2
- vpaddq %zmm30,%zmm23,%zmm23
-
- vpaddq %zmm26,%zmm2,%zmm2
- vpaddq %zmm23,%zmm0,%zmm0
- vpsllq $2,%zmm23,%zmm23
-
- vpaddq %zmm23,%zmm0,%zmm0
- vporq %zmm27,%zmm25,%zmm25
- vpandq %zmm28,%zmm25,%zmm25
-
- vpsrlq $44,%zmm0,%zmm30
- vpandq %zmm28,%zmm0,%zmm0
-
- vpaddq %zmm30,%zmm1,%zmm1
-
- subq $8,%rdx
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
-
- vpaddq %zmm24,%zmm0,%zmm0
- vpaddq %zmm25,%zmm1,%zmm1
-
- vpxorq %zmm18,%zmm18,%zmm18
- vpmadd52luq %zmm2,%zmm9,%zmm18
- vpxorq %zmm19,%zmm19,%zmm19
- vpmadd52huq %zmm2,%zmm9,%zmm19
- vpxorq %zmm20,%zmm20,%zmm20
- vpmadd52luq %zmm2,%zmm10,%zmm20
- vpxorq %zmm21,%zmm21,%zmm21
- vpmadd52huq %zmm2,%zmm10,%zmm21
- vpxorq %zmm22,%zmm22,%zmm22
- vpmadd52luq %zmm2,%zmm6,%zmm22
- vpxorq %zmm23,%zmm23,%zmm23
- vpmadd52huq %zmm2,%zmm6,%zmm23
-
- vpmadd52luq %zmm0,%zmm6,%zmm18
- vpmadd52huq %zmm0,%zmm6,%zmm19
- vpmadd52luq %zmm0,%zmm7,%zmm20
- vpmadd52huq %zmm0,%zmm7,%zmm21
- vpmadd52luq %zmm0,%zmm8,%zmm22
- vpmadd52huq %zmm0,%zmm8,%zmm23
-
- vpmadd52luq %zmm1,%zmm10,%zmm18
- vpmadd52huq %zmm1,%zmm10,%zmm19
- vpmadd52luq %zmm1,%zmm6,%zmm20
- vpmadd52huq %zmm1,%zmm6,%zmm21
- vpmadd52luq %zmm1,%zmm7,%zmm22
- vpmadd52huq %zmm1,%zmm7,%zmm23
-
-
-
-
- movl $1,%eax
- kmovw %eax,%k1
- vpsrldq $8,%zmm18,%zmm24
- vpsrldq $8,%zmm19,%zmm0
- vpsrldq $8,%zmm20,%zmm25
- vpsrldq $8,%zmm21,%zmm1
- vpaddq %zmm24,%zmm18,%zmm18
- vpaddq %zmm0,%zmm19,%zmm19
- vpsrldq $8,%zmm22,%zmm26
- vpsrldq $8,%zmm23,%zmm2
- vpaddq %zmm25,%zmm20,%zmm20
- vpaddq %zmm1,%zmm21,%zmm21
- vpermq $0x2,%zmm18,%zmm24
- vpermq $0x2,%zmm19,%zmm0
- vpaddq %zmm26,%zmm22,%zmm22
- vpaddq %zmm2,%zmm23,%zmm23
-
- vpermq $0x2,%zmm20,%zmm25
- vpermq $0x2,%zmm21,%zmm1
- vpaddq %zmm24,%zmm18,%zmm18
- vpaddq %zmm0,%zmm19,%zmm19
- vpermq $0x2,%zmm22,%zmm26
- vpermq $0x2,%zmm23,%zmm2
- vpaddq %zmm25,%zmm20,%zmm20
- vpaddq %zmm1,%zmm21,%zmm21
- vextracti64x4 $1,%zmm18,%ymm24
- vextracti64x4 $1,%zmm19,%ymm0
- vpaddq %zmm26,%zmm22,%zmm22
- vpaddq %zmm2,%zmm23,%zmm23
-
- vextracti64x4 $1,%zmm20,%ymm25
- vextracti64x4 $1,%zmm21,%ymm1
- vextracti64x4 $1,%zmm22,%ymm26
- vextracti64x4 $1,%zmm23,%ymm2
- vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
- vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
- vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
- vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
- vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
- vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
-
-
- vmovq %xmm0,0(%rdi)
- vmovq %xmm1,8(%rdi)
- vmovq %xmm2,16(%rdi)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-.type poly1305_emit_base2_44,@function
-.align 32
-poly1305_emit_base2_44:
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
-
- movq %r9,%rax
- shrq $20,%r9
- shlq $44,%rax
- movq %r10,%rcx
- shrq $40,%r10
- shlq $24,%rcx
-
- addq %rax,%r8
- adcq %rcx,%r9
- adcq $0,%r10
-
- movq %r8,%rax
- addq $5,%r8
- movq %r9,%rcx
- adcq $0,%r9
- adcq $0,%r10
- shrq $2,%r10
- cmovnzq %r8,%rax
- cmovnzq %r9,%rcx
-
- addq 0(%rdx),%rax
- adcq 8(%rdx),%rcx
- movq %rax,0(%rsi)
- movq %rcx,8(%rsi)
-
- .byte 0xf3,0xc3
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long 16777216,0,16777216,0,16777216,0,16777216,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+ leaq -8(%r10),%rsp

-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
+ ret

-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 16
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,@function
-.align 16
-xor128_encrypt_n_pad:
- subq %rdx,%rsi
- subq %rdx,%rdi
- movq %rcx,%r10
- shrq $4,%rcx
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu (%rsi,%rdx,1),%xmm0
- pxor (%rdx),%xmm0
- movdqu %xmm0,(%rdi,%rdx,1)
- movdqa %xmm0,(%rdx)
- leaq 16(%rdx),%rdx
- decq %rcx
- jnz .Loop_enc_xmm
-
- andq $15,%r10
- jz .Ldone_enc
-
-.Ltail_enc:
- movq $16,%rcx
- subq %r10,%rcx
- xorl %eax,%eax
-.Loop_enc_byte:
- movb (%rsi,%rdx,1),%al
- xorb (%rdx),%al
- movb %al,(%rdi,%rdx,1)
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %r10
- jnz .Loop_enc_byte
-
- xorl %eax,%eax
-.Loop_enc_pad:
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %rcx
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- movq %rdx,%rax
- .byte 0xf3,0xc3
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,@function
-.align 16
-xor128_decrypt_n_pad:
- subq %rdx,%rsi
- subq %rdx,%rdi
- movq %rcx,%r10
- shrq $4,%rcx
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu (%rsi,%rdx,1),%xmm0
- movdqa (%rdx),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,(%rdi,%rdx,1)
- movdqa %xmm0,(%rdx)
- leaq 16(%rdx),%rdx
- decq %rcx
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- andq $15,%r10
- jz .Ldone_dec
-
-.Ltail_dec:
- movq $16,%rcx
- subq %r10,%rcx
- xorl %eax,%eax
- xorq %r11,%r11
-.Loop_dec_byte:
- movb (%rsi,%rdx,1),%r11b
- movb (%rdx),%al
- xorb %r11b,%al
- movb %al,(%rdi,%rdx,1)
- movb %r11b,(%rdx)
- leaq 1(%rdx),%rdx
- decq %r10
- jnz .Loop_dec_byte
-
- xorl %eax,%eax
-.Loop_dec_pad:
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %rcx
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- movq %rdx,%rax
- .byte 0xf3,0xc3
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
index d4922d230c4f..0a580f1328fc 100644
--- a/lib/zinc/poly1305/poly1305.c
+++ b/lib/zinc/poly1305/poly1305.c
@@ -16,6 +16,9 @@
#include <linux/module.h>
#include <linux/init.h>

+#if defined(CONFIG_ZINC_ARCH_X86_64)
+#include "poly1305-x86_64-glue.c"
+#else
static inline bool poly1305_init_arch(void *ctx,
const u8 key[POLY1305_KEY_SIZE])
{
@@ -37,6 +40,7 @@ static bool *const poly1305_nobs[] __initconst = { };
static void __init poly1305_fpu_init(void)
{
}
+#endif

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#include "poly1305-donna64.c"
--
2.19.1