[PATCH net-next v7 15/28] zinc: Poly1305 ARM and ARM64 implementations

From: Jason A. Donenfeld
Date: Fri Oct 05 2018 - 22:58:30 EST


These wire Andy Polyakov's implementations up to the kernel. We make a
few small changes to the assembly:

- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The NEON code can jump to the scalar code when it makes sense to do
so.

The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
and then having to go back to scalar -- because the user is silly and has
called the update function from two separate contexts -- then we need to
convert back to the original base before proceeding. It is possible to
reason that the initial reduction performed by that conversion is sufficient
given the implementation invariants. However, for the avoidance of doubt and
because this is not performance critical, we do the full reduction anyway.
This conversion is found in the glue code, and a proof of correctness may be
easily obtained from Z3: <https://xn--4db.cc/ltPtHCKN/py>.

Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
Cc: Russell King <linux@xxxxxxxxxxxxxxx>
Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
Cc: Samuel Neves <sneves@xxxxxxxxx>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@xxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: kernel-hardening@xxxxxxxxxxxxxxxxxx
Cc: linux-crypto@xxxxxxxxxxxxxxx
---
lib/zinc/Makefile | 2 +
lib/zinc/poly1305/poly1305-arm-glue.c | 140 +++++++++++++++++
...ly1305-arm-cryptogams.S => poly1305-arm.S} | 147 ++++++------------
...05-arm64-cryptogams.S => poly1305-arm64.S} | 127 +++++----------
lib/zinc/poly1305/poly1305.c | 2 +
5 files changed, 231 insertions(+), 187 deletions(-)
create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.c
rename lib/zinc/poly1305/{poly1305-arm-cryptogams.S => poly1305-arm.S} (91%)
rename lib/zinc/poly1305/{poly1305-arm64-cryptogams.S => poly1305-arm64.S} (89%)

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index a8943d960b6a..c09fd3de60f9 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -12,4 +12,6 @@ obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o

zinc_poly1305-y := poly1305/poly1305.o
zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM) += poly1305/poly1305-arm.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM64) += poly1305/poly1305-arm64.o
obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
diff --git a/lib/zinc/poly1305/poly1305-arm-glue.c b/lib/zinc/poly1305/poly1305-arm-glue.c
new file mode 100644
index 000000000000..f4f08ecffbf6
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-glue.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+/*
+ * Scalar entry points, implemented in poly1305-arm.S (32-bit) and
+ * poly1305-arm64.S (64-bit). ctx is the opaque state that this file
+ * interprets as struct poly1305_arch_internal.
+ */
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp,
+				    const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16],
+				  const u32 nonce[4]);
+
+/*
+ * NEON entry points. The assembly only provides these when
+ * CONFIG_KERNEL_MODE_NEON is set, so every call must be guarded by
+ * IS_ENABLED(CONFIG_KERNEL_MODE_NEON).
+ */
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp,
+				     const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16],
+				   const u32 nonce[4]);
+
+/* Written once by poly1305_fpu_init(); true when the CPU reports NEON/ASIMD. */
+static bool poly1305_use_neon __ro_after_init;
+
+/*
+ * Init-time table of pointers to the feature flags above.
+ * NOTE(review): not referenced in this file; presumably consumed by the
+ * file that includes this glue code -- confirm.
+ */
+static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon };
+
+/*
+ * Record whether the SIMD unit is available, based on the ELF hwcaps:
+ * HWCAP_ASIMD on arm64, HWCAP_NEON on 32-bit ARM.
+ */
+static void __init poly1305_fpu_init(void)
+{
+#if defined(CONFIG_ZINC_ARCH_ARM64)
+	poly1305_use_neon = (elf_hwcap & HWCAP_ASIMD) != 0;
+#elif defined(CONFIG_ZINC_ARCH_ARM)
+	poly1305_use_neon = (elf_hwcap & HWCAP_NEON) != 0;
+#endif
+}
+
+#if defined(CONFIG_ZINC_ARCH_ARM64)
+struct poly1305_arch_internal { /* ctx layout shared with poly1305-arm64.S; offsets are hard-coded there */
+ union {
+ u32 h[5]; /* accumulator viewed as five 32-bit limbs (the base 2^26 form) */
+ struct {
+ u64 h0, h1, h2; /* accumulator viewed as 64-bit words (the base 2^64 form) */
+ };
+ };
+ u64 is_base2_26; /* non-zero while the accumulator is in base 2^26; asm loads it from offset 24 */
+ u64 r[2]; /* masked key words; poly1305_init_arm stores them at offset 32 */
+};
+#elif defined(CONFIG_ZINC_ARCH_ARM)
+struct poly1305_arch_internal { /* ctx layout shared with poly1305-arm.S; offsets are hard-coded there */
+ union {
+ u32 h[5]; /* accumulator viewed as five 32-bit words */
+ struct {
+ u64 h0, h1;
+ u32 h2;
+ } __packed; /* 64-bit view of the same 20 bytes, used by convert_to_base2_64() */
+ };
+ u32 r[4]; /* key words; asm loads them starting at offset 20 */
+ u32 is_base2_26; /* non-zero while the accumulator is in base 2^26; asm loads it from offset 36 */
+};
+#endif
+
+/*
+ * The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
+ * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
+ * and then having to go back to scalar -- because the user is silly and has
+ * called the update function from two separate contexts -- then we need to
+ * convert back to the original base before proceeding. The below function is
+ * written for 64-bit integers, and so we have to swap words at the end on
+ * big-endian 32-bit. It is possible to reason that the initial reduction below
+ * is sufficient given the implementation invariants. However, for the
+ * avoidance of doubt and because this is not performance critical, we do the
+ * full reduction anyway.
+ *
+ * @ctx: opaque state, interpreted as struct poly1305_arch_internal.
+ *
+ * Does nothing unless CONFIG_KERNEL_MODE_NEON is enabled and the state is
+ * flagged as base 2^26. On return the flag is cleared.
+ */
+static void convert_to_base2_64(void *ctx)
+{
+	struct poly1305_arch_internal *state = ctx;
+	u32 cy;
+
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26)
+		return;
+
+	/* Initial reduction: carry-propagate so h[0..3] each fit in 26 bits. */
+	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+
+	/*
+	 * Repack the five limbs into h2:h1:h0. h0/h1/h2 alias h[] through the
+	 * union, so each statement may only read limbs that the previous
+	 * stores have not overwritten yet. Do not reorder these three lines.
+	 */
+	state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+	state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+	state->h2 = state->h[4] >> 24;
+
+	/*
+	 * Full reduction: fold everything at or above bit 130 back into the
+	 * bottom, using 2^130 = 5 (mod 2^130 - 5). ULT(a, b) is a branch-free
+	 * unsigned "a < b", used to detect that the preceding addition wrapped.
+	 * This must operate on the natural 64-bit values, i.e. before any
+	 * word swapping.
+	 */
+#define ULT(a, b) \
+	(((a) ^ (((a) ^ (b)) | (((a) - (b)) ^ (b)))) >> (sizeof(a) * 8 - 1))
+	cy = (state->h2 >> 2) + (state->h2 & ~3ULL);	/* 5 * (h2 >> 2) */
+	state->h2 &= 3;
+	state->h0 += cy;
+	state->h1 += (cy = ULT(state->h0, cy));
+	state->h2 += ULT(state->h1, cy);
+#undef ULT
+
+	/*
+	 * The 32-bit scalar assembly accesses the accumulator as 32-bit words
+	 * with the least significant word first. A big-endian 64-bit store
+	 * puts the most significant word first, so swap the halves -- last,
+	 * once all 64-bit arithmetic is done.
+	 */
+	if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) {
+		state->h0 = rol64(state->h0, 32);
+		state->h1 = rol64(state->h1, 32);
+	}
+
+	state->is_base2_26 = 0;
+}
+
+/*
+ * Key setup: hand @key to the assembly init routine, which is shared by the
+ * scalar and NEON paths. Always returns true.
+ */
+static inline bool poly1305_init_arch(void *ctx,
+				      const u8 key[POLY1305_KEY_SIZE])
+{
+	poly1305_init_arm(ctx, key);
+
+	return true;
+}
+
+/*
+ * Absorb @len bytes from @inp into @ctx; @padbit is passed through to the
+ * assembly unchanged. The NEON routine is used when it is built in, the CPU
+ * reports it and simd_use() permits it; otherwise the state is converted
+ * back to the scalar representation and the scalar routine is used.
+ * Always returns true.
+ */
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+					size_t len, const u32 padbit,
+					simd_context_t *simd_context)
+{
+	/* SIMD disables preemption, so relax after processing each page. */
+	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && poly1305_use_neon &&
+	    simd_use(simd_context)) {
+		size_t chunk;
+
+		/* Feed at most one page per call, relaxing between chunks. */
+		do {
+			chunk = min_t(size_t, len, PAGE_SIZE);
+			poly1305_blocks_neon(ctx, inp, chunk, padbit);
+			len -= chunk;
+			if (len) {
+				inp += chunk;
+				simd_relax(simd_context);
+			}
+		} while (len);
+
+		return true;
+	}
+
+	/* Scalar fallback: the accumulator may still be in base 2^26. */
+	convert_to_base2_64(ctx);
+	poly1305_blocks_arm(ctx, inp, len, padbit);
+	return true;
+}
+
+/*
+ * Produce the final tag in @mac from @ctx and @nonce. Uses the NEON emit
+ * routine under the same conditions as poly1305_blocks_arch(); otherwise
+ * converts the accumulator back to the scalar representation first and
+ * uses the scalar emit routine. Always returns true.
+ */
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				      const u32 nonce[4],
+				      simd_context_t *simd_context)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && poly1305_use_neon &&
+	    simd_use(simd_context)) {
+		poly1305_emit_neon(ctx, mac, nonce);
+		return true;
+	}
+
+	convert_to_base2_64(ctx);
+	poly1305_emit_arm(ctx, mac, nonce);
+	return true;
+}
diff --git a/lib/zinc/poly1305/poly1305-arm-cryptogams.S b/lib/zinc/poly1305/poly1305-arm.S
similarity index 91%
rename from lib/zinc/poly1305/poly1305-arm-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm.S
index 884b465030e4..4a0e9d451119 100644
--- a/lib/zinc/poly1305/poly1305-arm-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm.S
@@ -1,9 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/

-#include "arm_arch.h"
+#include <linux/linkage.h>

.text
#if defined(__thumb2__)
@@ -13,13 +16,8 @@
.code 32
#endif

-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
.align 5
-poly1305_init:
-.Lpoly1305_init:
+ENTRY(poly1305_init_arm)
stmdb sp!,{r4-r11}

eor r3,r3,r3
@@ -38,10 +36,6 @@ poly1305_init:
moveq r0,#0
beq .Lno_key

-#if __ARM_MAX_ARCH__>=7
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-#endif
ldrb r4,[r1,#0]
mov r10,#0x0fffffff
ldrb r5,[r1,#1]
@@ -56,12 +50,6 @@ poly1305_init:
ldrb r7,[r1,#6]
and r4,r4,r10

-#if __ARM_MAX_ARCH__>=7
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
- ldr r12,[r12]
-# endif
-#endif
ldrb r8,[r1,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[r1,#8]
@@ -71,35 +59,6 @@ poly1305_init:
ldrb r8,[r1,#10]
and r5,r5,r3

-#if __ARM_MAX_ARCH__>=7
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
- movne r12,r10
-# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
-#endif
ldrb r9,[r1,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[r1,#12]
@@ -118,26 +77,20 @@ poly1305_init:
str r6,[r0,#8]
and r7,r7,r3
str r7,[r0,#12]
-#if __ARM_MAX_ARCH__>=7
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
.Lno_key:
ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_init,.-poly1305_init
-.type poly1305_blocks,%function
+ENDPROC(poly1305_init_arm)
+
.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
stmdb sp!,{r3-r11,lr}

ands r2,r2,#-16
@@ -158,11 +111,11 @@ poly1305_blocks:
b .Loop

.Loop:
-#if __ARM_ARCH__<7
+#if __LINUX_ARM_ARCH__ < 7
ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
addhi r8,r8,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
@@ -201,19 +154,19 @@ poly1305_blocks:
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
addhi r8,r8,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
+#endif
adds r4,r4,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs r5,r5,r1
@@ -283,7 +236,7 @@ poly1305_blocks:
stmia r0,{r4-r8} @ store the result

.Lno_data:
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
ldmia sp!,{r3-r11,pc}
#else
ldmia sp!,{r3-r11,lr}
@@ -291,13 +244,12 @@ poly1305_blocks:
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_blocks,.-poly1305_blocks
-.type poly1305_emit,%function
+ENDPROC(poly1305_blocks_arm)
+
.align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
-
ldmia r0,{r3-r7}
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
@@ -332,13 +284,13 @@ poly1305_emit:
adcs r5,r5,r10
adc r6,r6,r11

-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
-# endif
+#endif
str r3,[r1,#0]
str r4,[r1,#4]
str r5,[r1,#8]
@@ -377,20 +329,22 @@ poly1305_emit:
strb r6,[r1,#15]
#endif
ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_emit,.-poly1305_emit
-#if __ARM_MAX_ARCH__>=7
+ENDPROC(poly1305_emit_arm)
+
+
+#ifdef CONFIG_KERNEL_MODE_NEON
.fpu neon

-.type poly1305_init_neon,%function
.align 5
-poly1305_init_neon:
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
ldr r4,[r0,#20] @ load key base 2^32
ldr r5,[r0,#24]
ldr r6,[r0,#28]
@@ -600,11 +554,10 @@ poly1305_init_neon:
vst1.32 {d8[1]},[r7]

bx lr @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
+ENDPROC(poly1305_init_neon)

-.type poly1305_blocks_neon,%function
.align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr ip,[r0,#36] @ is_base2_26
ands r2,r2,#-16
beq .Lno_data_neon
@@ -612,7 +565,7 @@ poly1305_blocks_neon:
cmp r2,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks
+ beq .Lpoly1305_blocks_arm

.Lenter_neon:
stmdb sp!,{r4-r7}
@@ -622,7 +575,7 @@ poly1305_blocks_neon:
bne .Lbase2_26_neon

stmdb sp!,{r1-r3,lr}
- bl poly1305_init_neon
+ bl .Lpoly1305_init_neon

ldr r4,[r0,#0] @ load hash value base 2^32
ldr r5,[r0,#4]
@@ -686,12 +639,12 @@ poly1305_blocks_neon:
sub r2,r2,#16
add r4,r1,#32

-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
-# endif
+#endif
vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
vshl.u32 d26,d26,#18

@@ -735,12 +688,12 @@ poly1305_blocks_neon:
addhi r7,r0,#(48+1*9*4)
addhi r6,r0,#(48+3*9*4)

-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
-# endif
+#endif
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vshl.u32 q13,q13,#18

@@ -866,12 +819,12 @@ poly1305_blocks_neon:

vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q11,q11
vrev32.8 q12,q12
vrev32.8 q13,q13
-# endif
+#endif

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@@ -1086,11 +1039,10 @@ poly1305_blocks_neon:
ldmia sp!,{r4-r7}
.Lno_data_neon:
bx lr @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)

-.type poly1305_emit_neon,%function
.align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr ip,[r0,#36] @ is_base2_26

stmdb sp!,{r4-r11}
@@ -1144,12 +1096,12 @@ poly1305_emit_neon:
adcs r5,r5,r10
adc r6,r6,r11

-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
-# endif
+#endif
str r3,[r1,#0] @ store the result
str r4,[r1,#4]
str r5,[r1,#8]
@@ -1157,16 +1109,9 @@ poly1305_emit_neon:

ldmia sp!,{r4-r11}
bx lr @ bx lr
-.size poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lpoly1305_init
-#endif
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@xxxxxxxxxxx>"
-.align 2
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
#endif
diff --git a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S b/lib/zinc/poly1305/poly1305-arm64.S
similarity index 89%
rename from lib/zinc/poly1305/poly1305-arm64-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm64.S
index 0ecb50a83ec0..5f4e7fb0a836 100644
--- a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm64.S
@@ -1,21 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@xxxxxxxxxxx>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/

-#include "arm_arch.h"
-
+#include <linux/linkage.h>
.text

-// forward "declarations" are required for Apple
-
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
.align 5
-poly1305_init:
+ENTRY(poly1305_init_arm)
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
@@ -23,18 +18,10 @@ poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key

-#ifdef __ILP32__
- ldrsw x11,.LOPENSSL_armcap_P
-#else
- ldr x11,.LOPENSSL_armcap_P
-#endif
- adr x10,.LOPENSSL_armcap_P
-
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
- ldr w17,[x10,x11]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
@@ -43,30 +30,12 @@ poly1305_init:
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value

- tst w17,#ARMV7_NEON
-
- adr x12,poly1305_blocks
- adr x7,poly1305_blocks_neon
- adr x13,poly1305_emit
- adr x8,poly1305_emit_neon
-
- csel x12,x12,x7,eq
- csel x13,x13,x8,eq
-
-#ifdef __ILP32__
- stp w12,w13,[x2]
-#else
- stp x12,x13,[x2]
-#endif
-
- mov x0,#1
.Lno_key:
ret
-.size poly1305_init,.-poly1305_init
+ENDPROC(poly1305_init_arm)

-.type poly1305_blocks,%function
.align 5
-poly1305_blocks:
+ENTRY(poly1305_blocks_arm)
ands x2,x2,#-16
b.eq .Lno_data

@@ -80,7 +49,7 @@ poly1305_blocks:
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x10,x10
rev x11,x11
#endif
@@ -126,11 +95,10 @@ poly1305_blocks:

.Lno_data:
ret
-.size poly1305_blocks,.-poly1305_blocks
+ENDPROC(poly1305_blocks_arm)

-.type poly1305_emit,%function
.align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce
@@ -144,23 +112,23 @@ poly1305_emit:
csel x4,x4,x12,eq
csel x5,x5,x13,eq

-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
-.size poly1305_emit,.-poly1305_emit
-.type poly1305_mult,%function
+ENDPROC(poly1305_emit_arm)
+
.align 5
-poly1305_mult:
+__poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7

@@ -193,11 +161,8 @@ poly1305_mult:
adc x6,x6,xzr

ret
-.size poly1305_mult,.-poly1305_mult

-.type poly1305_splat,%function
-.align 5
-poly1305_splat:
+__poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
@@ -220,15 +185,14 @@ poly1305_splat:
str w15,[x0,#16*8] // s4

ret
-.size poly1305_splat,.-poly1305_splat

-.type poly1305_blocks_neon,%function
+#ifdef CONFIG_KERNEL_MODE_NEON
.align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
- cbz x17,poly1305_blocks
+ cbz x17,poly1305_blocks_arm

.Lblocks_neon:
stp x29,x30,[sp,#-80]!
@@ -268,7 +232,7 @@ poly1305_blocks_neon:
adcs x5,x5,xzr
adc x6,x6,xzr

-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
@@ -276,7 +240,7 @@ poly1305_blocks_neon:
adcs x5,x5,x13
adc x6,x6,x3

- bl poly1305_mult
+ bl __poly1305_mult
ldr x30,[sp,#8]

cbz x3,.Lstore_base2_64_neon
@@ -314,7 +278,7 @@ poly1305_blocks_neon:
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
@@ -322,7 +286,7 @@ poly1305_blocks_neon:
adcs x5,x5,x13
adc x6,x6,x3

- bl poly1305_mult
+ bl __poly1305_mult

.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
@@ -349,19 +313,19 @@ poly1305_blocks_neon:
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
- bl poly1305_splat
+ bl __poly1305_splat

- bl poly1305_mult // r^2
+ bl __poly1305_mult // r^2
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat

- bl poly1305_mult // r^3
+ bl __poly1305_mult // r^3
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat

- bl poly1305_mult // r^4
+ bl __poly1305_mult // r^4
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
ldr x30,[sp,#8]

add x16,x1,#32
@@ -399,7 +363,7 @@ poly1305_blocks_neon:
lsl x3,x3,#24
add x15,x0,#48

-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -435,7 +399,7 @@ poly1305_blocks_neon:
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]

-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -496,7 +460,7 @@ poly1305_blocks_neon:
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -561,7 +525,7 @@ poly1305_blocks_neon:
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -801,13 +765,12 @@ poly1305_blocks_neon:
.Lno_data_neon:
ldr x29,[sp],#80
ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)

-.type poly1305_emit_neon,%function
.align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr x17,[x0,#24]
- cbz x17,poly1305_emit
+ cbz x17,poly1305_emit_arm

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
@@ -840,30 +803,22 @@ poly1305_emit_neon:
csel x4,x4,x12,eq
csel x5,x5,x13,eq

-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
-.size poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
#endif
-.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
index 51af7045cac8..9dc85f62e806 100644
--- a/lib/zinc/poly1305/poly1305.c
+++ b/lib/zinc/poly1305/poly1305.c
@@ -18,6 +18,8 @@

#if defined(CONFIG_ZINC_ARCH_X86_64)
#include "poly1305-x86_64-glue.c"
+#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
+#include "poly1305-arm-glue.c"
#else
static inline bool poly1305_init_arch(void *ctx,
const u8 key[POLY1305_KEY_SIZE])
--
2.19.0