[RFC/RFT PATCH] crypto: arm64/chacha - optimize for arbitrary length inputs

From: Ard Biesheuvel
Date: Fri Nov 30 2018 - 15:46:08 EST


Update the 4-way NEON ChaCha routine so it can handle input of any
length >64 bytes in its entirety, rather than having to call into
the 1-way routine and/or do memcpy()s via temp buffers to handle the
tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt
benchmarks), performance drops by around 1% on Cortex-A57, while
performance for inputs drawn randomly from the range [64, 1024+64)
increases by around 30% (using ChaCha20). On Cortex-A72, performance
gains are similar. On Cortex-A53, performance improves but only by 5%.

Cc: Eric Biggers <ebiggers@xxxxxxxxxx>
Cc: Martin Willi <martin@xxxxxxxxxxxxxx>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
Test program after the patch.

arch/arm64/crypto/chacha-neon-core.S | 185 ++++++++++++++++++--
arch/arm64/crypto/chacha-neon-glue.c | 36 ++--
2 files changed, 188 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 75b4e06cee79..45ffc51cb437 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -19,6 +19,7 @@
*/

#include <linux/linkage.h>
+#include <asm/cache.h>

.text
.align 6
@@ -164,6 +165,7 @@ ENTRY(chacha_4block_xor_neon)
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
+ // x4: byte count

//
// This function encrypts four consecutive ChaCha blocks by loading
@@ -177,11 +179,11 @@ ENTRY(chacha_4block_xor_neon)
ld1 {v30.4s-v31.4s}, [x9]

// x0..15[0-3] = s0..3[0..3]
- mov x4, x0
- ld4r { v0.4s- v3.4s}, [x4], #16
- ld4r { v4.4s- v7.4s}, [x4], #16
- ld4r { v8.4s-v11.4s}, [x4], #16
- ld4r {v12.4s-v15.4s}, [x4]
+ mov x8, x0
+ ld4r { v0.4s- v3.4s}, [x8], #16
+ ld4r { v4.4s- v7.4s}, [x8], #16
+ ld4r { v8.4s-v11.4s}, [x8], #16
+ ld4r {v12.4s-v15.4s}, [x8]

// x12 += counter values 0-3
add v12.4s, v12.4s, v30.4s
@@ -425,24 +427,47 @@ ENTRY(chacha_4block_xor_neon)
zip1 v30.4s, v14.4s, v15.4s
zip2 v31.4s, v14.4s, v15.4s

+ mov x3, #64
+ subs x5, x4, #64
+ add x6, x5, x2
+ csel x3, x3, xzr, ge
+ csel x2, x2, x6, ge
+
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
- ld1 {v16.16b-v19.16b}, [x2], #64
+ ld1 {v16.16b-v19.16b}, [x2], x3
+
+ subs x6, x4, #128
+ ccmp x3, xzr, #4, lt
+ add x7, x6, x2
+ csel x3, x3, xzr, eq
+ csel x2, x2, x7, eq

zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
- ld1 {v20.16b-v23.16b}, [x2], #64
+ ld1 {v20.16b-v23.16b}, [x2], x3
+
+ subs x7, x4, #192
+ ccmp x3, xzr, #4, lt
+ add x8, x7, x2
+ csel x3, x3, xzr, eq
+ csel x2, x2, x8, eq

zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
- ld1 {v24.16b-v27.16b}, [x2], #64
+ ld1 {v24.16b-v27.16b}, [x2], x3
+
+ subs x8, x4, #256
+ ccmp x3, xzr, #4, lt
+ add x9, x8, x2
+ csel x2, x2, x9, eq

zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
@@ -451,29 +476,167 @@ ENTRY(chacha_4block_xor_neon)
ld1 {v28.16b-v31.16b}, [x2]

// xor with corresponding input, write to output
+ tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
+ st1 {v16.16b-v19.16b}, [x1], #64
+
+ tbnz x6, #63, 1f
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
- st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+
+ tbnz x7, #63, 2f
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
- st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
- eor v28.16b, v28.16b, v12.16b
st1 {v24.16b-v27.16b}, [x1], #64
+
+ tbnz x8, #63, 3f
+ eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
st1 {v28.16b-v31.16b}, [x1]

ret
+
+ // fewer than 64 bytes of in/output
+0: adr x12, .Lpermute
+ add x12, x12, x5
+ sub x2, x1, #64
+ add x1, x1, x5
+ add x13, x12, #64
+ ld1 {v8.16b}, [x12]
+ ld1 {v9.16b}, [x13]
+ movi v10.16b, #16
+
+ ld1 {v16.16b-v19.16b}, [x2]
+ tbl v4.16b, {v0.16b-v3.16b}, v8.16b
+ tbx v20.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v5.16b, {v0.16b-v3.16b}, v8.16b
+ tbx v21.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v6.16b, {v0.16b-v3.16b}, v8.16b
+ tbx v22.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v7.16b, {v0.16b-v3.16b}, v8.16b
+ tbx v23.16b, {v16.16b-v19.16b}, v9.16b
+
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v6.16b
+ eor v23.16b, v23.16b, v7.16b
+ st1 {v20.16b-v23.16b}, [x1]
+ ret
+
+ // fewer than 128 bytes of in/output
+1: adr x12, .Lpermute
+ add x12, x12, x6
+ add x1, x1, x6
+ add x13, x12, #64
+ ld1 {v8.16b}, [x12]
+ ld1 {v9.16b}, [x13]
+ movi v10.16b, #16
+ tbl v0.16b, {v4.16b-v7.16b}, v8.16b
+ tbx v20.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v1.16b, {v4.16b-v7.16b}, v8.16b
+ tbx v21.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v2.16b, {v4.16b-v7.16b}, v8.16b
+ tbx v22.16b, {v16.16b-v19.16b}, v9.16b
+ add v8.16b, v8.16b, v10.16b
+ add v9.16b, v9.16b, v10.16b
+ tbl v3.16b, {v4.16b-v7.16b}, v8.16b
+ tbx v23.16b, {v16.16b-v19.16b}, v9.16b
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v1.16b
+ eor v22.16b, v22.16b, v2.16b
+ eor v23.16b, v23.16b, v3.16b
+ st1 {v20.16b-v23.16b}, [x1]
+ ret
+
+ // fewer than 192 bytes of in/output
+2: adr x12, .Lpermute
+ add x12, x12, x7
+ add x1, x1, x7
+ add x13, x12, #64
+ ld1 {v4.16b}, [x12]
+ ld1 {v5.16b}, [x13]
+ movi v6.16b, #16
+ tbl v0.16b, {v8.16b-v11.16b}, v4.16b
+ tbx v24.16b, {v20.16b-v23.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v1.16b, {v8.16b-v11.16b}, v4.16b
+ tbx v25.16b, {v20.16b-v23.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v4.16b
+ tbx v26.16b, {v20.16b-v23.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v4.16b
+ tbx v27.16b, {v20.16b-v23.16b}, v5.16b
+
+ eor v24.16b, v24.16b, v0.16b
+ eor v25.16b, v25.16b, v1.16b
+ eor v26.16b, v26.16b, v2.16b
+ eor v27.16b, v27.16b, v3.16b
+ st1 {v24.16b-v27.16b}, [x1]
+ ret
+
+ // fewer than 256 bytes of in/output
+3: adr x12, .Lpermute
+ add x12, x12, x8
+ add x1, x1, x8
+ add x13, x12, #64
+ ld1 {v4.16b}, [x12]
+ ld1 {v5.16b}, [x13]
+ movi v6.16b, #16
+ tbl v0.16b, {v12.16b-v15.16b}, v4.16b
+ tbx v28.16b, {v24.16b-v27.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v4.16b
+ tbx v29.16b, {v24.16b-v27.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v4.16b
+ tbx v30.16b, {v24.16b-v27.16b}, v5.16b
+ add v4.16b, v4.16b, v6.16b
+ add v5.16b, v5.16b, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v4.16b
+ tbx v31.16b, {v24.16b-v27.16b}, v5.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x1]
+ ret
ENDPROC(chacha_4block_xor_neon)

CTRINC: .word 0, 1, 2, 3
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
+
+ .align L1_CACHE_SHIFT
+ .set .Lpermute, . + 64
+ .set .Li, 0
+ .rept 192
+ .byte (.Li - 64)
+ .set .Li, .Li + 1
+ .endr
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 346eb85498a1..458d9b36cf9d 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -32,41 +32,33 @@
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);

static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+ int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];

- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ if (bytes < CHACHA_BLOCK_SIZE) {
+ memcpy(buf, src, bytes);
kernel_neon_begin();
- chacha_4block_xor_neon(state, dst, src, nrounds);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ kernel_neon_end();
+ memcpy(dst, buf, bytes);
+ return;
+ }
+
+ while (bytes > 0) {
+ kernel_neon_begin();
+ chacha_4block_xor_neon(state, dst, src, nrounds,
+ min(bytes, CHACHA_BLOCK_SIZE * 4));
kernel_neon_end();
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
-
- if (!bytes)
- return;
-
- kernel_neon_begin();
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
- }
- kernel_neon_end();
}

static int chacha_neon_stream_xor(struct skcipher_request *req,
--
2.19.1


#include <stdlib.h>
#include <string.h>

extern void chacha_4block_xor_neon(unsigned int *state, unsigned char *dst,
unsigned char *src, int rounds, int bytes);

extern void chacha_block_xor_neon(unsigned int *state, unsigned char *dst,
unsigned char *src, int rounds);

int main(void)
{
static char buf[1024];
unsigned int state[64];

srand(20181130);

for (int i = 0; i < 10 * 1000 * 1000; i++) {
int l = 64 + rand() % (1024 - 64);

#ifdef NEW
while (l > 0) {
chacha_4block_xor_neon(state, buf, buf, 20,
l > 256 ? 256 : l);
l -= 256;
}
#else
while (l >= 256) {
chacha_4block_xor_neon(state, buf, buf, 20, 256);
l -= 256;
}
while (l >= 64) {
chacha_block_xor_neon(state, buf, buf, 20);
l -= 64;
}
if (l > 0) {
unsigned char tmp[64];

memcpy(tmp, buf, l);
chacha_block_xor_neon(state, tmp, tmp, 20);
memcpy(buf, tmp, l);
}
#endif
}

return 0;
}