[PATCH v3] arm64/crypto: Accelerated CRC T10 DIF computation

From: YueHaibing
Date: Tue Nov 22 2016 - 05:21:59 EST


This is the ARM64 CRC T10 DIF transform accelerated with the ARMv8
NEON instruction.The config CRYPTO_CRCT10DIF_NEON should be turned
on to enable the feature.The crc_t10dif crypto library function will
use this faster algorithm when crct10dif_neon module is loaded.

Tcrypt benchmark results:

HIP06 (mode=320 sec=2)

The ratio of bytes/sec crct10dif-neon Vs. crct10dif-generic:

TEST neon generic ratio
16 byte blocks, 16 bytes per update, 1 updates 214506112 171095400 1.25
64 byte blocks, 16 bytes per update, 4 updates 139385312 119036352 1.17
64 byte blocks, 64 bytes per update, 1 updates 671523712 198945344 3.38
256 byte blocks, 16 bytes per update, 16 updates 157674880 125146752 1.26
256 byte blocks, 64 bytes per update, 4 updates 491888128 175764096 2.80
256 byte blocks, 256 bytes per update, 1 updates 2123298176 206995200 10.26
1024 byte blocks, 16 bytes per update, 64 updates 161243136 126460416 1.28
1024 byte blocks, 256 bytes per update, 4 updates 1643020800 200027136 8.21
1024 byte blocks, 1024 bytes per update, 1 updates 4238239232 209106432 20.27
2048 byte blocks, 16 bytes per update, 128 updates 162079744 126953472 1.28
2048 byte blocks, 256 bytes per update, 8 updates 1693587456 200867840 8.43
2048 byte blocks, 1024 bytes per update, 2 updates 3424323584 206330880 16.60
2048 byte blocks, 2048 bytes per update, 1 updates 5228207104 208620544 25.06
4096 byte blocks, 16 bytes per update, 256 updates 162304000 126894080 1.28
4096 byte blocks, 256 bytes per update, 16 updates 1731862528 201197568 8.61
4096 byte blocks, 1024 bytes per update, 4 updates 3668625408 207003648 17.72
4096 byte blocks, 4096 bytes per update, 1 updates 5551239168 209127424 26.54
8192 byte blocks, 16 bytes per update, 512 updates 162779136 126984192 1.28
8192 byte blocks, 256 bytes per update, 32 updates 1753702400 201420800 8.71
8192 byte blocks, 1024 bytes per update, 8 updates 3760918528 207351808 18.14
8192 byte blocks, 4096 bytes per update, 2 updates 5483655168 208928768 26.25
8192 byte blocks, 8192 bytes per update, 1 updates 5623377920 209108992 26.89

Signed-off-by: YueHaibing <yuehaibing@xxxxxxxxxx>
Signed-off-by: YangShengkai <yangshengkai@xxxxxxxxxx>
Signed-off-by: Ding Tianhong <dingtianhong@xxxxxxxxxx>
Signed-off-by: Hanjun Guo <hanjun.guo@xxxxxxxxxx>

---
arch/arm64/crypto/Kconfig | 5 +
arch/arm64/crypto/Makefile | 4 +
arch/arm64/crypto/crct10dif-neon-asm_64.S | 751 ++++++++++++++++++++++++++++++
arch/arm64/crypto/crct10dif-neon_glue.c | 115 +++++
4 files changed, 875 insertions(+)
create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S
create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2cf32e9..2e450bf 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_HASH

+config CRYPTO_CRCT10DIF_NEON
+ tristate "CRCT10DIF hardware acceleration using NEON instructions"
+ depends on ARM64 && KERNEL_MODE_NEON
+ select CRYPTO_HASH
+
config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index abb79b3..6c9ff2c 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -29,6 +29,10 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o

+obj-$(CONFIG_CRYPTO_CRCT10DIF_NEON) += crct10dif-neon.o
+crct10dif-neon-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o
+AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
+
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4

diff --git a/arch/arm64/crypto/crct10dif-neon-asm_64.S b/arch/arm64/crypto/crct10dif-neon-asm_64.S
new file mode 100644
index 0000000..2ae3033
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-neon-asm_64.S
@@ -0,0 +1,751 @@
+/*
+ * Copyright (c) 2016-2017 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.global crc_t10dif_neon
+.text
+
+/* X0 is initial CRC value
+ * X1 is data buffer
+ * X2 is the length of buffer
+ * X3 is the backup buffer(for extend)
+ * X4 for other extend parameter(for extend)
+ * Q0, Q1, Q2, Q3 maybe as parameter for other functions,
+ * the value of Q0, Q1, Q2, Q3 maybe modified.
+ *
+ * suggestion:
+ * 1. dont use general purpose register for calculation
+ * 2. set data endianness outside of the kernel
+ * 3. use ext as shifting around
+ * 4. dont use LD3/LD4, ST3/ST4
+ */
+
+crc_t10dif_neon:
+ /* push the register to stack that CRC16 will use */
+ STP X5, X6, [sp, #-0x10]!
+ STP X7, X8, [sp, #-0x10]!
+ STP X9, X10, [sp, #-0x10]!
+ STP X11, X12, [sp, #-0x10]!
+ STP X13, X14, [sp, #-0x10]!
+ STP Q10, Q11, [sp, #-0x20]!
+ STP Q12, Q13, [sp, #-0x20]!
+ STP Q4, Q5, [sp, #-0x20]!
+ STP Q6, Q7, [sp, #-0x20]!
+ STP Q8, Q9, [sp, #-0x20]!
+ STP Q14, Q15, [sp, #-0x20]!
+ STP Q16, Q17, [sp, #-0x20]!
+ STP Q18, Q19, [sp, #-0x20]!
+
+ SUB sp,sp,#0x20
+
+ MOV X11, #0 // PUSH STACK FLAG
+
+ CMP X2, #0x80
+ B.LT 2f // _less_than_128, <128
+
+ /* V10/V11/V12/V13 is 128bit.
+ * we get data 512bit( by cacheline ) each time
+ */
+ LDP Q10, Q11, [X1], #0x20
+ LDP Q12, Q13, [X1], #0x20
+
+ /* move the initial value to V6 register */
+ LSL X0, X0, #48
+ EOR V6.16B, V6.16B, V6.16B
+ MOV V6.D[1], X0
+
+ /* big-little end change. because the data in memory is little-end,
+ * we deal the data for bigend
+ */
+
+ REV64 V10.16B, V10.16B
+ REV64 V11.16B, V11.16B
+ REV64 V12.16B, V12.16B
+ REV64 V13.16B, V13.16B
+ EXT V10.16B, V10.16B, V10.16B, #8
+ EXT V11.16B, V11.16B, V11.16B, #8
+ EXT V12.16B, V12.16B, V12.16B, #8
+ EXT V13.16B, V13.16B, V13.16B, #8
+
+ EOR V10.16B, V10.16B, V6.16B
+
+ SUB X2, X2, #0x80
+ ADD X5, X1, #0x20
+
+ /* deal data when the size of buffer bigger than 128 bytes */
+ /* _fold_64_B_loop */
+ LDR Q6,=0xe658000000000000044c000000000000
+1:
+
+ LDP Q16, Q17, [X1] ,#0x40
+ LDP Q18, Q19, [X5], #0x40
+
+ /* carry-less multiply.
+ * V10 high-64bits carry-less multiply
+ * V6 high-64bits(PMULL2)
+ * V11 low-64bits carry-less multiply V6 low-64bits(PMULL)
+ */
+
+ PMULL2 V4.1Q, V10.2D, V6.2D
+ PMULL V10.1Q, V10.1D, V6.1D
+ PMULL2 V5.1Q, V11.2D, V6.2D
+ PMULL V11.1Q, V11.1D, V6.1D
+
+ REV64 V16.16B, V16.16B
+ REV64 V17.16B, V17.16B
+ REV64 V18.16B, V18.16B
+ REV64 V19.16B, V19.16B
+
+ PMULL2 V14.1Q, V12.2D, V6.2D
+ PMULL V12.1Q, V12.1D, V6.1D
+ PMULL2 V15.1Q, V13.2D, V6.2D
+ PMULL V13.1Q, V13.1D, V6.1D
+
+ EXT V16.16B, V16.16B, V16.16B, #8
+ EOR V10.16B, V10.16B, V4.16B
+
+ EXT V17.16B, V17.16B, V17.16B, #8
+ EOR V11.16B, V11.16B, V5.16B
+
+ EXT V18.16B, V18.16B, V18.16B, #8
+ EOR V12.16B, V12.16B, V14.16B
+
+ EXT V19.16B, V19.16B, V19.16B, #8
+ EOR V13.16B, V13.16B, V15.16B
+
+ SUB X2, X2, #0x40
+
+
+ EOR V10.16B, V10.16B, V16.16B
+ EOR V11.16B, V11.16B, V17.16B
+
+ EOR V12.16B, V12.16B, V18.16B
+ EOR V13.16B, V13.16B, V19.16B
+
+ CMP X2, #0x0
+ B.GE 1b // >=0
+
+ LDR Q6, =0x06df0000000000002d56000000000000
+ MOV V4.16B, V10.16B
+ /* V10 carry-less 0x06df000000000000([127:64]*[127:64]) */
+ PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order
+ PMULL2 V10.1Q, V10.2D, V6.2D
+ EOR V11.16B, V11.16B, V4.16B
+ EOR V11.16B, V11.16B, V10.16B
+
+ MOV V4.16B, V11.16B
+ PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order
+ PMULL2 V11.1Q, V11.2D, V6.2D
+ EOR V12.16B, V12.16B, V4.16B
+ EOR V12.16B, V12.16B, V11.16B
+
+ MOV V4.16B, V12.16B
+ PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order
+ PMULL2 V12.1Q, V12.2D, V6.2D
+ EOR V13.16B, V13.16B, V4.16B
+ EOR V13.16B, V13.16B, V12.16B
+
+ ADD X2, X2, #48
+ CMP X2, #0x0
+ B.LT 3f // _final_reduction_for_128, <0
+
+ /* _16B_reduction_loop */
+4:
+ /* unrelated load as early as possible*/
+ LDR Q10, [X1], #0x10
+
+ MOV V4.16B, V13.16B
+ PMULL2 V13.1Q, V13.2D, V6.2D
+ PMULL V4.1Q, V4.1D, V6.1D
+ EOR V13.16B, V13.16B, V4.16B
+
+ REV64 V10.16B, V10.16B
+ EXT V10.16B, V10.16B, V10.16B, #8
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ SUB X2, X2, #0x10
+ CMP X2, #0x0
+ B.GE 4b // _16B_reduction_loop, >=0
+
+ /* _final_reduction_for_128 */
+3: ADD X2, X2, #0x10
+ CMP X2, #0x0
+ B.EQ 5f // _128_done, ==0
+
+ /* _get_last_two_xmms */
+6: MOV V12.16B, V13.16B
+ SUB X1, X1, #0x10
+ ADD X1, X1, X2
+ LDR Q11, [X1], #0x10
+ REV64 V11.16B, V11.16B
+ EXT V11.16B, V11.16B, V11.16B, #8
+
+ CMP X2, #8
+ B.EQ 50f
+ B.LT 51f
+ B.GT 52f
+
+50:
+ /* dont use X register as temp one */
+ FMOV D14, D12
+ MOVI D12, #0
+ MOV V12.D[1],V14.D[0]
+ B 53f
+51:
+ MOV X9, #64
+ LSL X13, X2, #3 // <<3 equal x8
+ SUB X9, X9, X13
+ MOV X5, V12.D[0] // low 64-bit
+ MOV X6, V12.D[1] // high 64-bit
+ LSR X10, X5, X9 // high bit of low 64-bit
+ LSL X7, X5, X13
+ LSL X8, X6, X13
+ ORR X8, X8, X10 // combination of high 64-bit
+ MOV V12.D[1], X8
+ MOV V12.D[0], X7
+
+ B 53f
+52:
+ LSL X13, X2, #3 // <<3 equal x8
+ SUB X13, X13, #64
+
+ DUP V18.2D, X13
+ FMOV D16, D12
+ USHL D16, D16, D18
+ EXT V12.16B, V16.16B, V16.16B, #8
+
+53:
+ MOVI D14, #0 //add one zero constant
+
+ CMP X2, #0
+ B.EQ 30f
+ CMP X2, #1
+ B.EQ 31f
+ CMP X2, #2
+ B.EQ 32f
+ CMP X2, #3
+ B.EQ 33f
+ CMP X2, #4
+ B.EQ 34f
+ CMP X2, #5
+ B.EQ 35f
+ CMP X2, #6
+ B.EQ 36f
+ CMP X2, #7
+ B.EQ 37f
+ CMP X2, #8
+ B.EQ 38f
+ CMP X2, #9
+ B.EQ 39f
+ CMP X2, #10
+ B.EQ 40f
+ CMP X2, #11
+ B.EQ 41f
+ CMP X2, #12
+ B.EQ 42f
+ CMP X2, #13
+ B.EQ 43f
+ CMP X2, #14
+ B.EQ 44f
+ CMP X2, #15
+ B.EQ 45f
+
+ // >> 128bit
+30:
+ EOR V13.16B, V13.16B, V13.16B
+ EOR V8.16B, V8.16B, V8.16B
+ LDR Q9,=0xffffffffffffffffffffffffffffffff
+ B 46f
+
+ // >> 120bit
+31:
+ USHR V13.2D, V13.2D, #56
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xff
+ LDR Q9,=0xffffffffffffffffffffffffffffff00
+ B 46f
+
+ // >> 112bit
+32:
+ USHR V13.2D, V13.2D, #48
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffff
+ LDR Q9,=0xffffffffffffffffffffffffffff0000
+ B 46f
+
+ // >> 104bit
+33:
+ USHR V13.2D, V13.2D, #40
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffff
+ LDR Q9,=0xffffffffffffffffffffffffff000000
+ B 46f
+
+ // >> 96bit
+34:
+ USHR V13.2D, V13.2D, #32
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffffff
+ LDR Q9,=0xffffffffffffffffffffffff00000000
+ B 46f
+
+ // >> 88bit
+35:
+ USHR V13.2D, V13.2D, #24
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffffffff
+ LDR Q9,=0xffffffffffffffffffffff0000000000
+ B 46f
+
+ // >> 80bit
+36:
+ USHR V13.2D, V13.2D, #16
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffffffffff
+ LDR Q9,=0xffffffffffffffffffff000000000000
+ B 46f
+
+ // >> 72bit
+37:
+ USHR V13.2D, V13.2D, #8
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffffffffffff
+ LDR Q9,=0xffffffffffffffffff00000000000000
+ B 46f
+
+ // >> 64bit
+38:
+ EXT V13.16B, V13.16B, V14.16B, #8
+ LDR Q8,=0xffffffffffffffff
+ LDR Q9,=0xffffffffffffffff0000000000000000
+ B 46f
+
+ // >> 56bit
+39:
+ EXT V13.16B, V13.16B, V13.16B, #7
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+ MOV V13.B[9], V14.B[0]
+
+ LDR Q8,=0xffffffffffffffffff
+ LDR Q9,=0xffffffffffffff000000000000000000
+ B 46f
+
+ // >> 48bit
+40:
+ EXT V13.16B, V13.16B, V13.16B, #6
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+
+ LDR Q8,=0xffffffffffffffffffff
+ LDR Q9,=0xffffffffffff00000000000000000000
+ B 46f
+
+ // >> 40bit
+41:
+ EXT V13.16B, V13.16B, V13.16B, #5
+ MOV V13.S[3], V14.S[0]
+ MOV V13.B[11], V14.B[0]
+
+ LDR Q8,=0xffffffffffffffffffffff
+ LDR Q9,=0xffffffffff0000000000000000000000
+ B 46f
+
+ // >> 32bit
+42:
+ EXT V13.16B, V13.16B, V13.16B, #4
+ MOV V13.S[3], V14.S[0]
+
+ LDR Q8,=0xffffffffffffffffffffffff
+ LDR Q9,=0xffffffff000000000000000000000000
+ B 46f
+
+ // >> 24bit
+43:
+ EXT V13.16B, V13.16B, V13.16B, #3
+ MOV V13.H[7], V14.H[0]
+ MOV V13.B[13], V14.B[0]
+
+ LDR Q8,=0xffffffffffffffffffffffffff
+ LDR Q9,=0xffffff00000000000000000000000000
+ B 46f
+
+ // >> 16bit
+44:
+ EXT V13.16B, V13.16B, V13.16B, #2
+ MOV V13.H[7], V14.H[0]
+
+ LDR Q8,=0xffffffffffffffffffffffffffff
+ LDR Q9,=0xffff0000000000000000000000000000
+ B 46f
+
+ // >> 8bit
+45:
+ EXT V13.16B, V13.16B, V13.16B, #1
+ MOV V13.B[15], V14.B[0]
+
+ LDR Q8,=0xffffffffffffffffffffffffffffff
+ LDR Q9,=0xff000000000000000000000000000000
+
+ // backup V12 first
+ // pblendvb xmm1, xmm2
+46:
+ AND V12.16B, V12.16B, V9.16B
+ AND V11.16B, V11.16B, V8.16B
+ ORR V11.16B, V11.16B, V12.16B
+
+ MOV V12.16B, V11.16B
+ MOV V4.16B, V13.16B
+ PMULL2 V13.1Q, V13.2D, V6.2D
+ PMULL V4.1Q, V4.1D, V6.1D
+ EOR V13.16B, V13.16B, V4.16B
+ EOR V13.16B, V13.16B, V12.16B
+
+ /* _128_done. we change the Q6 D[0] and D[1] */
+5: LDR Q6, =0x2d560000000000001368000000000000
+ MOVI D14, #0
+ MOV V10.16B, V13.16B
+ PMULL2 V13.1Q, V13.2D, V6.2D
+
+ MOV V10.D[1], V10.D[0]
+ MOV V10.D[0], V14.D[0] //set zero
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ MOV V10.16B, V13.16B
+ LDR Q7, =0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+ AND V10.16B, V10.16B, V7.16B
+
+ MOV S13, V13.S[3]
+
+ PMULL V13.1Q, V13.1D, V6.1D
+ EOR V13.16B, V13.16B, V10.16B
+
+ /* _barrett */
+7: LDR Q6, =0x00000001f65a57f8000000018bb70000
+ MOVI D14, #0
+ MOV V10.16B, V13.16B
+ PMULL2 V13.1Q, V13.2D, V6.2D
+
+ EXT V13.16B, V13.16B, V13.16B, #12
+ MOV V13.S[0], V14.S[0]
+
+ EXT V6.16B, V6.16B, V6.16B, #8
+ PMULL2 V13.1Q, V13.2D, V6.2D
+
+ EXT V13.16B, V13.16B, V13.16B, #12
+ MOV V13.S[0], V14.S[0]
+
+ EOR V13.16B, V13.16B, V10.16B
+ MOV X0, V13.D[0]
+
+ /* _cleanup */
+8: MOV X14, #48
+ LSR X0, X0, X14
+99:
+ ADD sp, sp, #0x20
+
+ LDP Q18, Q19, [sp], #0x20
+ LDP Q16, Q17, [sp], #0x20
+ LDP Q14, Q15, [sp], #0x20
+
+ LDP Q8, Q9, [sp], #0x20
+ LDP Q6, Q7, [sp], #0x20
+ LDP Q4, Q5, [sp], #0x20
+ LDP Q12, Q13, [sp], #0x20
+ LDP Q10, Q11, [sp], #0x20
+ LDP X13, X14, [sp], #0x10
+ LDP X11, X12, [sp], #0x10
+ LDP X9, X10, [sp], #0x10
+ LDP X7, X8, [sp], #0x10
+ LDP X5, X6, [sp], #0x10
+
+ RET
+
+ /* _less_than_128 */
+2: CMP X2, #32
+ B.LT 9f // _less_than_32
+ LDR Q6, =0x06df0000000000002d56000000000000
+
+ LSL X0, X0, #48
+ LDR Q10, =0x0
+ MOV V10.D[1], X0
+ LDR Q13, [X1], #0x10
+ REV64 V13.16B, V13.16B
+ EXT V13.16B, V13.16B, V13.16B, #8
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ SUB X2, X2, #32
+ B 4b
+
+ /* _less_than_32 */
+9: CMP X2, #0
+ B.EQ 99b // _cleanup
+ LSL X0, X0, #48
+ LDR Q10,=0x0
+ MOV V10.D[1], X0
+
+ CMP X2, #16
+ B.EQ 10f // _exact_16_left
+ B.LE 11f // _less_than_16_left
+ LDR Q13, [X1], #0x10
+
+ REV64 V13.16B, V13.16B
+ EXT V13.16B, V13.16B, V13.16B, #8
+
+ EOR V13.16B, V13.16B, V10.16B
+ SUB X2, X2, #16
+ LDR Q6, =0x06df0000000000002d56000000000000
+ B 6b // _get_last_two_xmms
+
+ /* _less_than_16_left */
+11: CMP X2, #4
+ B.LT 13f // _only_less_than_4
+
+ /* backup the length of data, we used in _less_than_2_left */
+ MOV X8, X2
+ CMP X2, #8
+ B.LT 14f // _less_than_8_left
+
+ LDR X14, [X1], #8
+ /* push the data to stack, we backup the data to V10 */
+ STR X14, [sp, #0]
+ SUB X2, X2, #8
+ ADD X11, X11, #8
+
+ /* _less_than_8_left */
+14: CMP X2, #4
+ B.LT 15f // _less_than_4_left
+
+ /* get 32bit data */
+ LDR W5, [X1], #4
+
+ /* push the data to stack */
+ STR W5, [sp, X11]
+ SUB X2, X2, #4
+ ADD X11, X11, #4
+
+ /* _less_than_4_left */
+15: CMP X2, #2
+ B.LT 16f // _less_than_2_left
+
+ /* get 16bits data */
+ LDRH W6, [X1], #2
+
+ /* push the data to stack */
+ STRH W6, [sp, X11]
+ SUB X2, X2, #2
+ ADD X11, X11, #2
+
+ /* _less_than_2_left */
+16:
+ /* get 8bits data */
+ LDRB W7, [X1], #1
+ STRB W7, [sp, X11]
+ ADD X11, X11, #1
+
+ /* POP data from stack, store to V13 */
+ LDR Q13, [sp]
+ MOVI D14, #0
+ REV64 V13.16B, V13.16B
+ MOV V8.16B, V13.16B
+ MOV V13.D[1], V8.D[0]
+ MOV V13.D[0], V8.D[1]
+
+ EOR V13.16B, V13.16B, V10.16B
+ CMP X8, #15
+ B.EQ 80f
+ CMP X8, #14
+ B.EQ 81f
+ CMP X8, #13
+ B.EQ 82f
+ CMP X8, #12
+ B.EQ 83f
+ CMP X8, #11
+ B.EQ 84f
+ CMP X8, #10
+ B.EQ 85f
+ CMP X8, #9
+ B.EQ 86f
+ CMP X8, #8
+ B.EQ 87f
+ CMP X8, #7
+ B.EQ 88f
+ CMP X8, #6
+ B.EQ 89f
+ CMP X8, #5
+ B.EQ 90f
+ CMP X8, #4
+ B.EQ 91f
+ CMP X8, #3
+ B.EQ 92f
+ CMP X8, #2
+ B.EQ 93f
+ CMP X8, #1
+ B.EQ 94f
+ CMP X8, #0
+ B.EQ 95f
+
+80:
+ EXT V13.16B, V13.16B, V13.16B, #1
+ MOV V13.B[15], V14.B[0]
+ B 5b
+
+81:
+ EXT V13.16B, V13.16B, V13.16B, #2
+ MOV V13.H[7], V14.H[0]
+ B 5b
+
+82:
+ EXT V13.16B, V13.16B, V13.16B, #3
+ MOV V13.H[7], V14.H[0]
+ MOV V13.B[13], V14.B[0]
+ B 5b
+83:
+
+ EXT V13.16B, V13.16B, V13.16B, #4
+ MOV V13.S[3], V14.S[0]
+ B 5b
+
+84:
+ EXT V13.16B, V13.16B, V13.16B, #5
+ MOV V13.S[3], V14.S[0]
+ MOV V13.B[11], V14.B[0]
+ B 5b
+
+85:
+ EXT V13.16B, V13.16B, V13.16B, #6
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+ B 5b
+
+86:
+ EXT V13.16B, V13.16B, V13.16B, #7
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+ MOV V13.B[9], V14.B[0]
+ B 5b
+
+87:
+ MOV V13.D[0], V13.D[1]
+ MOV V13.D[1], V14.D[0]
+ B 5b
+
+88:
+ EXT V13.16B, V13.16B, V13.16B, #9
+ MOV V13.D[1], V14.D[0]
+ MOV V13.B[7], V14.B[0]
+ B 5b
+
+89:
+ EXT V13.16B, V13.16B, V13.16B, #10
+ MOV V13.D[1], V14.D[0]
+ MOV V13.H[3], V14.H[0]
+ B 5b
+
+90:
+ EXT V13.16B, V13.16B, V13.16B, #11
+ MOV V13.D[1], V14.D[0]
+ MOV V13.H[3], V14.H[0]
+ MOV V13.B[5], V14.B[0]
+ B 5b
+
+91:
+ MOV V13.S[0], V13.S[3]
+ MOV V13.D[1], V14.D[0]
+ MOV V13.S[1], V14.S[0]
+ B 5b
+
+92:
+ EXT V13.16B, V13.16B, V13.16B, #13
+ MOV V13.D[1], V14.D[0]
+ MOV V13.S[1], V14.S[0]
+ MOV V13.B[3], V14.B[0]
+ B 5b
+
+93:
+ MOV V15.H[0], V13.H[7]
+ MOV V13.16B, V14.16B
+ MOV V13.H[0], V15.H[0]
+ B 5b
+
+94:
+ MOV V15.B[0], V13.B[15]
+ MOV V13.16B, V14.16B
+ MOV V13.B[0], V15.B[0]
+ B 5b
+
+95:
+ LDR Q13,=0x0
+ B 5b // _128_done
+
+ /* _exact_16_left */
+10:
+ LD1 { V13.2D }, [X1], #0x10
+
+ REV64 V13.16B, V13.16B
+ EXT V13.16B, V13.16B, V13.16B, #8
+ EOR V13.16B, V13.16B, V10.16B
+ B 5b // _128_done
+
+ /* _only_less_than_4 */
+13: CMP X2, #3
+ MOVI D14, #0
+ B.LT 17f //_only_less_than_3
+
+ LDR S13, [X1], #4
+ MOV V13.B[15], V13.B[0]
+ MOV V13.B[14], V13.B[1]
+ MOV V13.B[13], V13.B[2]
+ MOV V13.S[0], V13.S[1]
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ EXT V13.16B, V13.16B, V13.16B, #5
+
+ MOV V13.S[3], V14.S[0]
+ MOV V13.B[11], V14.B[0]
+
+ B 7b // _barrett
+ /* _only_less_than_3 */
+17:
+ CMP X2, #2
+ B.LT 18f // _only_less_than_2
+
+ LDR H13, [X1], #2
+ MOV V13.B[15], V13.B[0]
+ MOV V13.B[14], V13.B[1]
+ MOV V13.H[0], V13.H[1]
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ EXT V13.16B, V13.16B, V13.16B, #6
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+
+ B 7b // _barrett
+
+ /* _only_less_than_2 */
+18:
+ LDRB W7, [X1], #1
+ LDR Q13, = 0x0
+ MOV V13.B[15], W7
+
+ EOR V13.16B, V13.16B, V10.16B
+
+ EXT V13.16B, V13.16B, V13.16B, #7
+ MOV V13.S[3], V14.S[0]
+ MOV V13.H[5], V14.H[0]
+ MOV V13.B[9], V14.B[0]
+
+ B 7b // _barrett
diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c
new file mode 100644
index 0000000..a6d8c5c
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-neon_glue.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016-2017 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+asmlinkage __u16 crc_t10dif_neon(__u16 crc, const unsigned char *buf,
+ size_t len);
+
+struct chksum_desc_ctx {
+ __u16 crc;
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = crc_t10dif_neon(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__u16 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__u16 *)out = crc_t10dif_neon(*crcp, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-neon",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init crct10dif_arm64_mod_init(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_arm64_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_arm64_mod_init);
+module_exit(crct10dif_arm64_mod_fini);
+
+MODULE_AUTHOR("YueHaibing <yuehaibing@xxxxxxxxxx>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with ARM64 NEON instruction.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-neon");
--
2.7.0