[PATCH v4 3/6] An accelerated AES/GCM stitched implementation.

From: Danny Tsen
Date: Mon Feb 20 2023 - 15:42:53 EST


Improve overall performance of AES/GCM encrypt and decrypt operations
for Power10 or later CPU.

Signed-off-by: Danny Tsen <dtsen@xxxxxxxxxxxxx>
---
arch/powerpc/crypto/aes-gcm-p10.S | 1521 +++++++++++++++++++++++++++++
1 file changed, 1521 insertions(+)
create mode 100644 arch/powerpc/crypto/aes-gcm-p10.S

diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
new file mode 100644
index 000000000000..a51f4b265308
--- /dev/null
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -0,0 +1,1521 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ #
+ # Accelerated AES-GCM stitched implementation for ppc64le.
+ #
+ # Copyright 2022- IBM Inc. All rights reserved
+ #
+ #===================================================================================
+ # Written by Danny Tsen <dtsen@xxxxxxxxxxxxx>
+ #
+ # GHASH is based on the Karatsuba multiplication method.
+ #
+ # Xi xor X1
+ #
+ # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+ # (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+ # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+ # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+ # (X4.h * H.h + X4.l * H.l + X4 * H)
+ #
+ # Xi = v0
+ # H Poly = v2
+ # Hash keys = v3 - v14
+ # ( H.l, H, H.h)
+ # ( H^2.l, H^2, H^2.h)
+ # ( H^3.l, H^3, H^3.h)
+ # ( H^4.l, H^4, H^4.h)
+ #
+ # v30 is IV
+ # v31 - counter 1
+ #
+ # AES used,
+ # vs0 - vs14 for round keys
+ # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+ #
+ # This implementation uses stitched AES-GCM approach to improve overall performance.
+ # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+ #
+ # ===================================================================================
+ #
+
+#include <asm/ppc_asm.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+ # 4x loops
+ # v15 - v18 - input states
+ # vs1 - vs9 - round keys
+ #
+.macro Loop_aes_middle4x
+ xxlor 19+32, 1, 1
+ xxlor 20+32, 2, 2
+ xxlor 21+32, 3, 3
+ xxlor 22+32, 4, 4
+
+ vcipher 15, 15, 19
+ vcipher 16, 16, 19
+ vcipher 17, 17, 19
+ vcipher 18, 18, 19
+
+ vcipher 15, 15, 20
+ vcipher 16, 16, 20
+ vcipher 17, 17, 20
+ vcipher 18, 18, 20
+
+ vcipher 15, 15, 21
+ vcipher 16, 16, 21
+ vcipher 17, 17, 21
+ vcipher 18, 18, 21
+
+ vcipher 15, 15, 22
+ vcipher 16, 16, 22
+ vcipher 17, 17, 22
+ vcipher 18, 18, 22
+
+ xxlor 19+32, 5, 5
+ xxlor 20+32, 6, 6
+ xxlor 21+32, 7, 7
+ xxlor 22+32, 8, 8
+
+ vcipher 15, 15, 19
+ vcipher 16, 16, 19
+ vcipher 17, 17, 19
+ vcipher 18, 18, 19
+
+ vcipher 15, 15, 20
+ vcipher 16, 16, 20
+ vcipher 17, 17, 20
+ vcipher 18, 18, 20
+
+ vcipher 15, 15, 21
+ vcipher 16, 16, 21
+ vcipher 17, 17, 21
+ vcipher 18, 18, 21
+
+ vcipher 15, 15, 22
+ vcipher 16, 16, 22
+ vcipher 17, 17, 22
+ vcipher 18, 18, 22
+
+ xxlor 23+32, 9, 9
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+.endm
+
+ # 8x loops
+ # v15 - v22 - input states
+ # vs1 - vs9 - round keys
+ #
+.macro Loop_aes_middle8x
+ xxlor 23+32, 1, 1
+ xxlor 24+32, 2, 2
+ xxlor 25+32, 3, 3
+ xxlor 26+32, 4, 4
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ vcipher 15, 15, 25
+ vcipher 16, 16, 25
+ vcipher 17, 17, 25
+ vcipher 18, 18, 25
+ vcipher 19, 19, 25
+ vcipher 20, 20, 25
+ vcipher 21, 21, 25
+ vcipher 22, 22, 25
+
+ vcipher 15, 15, 26
+ vcipher 16, 16, 26
+ vcipher 17, 17, 26
+ vcipher 18, 18, 26
+ vcipher 19, 19, 26
+ vcipher 20, 20, 26
+ vcipher 21, 21, 26
+ vcipher 22, 22, 26
+
+ xxlor 23+32, 5, 5
+ xxlor 24+32, 6, 6
+ xxlor 25+32, 7, 7
+ xxlor 26+32, 8, 8
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ vcipher 15, 15, 25
+ vcipher 16, 16, 25
+ vcipher 17, 17, 25
+ vcipher 18, 18, 25
+ vcipher 19, 19, 25
+ vcipher 20, 20, 25
+ vcipher 21, 21, 25
+ vcipher 22, 22, 25
+
+ vcipher 15, 15, 26
+ vcipher 16, 16, 26
+ vcipher 17, 17, 26
+ vcipher 18, 18, 26
+ vcipher 19, 19, 26
+ vcipher 20, 20, 26
+ vcipher 21, 21, 26
+ vcipher 22, 22, 26
+
+ xxlor 23+32, 9, 9
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+.endm
+
+.macro Loop_aes_middle_1x
+ xxlor 19+32, 1, 1
+ xxlor 20+32, 2, 2
+ xxlor 21+32, 3, 3
+ xxlor 22+32, 4, 4
+
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 22
+
+ xxlor 19+32, 5, 5
+ xxlor 20+32, 6, 6
+ xxlor 21+32, 7, 7
+ xxlor 22+32, 8, 8
+
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 22
+
+ xxlor 19+32, 9, 9
+ vcipher 15, 15, 19
+.endm
+
+ #
+ # Compute 4x hash values based on Karatsuba method.
+ #
+.macro ppc_aes_gcm_ghash
+ vxor 15, 15, 0
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 17
+ vpmsumd 27, 4, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # M
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 15 # H4.H * X.H
+ vpmsumd 25, 11, 16
+ vpmsumd 26, 8, 17
+ vpmsumd 27, 5, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27
+
+ vxor 24, 24, 29
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 23, 23, 27
+
+ xxlor 32, 23+32, 23+32 # update hash
+
+.endm
+
+ #
+ # Combine two 4x ghash
+ # v15 - v22 - input blocks
+ #
+.macro ppc_aes_gcm_ghash2_4x
+ # first 4x hash
+ vxor 15, 15, 0 # Xi + X
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 17
+ vpmsumd 27, 4, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+
+ vxor 24, 24, 27 # M
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 15 # H4.H * X.H
+ vpmsumd 25, 11, 16
+ vpmsumd 26, 8, 17
+ vpmsumd 27, 5, 18
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # H
+
+ vxor 24, 24, 29 # H + mH
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 27, 23, 27 # 1st Xi
+
+ # 2nd 4x hash
+ vpmsumd 24, 9, 20
+ vpmsumd 25, 6, 21
+ vpmsumd 26, 3, 22
+ vxor 19, 19, 27 # Xi + X
+ vpmsumd 23, 12, 19 # H4.L * X.L
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
+ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 26, 7, 21
+ vpmsumd 27, 4, 22
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 29, 29, 29
+
+ vxor 24, 24, 27 # M
+ vsldoi 26, 24, 29, 8 # mL
+ vsldoi 29, 29, 24, 8 # mH
+ vxor 23, 23, 26 # mL + L
+
+ vsldoi 23, 23, 23, 8 # swap
+ vxor 23, 23, 28
+
+ vpmsumd 24, 14, 19 # H4.H * X.H
+ vpmsumd 25, 11, 20
+ vpmsumd 26, 8, 21
+ vpmsumd 27, 5, 22
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26
+ vxor 24, 24, 27 # H
+
+ vxor 24, 24, 29 # H + mH
+
+ # sum hash and reduction with H Poly
+ vsldoi 27, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 27, 24
+ vxor 23, 23, 27
+
+ xxlor 32, 23+32, 23+32 # update hash
+
+.endm
+
+ #
+ # Compute update single hash
+ #
+.macro ppc_update_hash_1x
+ vxor 28, 28, 0
+
+ vxor 19, 19, 19
+
+ vpmsumd 22, 3, 28 # L
+ vpmsumd 23, 4, 28 # M
+ vpmsumd 24, 5, 28 # H
+
+ vpmsumd 27, 22, 2 # reduction
+
+ vsldoi 25, 23, 19, 8 # mL
+ vsldoi 26, 19, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
+
+ vsldoi 22, 22, 22, 8 # swap
+ vxor 22, 22, 27
+
+ vsldoi 20, 22, 22, 8 # swap
+ vpmsumd 22, 22, 2 # reduction
+ vxor 20, 20, 24
+ vxor 22, 22, 20
+
+ vmr 0, 22 # update hash
+
+.endm
+
+.macro SAVE_REGS
+ stdu 1,-640(1)
+ mflr 0
+
+ std 14,112(1)
+ std 15,120(1)
+ std 16,128(1)
+ std 17,136(1)
+ std 18,144(1)
+ std 19,152(1)
+ std 20,160(1)
+ std 21,168(1)
+ li 9, 256
+ stvx 20, 9, 1
+ addi 9, 9, 16
+ stvx 21, 9, 1
+ addi 9, 9, 16
+ stvx 22, 9, 1
+ addi 9, 9, 16
+ stvx 23, 9, 1
+ addi 9, 9, 16
+ stvx 24, 9, 1
+ addi 9, 9, 16
+ stvx 25, 9, 1
+ addi 9, 9, 16
+ stvx 26, 9, 1
+ addi 9, 9, 16
+ stvx 27, 9, 1
+ addi 9, 9, 16
+ stvx 28, 9, 1
+ addi 9, 9, 16
+ stvx 29, 9, 1
+ addi 9, 9, 16
+ stvx 30, 9, 1
+ addi 9, 9, 16
+ stvx 31, 9, 1
+ stxv 14, 464(1)
+ stxv 15, 480(1)
+ stxv 16, 496(1)
+ stxv 17, 512(1)
+ stxv 18, 528(1)
+ stxv 19, 544(1)
+ stxv 20, 560(1)
+ stxv 21, 576(1)
+ stxv 22, 592(1)
+ std 0, 656(1)
+.endm
+
+.macro RESTORE_REGS
+ lxv 14, 464(1)
+ lxv 15, 480(1)
+ lxv 16, 496(1)
+ lxv 17, 512(1)
+ lxv 18, 528(1)
+ lxv 19, 544(1)
+ lxv 20, 560(1)
+ lxv 21, 576(1)
+ lxv 22, 592(1)
+ li 9, 256
+ lvx 20, 9, 1
+ addi 9, 9, 16
+ lvx 21, 9, 1
+ addi 9, 9, 16
+ lvx 22, 9, 1
+ addi 9, 9, 16
+ lvx 23, 9, 1
+ addi 9, 9, 16
+ lvx 24, 9, 1
+ addi 9, 9, 16
+ lvx 25, 9, 1
+ addi 9, 9, 16
+ lvx 26, 9, 1
+ addi 9, 9, 16
+ lvx 27, 9, 1
+ addi 9, 9, 16
+ lvx 28, 9, 1
+ addi 9, 9, 16
+ lvx 29, 9, 1
+ addi 9, 9, 16
+ lvx 30, 9, 1
+ addi 9, 9, 16
+ lvx 31, 9, 1
+
+ ld 0, 656(1)
+ ld 14,112(1)
+ ld 15,120(1)
+ ld 16,128(1)
+ ld 17,136(1)
+ ld 18,144(1)
+ ld 19,152(1)
+ ld 20,160(1)
+ ld 21,168(1)
+
+ mtlr 0
+ addi 1, 1, 640
+.endm
+
+.macro LOAD_HASH_TABLE
+ # Load Xi
+ lxvb16x 32, 0, 8 # load Xi
+
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 32
+ lxvd2x 2+32, 10, 8 # H Poli
+ li 10, 48
+ lxvd2x 3+32, 10, 8 # Hl
+ li 10, 64
+ lxvd2x 4+32, 10, 8 # H
+ li 10, 80
+ lxvd2x 5+32, 10, 8 # Hh
+
+ li 10, 96
+ lxvd2x 6+32, 10, 8 # H^2l
+ li 10, 112
+ lxvd2x 7+32, 10, 8 # H^2
+ li 10, 128
+ lxvd2x 8+32, 10, 8 # H^2h
+
+ li 10, 144
+ lxvd2x 9+32, 10, 8 # H^3l
+ li 10, 160
+ lxvd2x 10+32, 10, 8 # H^3
+ li 10, 176
+ lxvd2x 11+32, 10, 8 # H^3h
+
+ li 10, 192
+ lxvd2x 12+32, 10, 8 # H^4l
+ li 10, 208
+ lxvd2x 13+32, 10, 8 # H^4
+ li 10, 224
+ lxvd2x 14+32, 10, 8 # H^4h
+.endm
+
+ #
+ # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+ # const char *rk, unsigned char iv[16], void *Xip);
+ #
+ # r3 - inp
+ # r4 - out
+ # r5 - len
+ # r6 - AES round keys
+ # r7 - iv and other data
+ # r8 - Xi, HPoli, hash keys
+ #
+ # rounds is at offset 240 in rk
+ # Xi is at 0 in gcm_table (Xip).
+ #
+_GLOBAL(aes_p10_gcm_encrypt)
+.align 5
+
+ SAVE_REGS
+
+ LOAD_HASH_TABLE
+
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
+
+ mr 12, 5 # length
+ li 11, 0 # block index
+
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ # load round key to VSR
+ lxv 0, 0(6)
+ lxv 1, 0x10(6)
+ lxv 2, 0x20(6)
+ lxv 3, 0x30(6)
+ lxv 4, 0x40(6)
+ lxv 5, 0x50(6)
+ lxv 6, 0x60(6)
+ lxv 7, 0x70(6)
+ lxv 8, 0x80(6)
+ lxv 9, 0x90(6)
+ lxv 10, 0xa0(6)
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 9,240(6)
+
+ #
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+
+ cmpdi 9, 10
+ beq Loop_aes_gcm_8x
+
+ # load 2 more round keys (v11, v12)
+ lxv 11, 0xb0(6)
+ lxv 12, 0xc0(6)
+
+ cmpdi 9, 12
+ beq Loop_aes_gcm_8x
+
+ # load 2 more round keys (v11, v12, v13, v14)
+ lxv 13, 0xd0(6)
+ lxv 14, 0xe0(6)
+ cmpdi 9, 14
+ beq Loop_aes_gcm_8x
+
+ b aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x:
+ mr 14, 3
+ mr 9, 4
+
+ #
+ # check partial block
+ #
+Continue_partial_check:
+ ld 15, 56(7)
+ cmpdi 15, 0
+ beq Continue
+ bgt Final_block
+ cmpdi 15, 16
+ blt Final_block
+
+Continue:
+ # n blcoks
+ li 10, 128
+ divdu 10, 12, 10 # n 128 bytes-blocks
+ cmpdi 10, 0
+ beq Loop_last_block
+
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 16, 30, 29
+ vaddudm 30, 30, 31
+ vxor 17, 30, 29
+ vaddudm 30, 30, 31
+ vxor 18, 30, 29
+ vaddudm 30, 30, 31
+ vxor 19, 30, 29
+ vaddudm 30, 30, 31
+ vxor 20, 30, 29
+ vaddudm 30, 30, 31
+ vxor 21, 30, 29
+ vaddudm 30, 30, 31
+ vxor 22, 30, 29
+
+ mtctr 10
+
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
+
+ lwz 10, 240(6)
+
+Loop_8x_block:
+
+ lxvb16x 15, 0, 14 # load block
+ lxvb16x 16, 15, 14 # load block
+ lxvb16x 17, 16, 14 # load block
+ lxvb16x 18, 17, 14 # load block
+ lxvb16x 19, 18, 14 # load block
+ lxvb16x 20, 19, 14 # load block
+ lxvb16x 21, 20, 14 # load block
+ lxvb16x 22, 21, 14 # load block
+ addi 14, 14, 128
+
+ Loop_aes_middle8x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_ghash
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_ghash
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_ghash
+ b aes_gcm_out
+
+Do_next_ghash:
+
+ #
+ # last round
+ vcipherlast 15, 15, 23
+ vcipherlast 16, 16, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ xxlxor 48, 48, 16
+ stxvb16x 48, 15, 9 # store output
+
+ vcipherlast 17, 17, 23
+ vcipherlast 18, 18, 23
+
+ xxlxor 49, 49, 17
+ stxvb16x 49, 16, 9 # store output
+ xxlxor 50, 50, 18
+ stxvb16x 50, 17, 9 # store output
+
+ vcipherlast 19, 19, 23
+ vcipherlast 20, 20, 23
+
+ xxlxor 51, 51, 19
+ stxvb16x 51, 18, 9 # store output
+ xxlxor 52, 52, 20
+ stxvb16x 52, 19, 9 # store output
+
+ vcipherlast 21, 21, 23
+ vcipherlast 22, 22, 23
+
+ xxlxor 53, 53, 21
+ stxvb16x 53, 20, 9 # store output
+ xxlxor 54, 54, 22
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ # ghash here
+ ppc_aes_gcm_ghash2_4x
+
+ xxlor 27+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vmr 29, 30
+ vxor 15, 30, 27 # add round key
+ vaddudm 30, 30, 31
+ vxor 16, 30, 27
+ vaddudm 30, 30, 31
+ vxor 17, 30, 27
+ vaddudm 30, 30, 31
+ vxor 18, 30, 27
+ vaddudm 30, 30, 31
+ vxor 19, 30, 27
+ vaddudm 30, 30, 31
+ vxor 20, 30, 27
+ vaddudm 30, 30, 31
+ vxor 21, 30, 27
+ vaddudm 30, 30, 31
+ vxor 22, 30, 27
+
+ addi 12, 12, -128
+ addi 11, 11, 128
+
+ bdnz Loop_8x_block
+
+ vmr 30, 29
+ stxvb16x 30+32, 0, 7 # update IV
+
+Loop_last_block:
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+ # loop last few blocks
+ li 10, 16
+ divdu 10, 12, 10
+
+ mtctr 10
+
+ lwz 10, 240(6)
+
+ cmpdi 12, 16
+ blt Final_block
+
+Next_rem_block:
+ lxvb16x 15, 0, 14 # load block
+
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_1x
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_1x
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_1x
+
+Do_next_1x:
+ vcipherlast 15, 15, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
+
+ vmr 28, 15
+ ppc_update_hash_1x
+
+ addi 12, 12, -16
+ addi 11, 11, 16
+ xxlor 19+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 15, 30, 19 # add round key
+
+ bdnz Next_rem_block
+
+ li 15, 0
+ std 15, 56(7) # clear partial?
+ stxvb16x 30+32, 0, 7 # update IV
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+Final_block:
+ lwz 10, 240(6)
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_final_1x
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_final_1x
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_final_1x
+
+Do_final_1x:
+ vcipherlast 15, 15, 23
+
+ # check partial block
+ li 21, 0 # encrypt
+ ld 15, 56(7) # partial?
+ cmpdi 15, 0
+ beq Normal_block
+ bl Do_partial_block
+
+ cmpdi 12, 0
+ ble aes_gcm_out
+
+ b Continue_partial_check
+
+Normal_block:
+ lxvb16x 15, 0, 14 # load last block
+ xxlxor 47, 47, 15
+
+ # create partial block mask
+ li 15, 16
+ sub 15, 15, 12 # index to the mask
+
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stvx 16, 10, 1
+ addi 10, 10, 16
+ stvx 17, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x 16, 15, 10 # load partial block mask
+ xxland 47, 47, 16
+
+ vmr 28, 15
+ ppc_update_hash_1x
+
+ # * should store only the remaining bytes.
+ bl Write_partial_block
+
+ stxvb16x 30+32, 0, 7 # update IV
+ std 12, 56(7) # update partial?
+ li 16, 16
+
+ stxvb16x 32, 0, 8 # write out Xi
+ stxvb16x 32, 16, 8 # write out Xi
+ b aes_gcm_out
+
+ #
+ # Compute data mask
+ #
+.macro GEN_MASK _mask _start _end
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stxvb16x 17+32, 10, 1
+ add 10, 10, \_start
+ stxvb16x 16+32, 10, 1
+ add 10, 10, \_end
+ stxvb16x 17+32, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x \_mask, 0, 10 # load partial block mask
+.endm
+
+ #
+ # Handle multiple partial blocks for encrypt and decrypt
+ # operations.
+ #
+SYM_FUNC_START_LOCAL(Do_partial_block)
+ add 17, 15, 5
+ cmpdi 17, 16
+ bgt Big_block
+ GEN_MASK 18, 15, 5
+ b _Partial
+SYM_FUNC_END(Do_partial_block)
+Big_block:
+ li 16, 16
+ GEN_MASK 18, 15, 16
+
+_Partial:
+ lxvb16x 17+32, 0, 14 # load last block
+ sldi 16, 15, 3
+ mtvsrdd 32+16, 0, 16
+ vsro 17, 17, 16
+ xxlxor 47, 47, 17+32
+ xxland 47, 47, 18
+
+ vxor 0, 0, 0 # clear Xi
+ vmr 28, 15
+
+ cmpdi 21, 0 # encrypt/decrypt ops?
+ beq Skip_decrypt
+ xxland 32+28, 32+17, 18
+
+Skip_decrypt:
+
+ ppc_update_hash_1x
+
+ li 16, 16
+ lxvb16x 32+29, 16, 8
+ vxor 0, 0, 29
+ stxvb16x 32, 0, 8 # save Xi
+ stxvb16x 32, 16, 8 # save Xi
+
+ # store partial block
+ # loop the rest of the stream if any
+ sldi 16, 15, 3
+ mtvsrdd 32+16, 0, 16
+ vslo 15, 15, 16
+ #stxvb16x 15+32, 0, 9 # last block
+
+ li 16, 16
+ sub 17, 16, 15 # 16 - partial
+
+ add 16, 15, 5
+ cmpdi 16, 16
+ bgt Larger_16
+ mr 17, 5
+Larger_16:
+
+ # write partial
+ li 10, 192
+ stxvb16x 15+32, 10, 1 # save current block
+
+ addi 10, 9, -1
+ addi 16, 1, 191
+ mtctr 17 # move partial byte count
+
+Write_last_partial:
+ lbzu 18, 1(16)
+ stbu 18, 1(10)
+ bdnz Write_last_partial
+ # Complete loop partial
+
+ add 14, 14, 17
+ add 9, 9, 17
+ sub 12, 12, 17
+ add 11, 11, 17
+
+ add 15, 15, 5
+ cmpdi 15, 16
+ blt Save_partial
+
+ vaddudm 30, 30, 31
+ stxvb16x 30+32, 0, 7 # update IV
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+ li 15, 0
+ std 15, 56(7) # partial done - clear
+ b Partial_done
+Save_partial:
+ std 15, 56(7) # partial
+
+Partial_done:
+ blr
+
+ #
+ # Write partial block
+ # r9 - output
+ # r12 - remaining bytes
+ # v15 - partial input data
+ #
+SYM_FUNC_START_LOCAL(Write_partial_block)
+ li 10, 192
+ stxvb16x 15+32, 10, 1 # last block
+
+ addi 10, 9, -1
+ addi 16, 1, 191
+
+ mtctr 12 # remaining bytes
+ li 15, 0
+
+Write_last_byte:
+ lbzu 14, 1(16)
+ stbu 14, 1(10)
+ bdnz Write_last_byte
+ blr
+SYM_FUNC_END(Write_partial_block)
+
+aes_gcm_out:
+ # out = state
+ stxvb16x 32, 0, 8 # write out Xi
+ add 3, 11, 12 # return count
+
+ RESTORE_REGS
+ blr
+
+ #
+ # 8x Decrypt
+ #
+_GLOBAL(aes_p10_gcm_decrypt)
+.align 5
+
+ SAVE_REGS
+
+ LOAD_HASH_TABLE
+
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
+
+ mr 12, 5 # length
+ li 11, 0 # block index
+
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ # load round key to VSR
+ lxv 0, 0(6)
+ lxv 1, 0x10(6)
+ lxv 2, 0x20(6)
+ lxv 3, 0x30(6)
+ lxv 4, 0x40(6)
+ lxv 5, 0x50(6)
+ lxv 6, 0x60(6)
+ lxv 7, 0x70(6)
+ lxv 8, 0x80(6)
+ lxv 9, 0x90(6)
+ lxv 10, 0xa0(6)
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 9,240(6)
+
+ #
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 30, 29 # IV + round key - add round key 0
+
+ cmpdi 9, 10
+ beq Loop_aes_gcm_8x_dec
+
+ # load 2 more round keys (v11, v12)
+ lxv 11, 0xb0(6)
+ lxv 12, 0xc0(6)
+
+ cmpdi 9, 12
+ beq Loop_aes_gcm_8x_dec
+
+ # load 2 more round keys (v11, v12, v13, v14)
+ lxv 13, 0xd0(6)
+ lxv 14, 0xe0(6)
+ cmpdi 9, 14
+ beq Loop_aes_gcm_8x_dec
+
+ b aes_gcm_out
+
+.align 5
+Loop_aes_gcm_8x_dec:
+ mr 14, 3
+ mr 9, 4
+
+ #
+ # check partial block
+ #
+Continue_partial_check_dec:
+ ld 15, 56(7)
+ cmpdi 15, 0
+ beq Continue_dec
+ bgt Final_block_dec
+ cmpdi 15, 16
+ blt Final_block_dec
+
+Continue_dec:
+ # n blcoks
+ li 10, 128
+ divdu 10, 12, 10 # n 128 bytes-blocks
+ cmpdi 10, 0
+ beq Loop_last_block_dec
+
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 16, 30, 29
+ vaddudm 30, 30, 31
+ vxor 17, 30, 29
+ vaddudm 30, 30, 31
+ vxor 18, 30, 29
+ vaddudm 30, 30, 31
+ vxor 19, 30, 29
+ vaddudm 30, 30, 31
+ vxor 20, 30, 29
+ vaddudm 30, 30, 31
+ vxor 21, 30, 29
+ vaddudm 30, 30, 31
+ vxor 22, 30, 29
+
+ mtctr 10
+
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
+
+ lwz 10, 240(6)
+
+Loop_8x_block_dec:
+
+ lxvb16x 15, 0, 14 # load block
+ lxvb16x 16, 15, 14 # load block
+ lxvb16x 17, 16, 14 # load block
+ lxvb16x 18, 17, 14 # load block
+ lxvb16x 19, 18, 14 # load block
+ lxvb16x 20, 19, 14 # load block
+ lxvb16x 21, 20, 14 # load block
+ lxvb16x 22, 21, 14 # load block
+ addi 14, 14, 128
+
+ Loop_aes_middle8x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_ghash_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_ghash_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 16, 16, 23
+ vcipher 17, 17, 23
+ vcipher 18, 18, 23
+ vcipher 19, 19, 23
+ vcipher 20, 20, 23
+ vcipher 21, 21, 23
+ vcipher 22, 22, 23
+
+ vcipher 15, 15, 24
+ vcipher 16, 16, 24
+ vcipher 17, 17, 24
+ vcipher 18, 18, 24
+ vcipher 19, 19, 24
+ vcipher 20, 20, 24
+ vcipher 21, 21, 24
+ vcipher 22, 22, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_ghash_dec
+ b aes_gcm_out
+
+Do_next_ghash_dec:
+
+ #
+ # last round
+ vcipherlast 15, 15, 23
+ vcipherlast 16, 16, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ xxlxor 48, 48, 16
+ stxvb16x 48, 15, 9 # store output
+
+ vcipherlast 17, 17, 23
+ vcipherlast 18, 18, 23
+
+ xxlxor 49, 49, 17
+ stxvb16x 49, 16, 9 # store output
+ xxlxor 50, 50, 18
+ stxvb16x 50, 17, 9 # store output
+
+ vcipherlast 19, 19, 23
+ vcipherlast 20, 20, 23
+
+ xxlxor 51, 51, 19
+ stxvb16x 51, 18, 9 # store output
+ xxlxor 52, 52, 20
+ stxvb16x 52, 19, 9 # store output
+
+ vcipherlast 21, 21, 23
+ vcipherlast 22, 22, 23
+
+ xxlxor 53, 53, 21
+ stxvb16x 53, 20, 9 # store output
+ xxlxor 54, 54, 22
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ xxlor 15+32, 15, 15
+ xxlor 16+32, 16, 16
+ xxlor 17+32, 17, 17
+ xxlor 18+32, 18, 18
+ xxlor 19+32, 19, 19
+ xxlor 20+32, 20, 20
+ xxlor 21+32, 21, 21
+ xxlor 22+32, 22, 22
+
+ # ghash here
+ ppc_aes_gcm_ghash2_4x
+
+ xxlor 27+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vmr 29, 30
+ vxor 15, 30, 27 # add round key
+ vaddudm 30, 30, 31
+ vxor 16, 30, 27
+ vaddudm 30, 30, 31
+ vxor 17, 30, 27
+ vaddudm 30, 30, 31
+ vxor 18, 30, 27
+ vaddudm 30, 30, 31
+ vxor 19, 30, 27
+ vaddudm 30, 30, 31
+ vxor 20, 30, 27
+ vaddudm 30, 30, 31
+ vxor 21, 30, 27
+ vaddudm 30, 30, 31
+ vxor 22, 30, 27
+
+ addi 12, 12, -128
+ addi 11, 11, 128
+
+ bdnz Loop_8x_block_dec
+
+ vmr 30, 29
+ stxvb16x 30+32, 0, 7 # update IV
+
+Loop_last_block_dec:
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+ # loop last few blocks
+ li 10, 16
+ divdu 10, 12, 10
+
+ mtctr 10
+
+ lwz 10, 240(6)
+
+ cmpdi 12, 16
+ blt Final_block_dec
+
+Next_rem_block_dec:
+ lxvb16x 15, 0, 14 # load block
+
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_next_1x_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_next_1x_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_next_1x_dec
+
+Do_next_1x_dec:
+ vcipherlast 15, 15, 23
+
+ xxlxor 47, 47, 15
+ stxvb16x 47, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
+
+ xxlor 28+32, 15, 15
+ #vmr 28, 15
+ ppc_update_hash_1x
+
+ addi 12, 12, -16
+ addi 11, 11, 16
+ xxlor 19+32, 0, 0
+ vaddudm 30, 30, 31 # IV + counter
+ vxor 15, 30, 19 # add round key
+
+ bdnz Next_rem_block_dec
+
+ li 15, 0
+ std 15, 56(7) # clear partial?
+ stxvb16x 30+32, 0, 7 # update IV
+ cmpdi 12, 0
+ beq aes_gcm_out
+
+Final_block_dec:
+ lwz 10, 240(6)
+ Loop_aes_middle_1x
+
+ xxlor 23+32, 10, 10
+
+ cmpdi 10, 10
+ beq Do_final_1x_dec
+
+ # 192 bits
+ xxlor 24+32, 11, 11
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 12, 12
+
+ cmpdi 10, 12
+ beq Do_final_1x_dec
+
+ # 256 bits
+ xxlor 24+32, 13, 13
+
+ vcipher 15, 15, 23
+ vcipher 15, 15, 24
+
+ xxlor 23+32, 14, 14
+
+ cmpdi 10, 14
+ beq Do_final_1x_dec
+
+Do_final_1x_dec:
+ vcipherlast 15, 15, 23
+
+ # check partial block
+ li 21, 1 # decrypt
+ ld 15, 56(7) # partial?
+ cmpdi 15, 0
+ beq Normal_block_dec
+ bl Do_partial_block
+ cmpdi 12, 0
+ ble aes_gcm_out
+
+ b Continue_partial_check_dec
+
+Normal_block_dec:
+ lxvb16x 15, 0, 14 # load last block
+ xxlxor 47, 47, 15
+
+ # create partial block mask
+ li 15, 16
+ sub 15, 15, 12 # index to the mask
+
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ li 10, 192
+ stvx 16, 10, 1
+ addi 10, 10, 16
+ stvx 17, 10, 1
+
+ addi 10, 1, 192
+ lxvb16x 16, 15, 10 # load partial block mask
+ xxland 47, 47, 16
+
+ xxland 32+28, 15, 16
+ #vmr 28, 15
+ ppc_update_hash_1x
+
+ # * should store only the remaining bytes.
+ bl Write_partial_block
+
+ stxvb16x 30+32, 0, 7 # update IV
+ std 12, 56(7) # update partial?
+ li 16, 16
+
+ stxvb16x 32, 0, 8 # write out Xi
+ stxvb16x 32, 16, 8 # write out Xi
+ b aes_gcm_out
--
2.31.1