[PATCH 06/12] x86/crypto: aesni: Split AAD hash calculation to separate macro

From: Dave Watson
Date: Mon Dec 10 2018 - 14:59:38 EST


AAD hash only needs to be calculated once for each scatter/gather operation.
Move it to its own macro, and call it from GCM_INIT instead of
INITIAL_BLOCKS.

Signed-off-by: Dave Watson <davejwatson@xxxxxx>
---
arch/x86/crypto/aesni-intel_avx-x86_64.S | 228 ++++++++++-------------
arch/x86/crypto/aesni-intel_glue.c | 28 ++-
2 files changed, 115 insertions(+), 141 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 8e9ae4b26118..305abece93ad 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,6 +182,14 @@ aad_shift_arr:
.text


+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
HashKey = 16*6 # store HashKey <<1 mod poly here
HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
@@ -585,6 +593,74 @@ _T_16\@:
_return_T_done\@:
.endm

+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
+
+ mov \AAD, %r10 # r10 = AAD
+ mov \AADLEN, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor \T8, \T8, \T8
+ vpxor \T7, \T7, \T7
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), \T7
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T7, \T8, \T8
+ \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu \T8, \T7
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor \T7, \T7, \T7
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, \T7, \T7
+ vpxor \T1, \T7, \T7
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, \T7, \T7
+ vpxor \T1, \T7, \T7
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ vmovdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T8, \T7, \T7
+ \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ vmovdqu \T7, AadHash(arg2)
+.endm
+
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -701,72 +777,9 @@ _return_T_done\@:

.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
- j = 0
setreg
+ vmovdqu AadHash(arg2), reg_i

- mov arg7, %r10 # r10 = AAD
- mov arg8, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
# initialize the data pointer offset as zero
xor %r11d, %r11d

@@ -1535,7 +1548,13 @@ _initial_blocks_done\@:
#void aesni_gcm_precomp_avx_gen2
# (gcm_data *my_ctx_data,
# gcm_context_data *data,
-# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen2)
FUNC_SAVE
@@ -1560,6 +1579,8 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly


+ CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

FUNC_RESTORE
@@ -1716,7 +1737,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2)

.endm

-
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4#
@@ -1726,73 +1746,9 @@ ENDPROC(aesni_gcm_dec_avx_gen2)

.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
- j = 0
setreg
+ vmovdqu AadHash(arg2), reg_i

- mov arg7, %r10 # r10 = AAD
- mov arg8, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
-
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
# initialize the data pointer offset as zero
xor %r11d, %r11d

@@ -2581,8 +2537,13 @@ _initial_blocks_done\@:
#void aesni_gcm_precomp_avx_gen4
# (gcm_data *my_ctx_data,
# gcm_context_data *data,
-# u8 *hash_subkey)# /* H, the Hash sub key input.
-# Data starts on a 16-byte boundary. */
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
FUNC_SAVE
@@ -2606,6 +2567,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
#######################################################################
vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly

+ CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 7d1259feb0f9..2648842f1c3f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -189,7 +189,10 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
*/
asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data,
struct gcm_context_data *gdata,
- u8 *hash_subkey);
+ u8 *hash_subkey,
+ u8 *iv,
+ const u8 *aad,
+ unsigned long aad_len);

asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
@@ -214,7 +217,8 @@ static void aesni_gcm_enc_avx(void *ctx,
plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
- aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
}
@@ -231,7 +235,8 @@ static void aesni_gcm_dec_avx(void *ctx,
ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
- aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
}
@@ -246,7 +251,10 @@ static void aesni_gcm_dec_avx(void *ctx,
*/
asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
struct gcm_context_data *gdata,
- u8 *hash_subkey);
+ u8 *hash_subkey,
+ u8 *iv,
+ const u8 *aad,
+ unsigned long aad_len);

asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
@@ -271,11 +279,13 @@ static void aesni_gcm_enc_avx2(void *ctx,
plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
} else {
- aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
}
@@ -292,11 +302,13 @@ static void aesni_gcm_dec_avx2(void *ctx,
ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
} else {
- aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+ aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
+ aad, aad_len);
aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv,
aad, aad_len, auth_tag, auth_tag_len);
}
--
2.17.1