[PATCH 2/3] crypto: x86/aes-xts - eliminate a few more instructions

From: Eric Biggers
Date: Fri Apr 12 2024 - 23:21:40 EST


From: Eric Biggers <ebiggers@xxxxxxxxxx>

- For conditionally subtracting 16 from LEN when decrypting a message
whose length isn't a multiple of 16, use the cmovnz instruction
instead of branching to an out-of-line block and back.

- Fold the addition of 4*VL to LEN into the subtraction of VL or 16
from LEN.

- Remove an unnecessary test instruction. The preceding add to LEN
already sets the zero flag, so the following jump can use it directly.

This results in slightly shorter code, both source and binary.

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
arch/x86/crypto/aes-xts-avx-x86_64.S | 39 ++++++++++------------------
1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index f5e7ab739105..802d3b90d337 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -557,24 +557,24 @@
.endm

.macro _aes_xts_crypt enc
_define_aliases

- // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
- movl 480(KEY), KEYLEN
-
.if !\enc
// When decrypting a message whose length isn't a multiple of the AES
// block length, exclude the last full block from the main loop by
// subtracting 16 from LEN. This is needed because ciphertext stealing
// decryption uses the last two tweaks in reverse order. We'll handle
// the last full block and the partial block specially at the end.
+ lea -16(LEN), %rax
test $15, LEN
- jnz .Lneed_cts_dec\@
-.Lxts_init\@:
+ cmovnz %rax, LEN
.endif

+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
// Setup the pointer to the round keys and cache as many as possible.
_setup_round_keys \enc

// Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks
@@ -659,15 +659,14 @@
vzeroupper
.endif
RET

.Lhandle_remainder\@:
- add $4*VL, LEN // Undo the extra sub from earlier.

// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
- sub $VL, LEN
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
jl .Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
_vmovdqu (SRC), V0
_aes_crypt \enc, , TWEAK0, V0
_vmovdqu V0, (DST)
@@ -675,13 +674,13 @@
add $VL, SRC
add $VL, DST
sub $VL, LEN
jge .Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
- add $VL-16, LEN // Undo the extra sub from earlier.
+ add $VL-16, LEN // Undo extra sub of VL, then sub 16.
.else
- sub $16, LEN
+ add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
.endif

// En/decrypt any remaining full blocks, one at a time.
jl .Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
@@ -692,28 +691,16 @@
add $16, SRC
add $16, DST
sub $16, LEN
jge .Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
- add $16, LEN // Undo the extra sub from earlier.
-
-.Lfull_blocks_done\@:
- // Now 0 <= LEN <= 15. If LEN is nonzero, do ciphertext stealing to
- // process the last 16 + LEN bytes. If LEN is zero, we're done.
- test LEN, LEN
- jnz .Lcts\@
- jmp .Ldone\@
-
-.if !\enc
-.Lneed_cts_dec\@:
- sub $16, LEN
- jmp .Lxts_init\@
-.endif
+ add $16, LEN // Undo the extra sub of 16.
+ // Now 0 <= LEN <= 15. If LEN is zero, we're done.
+ jz .Ldone\@

-.Lcts\@:
- // Do ciphertext stealing (CTS) to en/decrypt the last full block and
- // the partial block. TWEAK0_XMM contains the next tweak.
+ // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+ // Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
// If encrypting, the main loop already encrypted the last full block to
// create the CTS intermediate ciphertext. Prepare for the rest of CTS
// by rewinding the pointers and loading the intermediate ciphertext.
--
2.44.0