[PATCH] crypto: x86/aes-xts - access round keys using single-byte offsets

From: Eric Biggers
Date: Mon Apr 08 2024 - 20:02:26 EST


From: Eric Biggers <ebiggers@xxxxxxxxxx>

Access the AES round keys using offsets -7*16 through 7*16, instead of
0*16 through 14*16. This allows VEX-encoded instructions to address all
round keys using 1-byte offsets, whereas before some needed 4-byte
offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index fcaf64a2f8c6..95e412e7601d 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,11 +80,11 @@
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text

// Function parameters
.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
- // advanced to point directly to the round keys
+ // advanced to point to the 7th round key
.set SRC, %rsi // Pointer to next source data
.set DST, %rdx // Pointer to next destination data
.set LEN, %rcx // Remaining length in bytes
.set TWEAK, %r8 // Pointer to next tweak

@@ -406,28 +406,28 @@
.endif
.endm

// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
.macro _load_round_keys
- _vbroadcast128 0*16(KEY), KEY0
+ _vbroadcast128 -7*16(KEY), KEY0
.if USE_AVX10
- _vbroadcast128 1*16(KEY), KEY1
- _vbroadcast128 2*16(KEY), KEY2
- _vbroadcast128 3*16(KEY), KEY3
- _vbroadcast128 4*16(KEY), KEY4
- _vbroadcast128 5*16(KEY), KEY5
- _vbroadcast128 6*16(KEY), KEY6
- _vbroadcast128 7*16(KEY), KEY7
- _vbroadcast128 8*16(KEY), KEY8
- _vbroadcast128 9*16(KEY), KEY9
- _vbroadcast128 10*16(KEY), KEY10
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
// Note: if it's AES-128 or AES-192, the last several round keys won't
// be used. We do the loads anyway to save a conditional jump.
- _vbroadcast128 11*16(KEY), KEY11
- _vbroadcast128 12*16(KEY), KEY12
- _vbroadcast128 13*16(KEY), KEY13
- _vbroadcast128 14*16(KEY), KEY14
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
.endif
.endm

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key. The register length
@@ -454,13 +454,13 @@
.macro _vaes_1x enc, last, i, xmm_suffix, data
.if USE_AVX10
_vaes \enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
- _vaes \enc, \last, \i*16(KEY), \data
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
.else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
_vaes \enc, \last, V4, \data
.endif
.endif
.endm

@@ -475,11 +475,11 @@
_vaes \enc, \last, KEY\i, V1
_tweak_step (2*(\i-1) + 1)
_vaes \enc, \last, KEY\i, V2
_vaes \enc, \last, KEY\i, V3
.else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-1))
_vaes \enc, \last, V4, V0
_vaes \enc, \last, V4, V1
_tweak_step (2*(\i-1) + 1)
_vaes \enc, \last, V4, V2
@@ -526,13 +526,19 @@
_define_aliases

// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN

- // If decrypting, advance KEY to the decryption round keys.
-.if !\enc
- add $240, KEY
+ // Advance KEY to point to the 7th encryption round key (if encrypting)
+ // or the 7th decryption round key (if decrypting). This makes the
+ // offset to any round key be in the range [-112, 112], fitting in a
+ // signed byte. This shortens VEX-encoded instructions that access the
+ // 8th and later round keys which otherwise would need 4-byte offsets.
+.if \enc
+ add $7*16, KEY
+.else
+ add $(15+7)*16, KEY
.endif

// Check whether the data length is a multiple of the AES block length.
test $15, LEN
jnz .Lneed_cts\@
@@ -751,40 +757,41 @@

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
// u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
vmovdqu (%rsi), %xmm0
- vpxor 0*16(%rdi), %xmm0, %xmm0
+ add $7*16, %rdi
+ vpxor -7*16(%rdi), %xmm0, %xmm0
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
vaesenc 1*16(%rdi), %xmm0, %xmm0
vaesenc 2*16(%rdi), %xmm0, %xmm0
+ cmpl $24, 480-(7*16)(%rdi)
+ jle .Lencrypt_iv_aes_128_or_192
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenc 5*16(%rdi), %xmm0, %xmm0
vaesenc 6*16(%rdi), %xmm0, %xmm0
- vaesenc 7*16(%rdi), %xmm0, %xmm0
- vaesenc 8*16(%rdi), %xmm0, %xmm0
- vaesenc 9*16(%rdi), %xmm0, %xmm0
- cmpl $24, 480(%rdi)
- jle .Lencrypt_iv_aes_128_or_192
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenc 12*16(%rdi), %xmm0, %xmm0
- vaesenc 13*16(%rdi), %xmm0, %xmm0
- vaesenclast 14*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_done:
vmovdqu %xmm0, (%rsi)
RET

// Out-of-line handling of AES-128 and AES-192
.Lencrypt_iv_aes_128_or_192:
jz .Lencrypt_iv_aes_192
- vaesenclast 10*16(%rdi), %xmm0, %xmm0
+ vaesenclast 3*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
.Lencrypt_iv_aes_192:
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenclast 12*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenclast 5*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:

base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
prerequisite-patch-id: 8d09ed747039f5e718ac7267e2a15e22504aa7f3
--
2.44.0