[PATCH 3/3] crypto: x86/aes-xts - optimize size of instructions operating on lengths

From: Eric Biggers
Date: Fri Apr 12 2024 - 23:21:41 EST


From: Eric Biggers <ebiggers@xxxxxxxxxx>

x86_64 has the "interesting" property that the instruction size is
generally a bit shorter for instructions that operate on the 32-bit (or
less) part of registers, or registers that are in the original set of 8.

This patch adjusts the AES-XTS code to take advantage of that property
by changing the LEN parameter from size_t to unsigned int (which is all
that's needed and is what the non-AVX implementation uses) and using the
%eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to
make the AVX10/256 code really work on CPUs that don't support 512-bit
vectors (since the AVX10 spec says that 64-bit opmask instructions will
only be supported on processors that support 512-bit vectors).

Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
---
arch/x86/crypto/aes-xts-avx-x86_64.S | 40 +++++++++++++++-------------
arch/x86/crypto/aesni-intel_glue.c | 18 ++++++-------
2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 802d3b90d337..48f97b79f7a9 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -83,18 +83,20 @@
// Function parameters
.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
// advanced to point to 7th-from-last round key
.set SRC, %rsi // Pointer to next source data
.set DST, %rdx // Pointer to next destination data
-.set LEN, %rcx // Remaining length in bytes
+.set LEN, %ecx // Remaining length in bytes
+.set LEN8, %cl
+.set LEN64, %rcx
.set TWEAK, %r8 // Pointer to next tweak

-// %r9 holds the AES key length in bytes.
-.set KEYLEN, %r9d
-.set KEYLEN64, %r9
+// %rax holds the AES key length in bytes.
+.set KEYLEN, %eax
+.set KEYLEN64, %rax

-// %rax and %r10-r11 are available as temporaries.
+// %r9-r11 are available as temporaries.

.macro _define_Vi i
.if VL == 16
.set V\i, %xmm\i
.elseif VL == 32
@@ -563,13 +565,13 @@
// When decrypting a message whose length isn't a multiple of the AES
// block length, exclude the last full block from the main loop by
// subtracting 16 from LEN. This is needed because ciphertext stealing
// decryption uses the last two tweaks in reverse order. We'll handle
// the last full block and the partial block specially at the end.
- lea -16(LEN), %rax
- test $15, LEN
- cmovnz %rax, LEN
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
.endif

// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN

@@ -648,11 +650,11 @@
jge .Lmain_loop\@

// Check for the uncommon case where the data length isn't a multiple of
// 4*VL. Handle it out-of-line in order to optimize for the common
// case. In the common case, just fall through to the ret.
- test $4*VL-1, LEN
+ test $4*VL-1, LEN8
jnz .Lhandle_remainder\@
.Ldone\@:
// Store the next tweak back to *TWEAK to support continuation calls.
vmovdqu TWEAK0_XMM, (TWEAK)
.if VL > 16
@@ -716,39 +718,39 @@
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
.endif

.if USE_AVX10
// Create a mask that has the first LEN bits set.
- mov $-1, %rax
- bzhi LEN, %rax, %rax
- kmovq %rax, %k1
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1

// Swap the first LEN bytes of the en/decryption of the last full block
// with the partial block. Note that to support in-place en/decryption,
// the load from the src partial block must happen before the store to
// the dst partial block.
vmovdqa %xmm0, %xmm1
vmovdqu8 16(SRC), %xmm0{%k1}
vmovdqu8 %xmm1, 16(DST){%k1}
.else
- lea .Lcts_permute_table(%rip), %rax
+ lea .Lcts_permute_table(%rip), %r9

// Load the src partial block, left-aligned. Note that to support
// in-place en/decryption, this must happen before the store to the dst
// partial block.
- vmovdqu (SRC, LEN, 1), %xmm1
+ vmovdqu (SRC, LEN64, 1), %xmm1

// Shift the first LEN bytes of the en/decryption of the last full block
// to the end of a register, then store it to DST+LEN. This stores the
// dst partial block. It also writes to the second part of the dst last
// full block, but that part is overwritten later.
- vpshufb (%rax, LEN, 1), %xmm0, %xmm2
- vmovdqu %xmm2, (DST, LEN, 1)
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)

// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
- sub LEN, %rax
- vmovdqu 32(%rax), %xmm3
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3

// Shift the src partial block to the beginning of its register.
vpshufb %xmm3, %xmm1, %xmm1

// Do a blend to generate the src partial block followed by the second
@@ -793,11 +795,11 @@ SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-// const u8 *src, u8 *dst, size_t len,
+// const u8 *src, u8 *dst, unsigned int len,
// u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e7d21000cb05..110b3282a1f2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -897,11 +897,11 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
}

typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE]);

/* This handles cases where the source and/or destination span pages. */
static noinline int
xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
@@ -1019,18 +1019,18 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
{
aesni_enc(tweak_key, iv, iv);
}

static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_enc(key, dst, src, len, tweak);
}

static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_dec(key, dst, src, len, tweak);
}

@@ -1183,16 +1183,16 @@ static struct simd_skcipher_alg *aesni_simd_xctr;
asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);

#define DEFINE_XTS_ALG(suffix, driver_name, priority) \
\
-asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, \
- const u8 *src, u8 *dst, size_t len, \
- u8 tweak[AES_BLOCK_SIZE]); \
-asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, \
- const u8 *src, u8 *dst, size_t len, \
- u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
\
static int xts_encrypt_##suffix(struct skcipher_request *req) \
{ \
return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
} \
--
2.44.0