Re: [PATCH] crypto: x86/aesni-xts - deduplicate aesni_xts_enc() and aesni_xts_dec()

From: Ard Biesheuvel
Date: Sat Apr 13 2024 - 04:54:23 EST


On Sat, 13 Apr 2024 at 02:10, Eric Biggers <ebiggers@xxxxxxxxxx> wrote:
>
> From: Eric Biggers <ebiggers@xxxxxxxxxx>
>
> Since aesni_xts_enc() and aesni_xts_dec() are very similar, generate
> them from a macro that's passed an argument enc=1 or enc=0. This
> reduces the length of aesni-intel_asm.S by 112 lines while still
> producing the exact same object file in both 32-bit and 64-bit mode.
>
> Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
> ---
> arch/x86/crypto/aesni-intel_asm.S | 270 +++++++++---------------------
> 1 file changed, 79 insertions(+), 191 deletions(-)
>

Reviewed-by: Ard Biesheuvel <ardb@xxxxxxxxxx>

> diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
> index 1cb55eea2efa..3a3e46188dec 100644
> --- a/arch/x86/crypto/aesni-intel_asm.S
> +++ b/arch/x86/crypto/aesni-intel_asm.S
> @@ -2823,32 +2823,28 @@ SYM_FUNC_END(aesni_ctr_enc)
> .Lgf128mul_x_ble_mask:
> .octa 0x00000000000000010000000000000087
> .previous
>
> /*
> - * _aesni_gf128mul_x_ble: internal ABI
> - * Multiply in GF(2^128) for XTS IVs
> + * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
> * input:
> * IV: current IV
> * GF128MUL_MASK == mask with 0x87 and 0x01
> * output:
> * IV: next IV
> * changed:
> - * CTR: == temporary value
> + * KEY: == temporary value
> */
> -#define _aesni_gf128mul_x_ble() \
> - pshufd $0x13, IV, KEY; \
> - paddq IV, IV; \
> - psrad $31, KEY; \
> - pand GF128MUL_MASK, KEY; \
> - pxor KEY, IV;
> +.macro _aesni_gf128mul_x_ble
> + pshufd $0x13, IV, KEY
> + paddq IV, IV
> + psrad $31, KEY
> + pand GF128MUL_MASK, KEY
> + pxor KEY, IV
> +.endm
>
> -/*
> - * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
> - * const u8 *src, unsigned int len, le128 *iv)
> - */
> -SYM_FUNC_START(aesni_xts_enc)
> +.macro _aesni_xts_crypt enc
> FRAME_BEGIN
> #ifndef __x86_64__
> pushl IVP
> pushl LEN
> pushl KEYP
> @@ -2863,39 +2859,50 @@ SYM_FUNC_START(aesni_xts_enc)
> movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
> #endif
> movups (IVP), IV
>
> mov 480(KEYP), KLEN
> +.if !\enc
> + add $240, KEYP
> +
> + test $15, LEN
> + jz .Lxts_loop4\@
> + sub $16, LEN
> +.endif
>
> -.Lxts_enc_loop4:
> +.Lxts_loop4\@:
> sub $64, LEN
> - jl .Lxts_enc_1x
> + jl .Lxts_1x\@
>
> movdqa IV, STATE1
> movdqu 0x00(INP), IN
> pxor IN, STATE1
> movdqu IV, 0x00(OUTP)
>
> - _aesni_gf128mul_x_ble()
> + _aesni_gf128mul_x_ble
> movdqa IV, STATE2
> movdqu 0x10(INP), IN
> pxor IN, STATE2
> movdqu IV, 0x10(OUTP)
>
> - _aesni_gf128mul_x_ble()
> + _aesni_gf128mul_x_ble
> movdqa IV, STATE3
> movdqu 0x20(INP), IN
> pxor IN, STATE3
> movdqu IV, 0x20(OUTP)
>
> - _aesni_gf128mul_x_ble()
> + _aesni_gf128mul_x_ble
> movdqa IV, STATE4
> movdqu 0x30(INP), IN
> pxor IN, STATE4
> movdqu IV, 0x30(OUTP)
>
> +.if \enc
> call _aesni_enc4
> +.else
> + call _aesni_dec4
> +.endif
>
> movdqu 0x00(OUTP), IN
> pxor IN, STATE1
> movdqu STATE1, 0x00(OUTP)
>
> @@ -2909,63 +2916,84 @@ SYM_FUNC_START(aesni_xts_enc)
>
> movdqu 0x30(OUTP), IN
> pxor IN, STATE4
> movdqu STATE4, 0x30(OUTP)
>
> - _aesni_gf128mul_x_ble()
> + _aesni_gf128mul_x_ble
>
> add $64, INP
> add $64, OUTP
> test LEN, LEN
> - jnz .Lxts_enc_loop4
> + jnz .Lxts_loop4\@
>
> -.Lxts_enc_ret_iv:
> +.Lxts_ret_iv\@:
> movups IV, (IVP)
>
> -.Lxts_enc_ret:
> +.Lxts_ret\@:
> #ifndef __x86_64__
> popl KLEN
> popl KEYP
> popl LEN
> popl IVP
> #endif
> FRAME_END
> RET
>
> -.Lxts_enc_1x:
> +.Lxts_1x\@:
> add $64, LEN
> - jz .Lxts_enc_ret_iv
> + jz .Lxts_ret_iv\@
> +.if \enc
> sub $16, LEN
> - jl .Lxts_enc_cts4
> + jl .Lxts_cts4\@
> +.endif
>
> -.Lxts_enc_loop1:
> +.Lxts_loop1\@:
> movdqu (INP), STATE
> +.if \enc
> pxor IV, STATE
> call _aesni_enc1
> +.else
> + add $16, INP
> + sub $16, LEN
> + jl .Lxts_cts1\@
> pxor IV, STATE
> - _aesni_gf128mul_x_ble()
> + call _aesni_dec1
> +.endif
> + pxor IV, STATE
> + _aesni_gf128mul_x_ble
>
> test LEN, LEN
> - jz .Lxts_enc_out
> + jz .Lxts_out\@
>
> +.if \enc
> add $16, INP
> sub $16, LEN
> - jl .Lxts_enc_cts1
> + jl .Lxts_cts1\@
> +.endif
>
> movdqu STATE, (OUTP)
> add $16, OUTP
> - jmp .Lxts_enc_loop1
> + jmp .Lxts_loop1\@
>
> -.Lxts_enc_out:
> +.Lxts_out\@:
> movdqu STATE, (OUTP)
> - jmp .Lxts_enc_ret_iv
> + jmp .Lxts_ret_iv\@
>
> -.Lxts_enc_cts4:
> +.if \enc
> +.Lxts_cts4\@:
> movdqa STATE4, STATE
> sub $16, OUTP
> +.Lxts_cts1\@:
> +.else
> +.Lxts_cts1\@:
> + movdqa IV, STATE4
> + _aesni_gf128mul_x_ble
>
> -.Lxts_enc_cts1:
> + pxor IV, STATE
> + call _aesni_dec1
> + pxor IV, STATE
> +.endif
> #ifndef __x86_64__
> lea .Lcts_permute_table, T1
> #else
> lea .Lcts_permute_table(%rip), T1
> #endif
> @@ -2987,174 +3015,34 @@ SYM_FUNC_START(aesni_xts_enc)
> movups (IVP), %xmm0
> pshufb %xmm0, IN1
> pblendvb IN2, IN1
> movaps IN1, STATE
>
> +.if \enc
> pxor IV, STATE
> call _aesni_enc1
> pxor IV, STATE
> +.else
> + pxor STATE4, STATE
> + call _aesni_dec1
> + pxor STATE4, STATE
> +.endif
>
> movups STATE, (OUTP)
> - jmp .Lxts_enc_ret
> + jmp .Lxts_ret\@
> +.endm
> +
> +/*
> + * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
> + * const u8 *src, unsigned int len, le128 *iv)
> + */
> +SYM_FUNC_START(aesni_xts_enc)
> + _aesni_xts_crypt 1
> SYM_FUNC_END(aesni_xts_enc)
>
> /*
> * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
> * const u8 *src, unsigned int len, le128 *iv)
> */
> SYM_FUNC_START(aesni_xts_dec)
> - FRAME_BEGIN
> -#ifndef __x86_64__
> - pushl IVP
> - pushl LEN
> - pushl KEYP
> - pushl KLEN
> - movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
> - movl (FRAME_OFFSET+24)(%esp), OUTP # dst
> - movl (FRAME_OFFSET+28)(%esp), INP # src
> - movl (FRAME_OFFSET+32)(%esp), LEN # len
> - movl (FRAME_OFFSET+36)(%esp), IVP # iv
> - movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
> -#else
> - movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
> -#endif
> - movups (IVP), IV
> -
> - mov 480(KEYP), KLEN
> - add $240, KEYP
> -
> - test $15, LEN
> - jz .Lxts_dec_loop4
> - sub $16, LEN
> -
> -.Lxts_dec_loop4:
> - sub $64, LEN
> - jl .Lxts_dec_1x
> -
> - movdqa IV, STATE1
> - movdqu 0x00(INP), IN
> - pxor IN, STATE1
> - movdqu IV, 0x00(OUTP)
> -
> - _aesni_gf128mul_x_ble()
> - movdqa IV, STATE2
> - movdqu 0x10(INP), IN
> - pxor IN, STATE2
> - movdqu IV, 0x10(OUTP)
> -
> - _aesni_gf128mul_x_ble()
> - movdqa IV, STATE3
> - movdqu 0x20(INP), IN
> - pxor IN, STATE3
> - movdqu IV, 0x20(OUTP)
> -
> - _aesni_gf128mul_x_ble()
> - movdqa IV, STATE4
> - movdqu 0x30(INP), IN
> - pxor IN, STATE4
> - movdqu IV, 0x30(OUTP)
> -
> - call _aesni_dec4
> -
> - movdqu 0x00(OUTP), IN
> - pxor IN, STATE1
> - movdqu STATE1, 0x00(OUTP)
> -
> - movdqu 0x10(OUTP), IN
> - pxor IN, STATE2
> - movdqu STATE2, 0x10(OUTP)
> -
> - movdqu 0x20(OUTP), IN
> - pxor IN, STATE3
> - movdqu STATE3, 0x20(OUTP)
> -
> - movdqu 0x30(OUTP), IN
> - pxor IN, STATE4
> - movdqu STATE4, 0x30(OUTP)
> -
> - _aesni_gf128mul_x_ble()
> -
> - add $64, INP
> - add $64, OUTP
> - test LEN, LEN
> - jnz .Lxts_dec_loop4
> -
> -.Lxts_dec_ret_iv:
> - movups IV, (IVP)
> -
> -.Lxts_dec_ret:
> -#ifndef __x86_64__
> - popl KLEN
> - popl KEYP
> - popl LEN
> - popl IVP
> -#endif
> - FRAME_END
> - RET
> -
> -.Lxts_dec_1x:
> - add $64, LEN
> - jz .Lxts_dec_ret_iv
> -
> -.Lxts_dec_loop1:
> - movdqu (INP), STATE
> -
> - add $16, INP
> - sub $16, LEN
> - jl .Lxts_dec_cts1
> -
> - pxor IV, STATE
> - call _aesni_dec1
> - pxor IV, STATE
> - _aesni_gf128mul_x_ble()
> -
> - test LEN, LEN
> - jz .Lxts_dec_out
> -
> - movdqu STATE, (OUTP)
> - add $16, OUTP
> - jmp .Lxts_dec_loop1
> -
> -.Lxts_dec_out:
> - movdqu STATE, (OUTP)
> - jmp .Lxts_dec_ret_iv
> -
> -.Lxts_dec_cts1:
> - movdqa IV, STATE4
> - _aesni_gf128mul_x_ble()
> -
> - pxor IV, STATE
> - call _aesni_dec1
> - pxor IV, STATE
> -
> -#ifndef __x86_64__
> - lea .Lcts_permute_table, T1
> -#else
> - lea .Lcts_permute_table(%rip), T1
> -#endif
> - add LEN, INP /* rewind input pointer */
> - add $16, LEN /* # bytes in final block */
> - movups (INP), IN1
> -
> - mov T1, IVP
> - add $32, IVP
> - add LEN, T1
> - sub LEN, IVP
> - add OUTP, LEN
> -
> - movups (T1), %xmm4
> - movaps STATE, IN2
> - pshufb %xmm4, STATE
> - movups STATE, (LEN)
> -
> - movups (IVP), %xmm0
> - pshufb %xmm0, IN1
> - pblendvb IN2, IN1
> - movaps IN1, STATE
> -
> - pxor STATE4, STATE
> - call _aesni_dec1
> - pxor STATE4, STATE
> -
> - movups STATE, (OUTP)
> - jmp .Lxts_dec_ret
> + _aesni_xts_crypt 0
> SYM_FUNC_END(aesni_xts_dec)
>
> base-commit: 751fb2528c12ef64d1e863efb196cdc968b384f6
> --
> 2.44.0
>
>