Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.

From: Michael Ellerman
Date: Wed Aug 30 2023 - 00:38:16 EST


Danny Tsen <dtsen@xxxxxxxxxxxxx> writes:
> Improve AES/XTS performance of the 6-way unrolled path for PowerPC by
> up to 17%, as measured with tcrypt. This is done by using a single
> instruction, vpermxor, to replace the vsldoi and vxor pair in the
> tweak computation.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest. The patch is also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen <dtsen@xxxxxxxxxxxxx>
> ---
> drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
> 1 file changed, 92 insertions(+), 49 deletions(-)
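
Nice speedup. I assume the 17% was measured with the tcrypt speed tests,
i.e. something like:

	modprobe tcrypt mode=200 sec=1

(mode=200 covers the AES speed tests, including xts(aes).)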

That's CRYPTOGAMS code, and is so far largely unchanged from the
original. I see you've sent the same change to openssl, but it's not
merged yet. Please document that in the change log; we want to keep the
code in sync as much as possible, and to document any divergences.

cheers

> diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
> index 50a0a18f35da..f729589d792e 100644
> --- a/drivers/crypto/vmx/aesp8-ppc.pl
> +++ b/drivers/crypto/vmx/aesp8-ppc.pl
> @@ -132,11 +132,12 @@ rcon:
> .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
> .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
> .long 0,0,0,0 ?asis
> +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
> Lconsts:
> mflr r0
> bcl 20,31,\$+4
> mflr $ptr #vvvvv "distance between . and rcon
> - addi $ptr,$ptr,-0x48
> + addi $ptr,$ptr,-0x58
> mtlr r0
> blr
> .long 0
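
The extra 16-byte permute constant grows the rcon table by 0x10, which is
why the Lconsts displacement changes from -0x48 to -0x58; that part checks
out. For anyone else reading along, a scalar model of what vpermxor does
with that constant (assuming the usual big-endian byte numbering for VMX
permutes; vpermxor_model and the variable names are mine, purely for
illustration):

static void vpermxor_model(unsigned char d[16], const unsigned char a[16],
			   const unsigned char b[16], const unsigned char c[16])
{
	int i;

	/* Each control byte selects one byte of a (high nibble) and one
	 * byte of b (low nibble) and XORs them together. */
	for (i = 0; i < 16; i++)
		d[i] = a[c[i] >> 4] ^ b[c[i] & 0xf];
}

With c = {0x0f, 0x10, 0x21, ..., 0xfe}, byte i of the result is
a[i] ^ b[(i + 15) % 16], i.e. a XORed with b rotated by one byte - exactly
the vsldoi/vxor pair the patch removes.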
> @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
> li $x70,0x70
> mtspr 256,r0
>
> + xxlor 2, 32+$eighty7, 32+$eighty7
> + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
> + xxlor 1, 32+$eighty7, 32+$eighty7
> +
> + # Load XOR Lconsts.
> + mr $x70, r6
> + bl Lconsts
> + lxvw4x 0, $x40, r6 # load XOR contents
> + mr r6, $x70
> + li $x70,0x70
> +
> subi $rounds,$rounds,3 # -4 in total
>
> lvx $rndkey0,$x00,$key1 # load key schedule
> @@ -2537,69 +2549,77 @@ Load_xts_enc_key:
> ?vperm v31,v31,$twk5,$keyperm
> lvx v25,$x10,$key_ # pre-load round[2]
>
> + # Switch to the following code, using 0x010101..87, to generate the tweak.
> + # eighty7 = 0x010101..87
> + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
> + # vand tmp, tmp, eighty7 # last byte with carry
> + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
> + # xxlor vsx, 0, 0
> + # vpermxor tweak, tweak, tmp, vsx
> +
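
If I'm reading this right, the sequence is the standard XTS tweak update,
i.e. multiplying the tweak by x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1:
vaddubm doubles each byte, vsrab/vand extracts the per-byte carries masked
to 0x01 (carry into the neighbouring byte) or 0x87 (the reduction), and
vpermxor folds the carries back in rotated by one byte, doing the work of
the old vsldoi+vxor pair in one instruction. A scalar equivalent, using the
little-endian byte order of the IEEE 1619 reference (xts_next_tweak is an
illustrative name, not anything in the tree):

static void xts_next_tweak(unsigned char t[16])
{
	unsigned char carry = t[15] >> 7;	/* bit shifted out of the top */
	int i;

	/* Shift the whole 128-bit value left by one... */
	for (i = 15; i > 0; i--)
		t[i] = (unsigned char)(t[i] << 1) | (t[i - 1] >> 7);
	/* ...and fold the carry back in with the reduction polynomial. */
	t[0] = (unsigned char)(t[0] << 1) ^ (carry ? 0x87 : 0x00);
}

The xxlor 32+$inN, 0, 0 copies before each vpermxor appear to be needed
because the permute constant is parked in VSR 0 and vpermxor wants it in a
VR, so it is staged through whichever $inN register is free at that point;
likewise the vs1/vs2 copies of eighty7 made at the top let the main loop
and the tail code (which still uses the old sequence) each restore the
variant they need. The decrypt path does the same dance.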
> vperm $in0,$inout,$inptail,$inpperm
> subi $inp,$inp,31 # undo "caller"
> vxor $twk0,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vand $tmp,$tmp,$eighty7
> vxor $out0,$in0,$twk0
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in1, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in1
>
> lvx_u $in1,$x10,$inp
> vxor $twk1,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in1,$in1,$in1,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out1,$in1,$twk1
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in2, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in2
>
> lvx_u $in2,$x20,$inp
> andi. $taillen,$len,15
> vxor $twk2,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in2,$in2,$in2,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out2,$in2,$twk2
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in3, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in3
>
> lvx_u $in3,$x30,$inp
> sub $len,$len,$taillen
> vxor $twk3,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in3,$in3,$in3,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out3,$in3,$twk3
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in4, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in4
>
> lvx_u $in4,$x40,$inp
> subi $len,$len,0x60
> vxor $twk4,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in4,$in4,$in4,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out4,$in4,$twk4
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in5, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in5
>
> lvx_u $in5,$x50,$inp
> addi $inp,$inp,0x60
> vxor $twk5,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in5,$in5,$in5,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out5,$in5,$twk5
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in0, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in0
>
> vxor v31,v31,$rndkey0
> mtctr $rounds
> @@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
> lvx v25,$x10,$key_ # round[4]
> bdnz Loop_xts_enc6x
>
> + xxlor 32+$eighty7, 1, 1 # 0x010101..87
> +
> subic $len,$len,96 # $len-=96
> vxor $in0,$twk0,v31 # xor with last round key
> vcipher $out0,$out0,v24
> @@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
> vaddubm $tweak,$tweak,$tweak
> vcipher $out2,$out2,v24
> vcipher $out3,$out3,v24
> - vsldoi $tmp,$tmp,$tmp,15
> vcipher $out4,$out4,v24
> vcipher $out5,$out5,v24
>
> @@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
> vand $tmp,$tmp,$eighty7
> vcipher $out0,$out0,v25
> vcipher $out1,$out1,v25
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in1, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in1
> vcipher $out2,$out2,v25
> vcipher $out3,$out3,v25
> vxor $in1,$twk1,v31
> @@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
>
> and r0,r0,$len
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vcipher $out0,$out0,v26
> vcipher $out1,$out1,v26
> vand $tmp,$tmp,$eighty7
> vcipher $out2,$out2,v26
> vcipher $out3,$out3,v26
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in2, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in2
> vcipher $out4,$out4,v26
> vcipher $out5,$out5,v26
>
> @@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
> vaddubm $tweak,$tweak,$tweak
> vcipher $out0,$out0,v27
> vcipher $out1,$out1,v27
> - vsldoi $tmp,$tmp,$tmp,15
> vcipher $out2,$out2,v27
> vcipher $out3,$out3,v27
> vand $tmp,$tmp,$eighty7
> @@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
> vcipher $out5,$out5,v27
>
> addi $key_,$sp,$FRAME+15 # rewind $key_
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in3, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in3
> vcipher $out0,$out0,v28
> vcipher $out1,$out1,v28
> vxor $in3,$twk3,v31
> @@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
> vcipher $out2,$out2,v28
> vcipher $out3,$out3,v28
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vcipher $out4,$out4,v28
> vcipher $out5,$out5,v28
> lvx v24,$x00,$key_ # re-pre-load round[1]
> @@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
>
> vcipher $out0,$out0,v29
> vcipher $out1,$out1,v29
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in4, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in4
> vcipher $out2,$out2,v29
> vcipher $out3,$out3,v29
> vxor $in4,$twk4,v31
> @@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
> vcipher $out5,$out5,v29
> lvx v25,$x10,$key_ # re-pre-load round[2]
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
>
> vcipher $out0,$out0,v30
> vcipher $out1,$out1,v30
> vand $tmp,$tmp,$eighty7
> vcipher $out2,$out2,v30
> vcipher $out3,$out3,v30
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in5, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in5
> vcipher $out4,$out4,v30
> vcipher $out5,$out5,v30
> vxor $in5,$twk5,v31
> @@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
> vcipherlast $out0,$out0,$in0
> lvx_u $in0,$x00,$inp # load next input block
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vcipherlast $out1,$out1,$in1
> lvx_u $in1,$x10,$inp
> vcipherlast $out2,$out2,$in2
> @@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
> vcipherlast $out4,$out4,$in4
> le?vperm $in2,$in2,$in2,$leperm
> lvx_u $in4,$x40,$inp
> - vxor $tweak,$tweak,$tmp
> + xxlor 10, 32+$in0, 32+$in0
> + xxlor 32+$in0, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in0
> + xxlor 32+$in0, 10, 10
> vcipherlast $tmp,$out5,$in5 # last block might be needed
> # in stealing mode
> le?vperm $in3,$in3,$in3,$leperm
> @@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
> mtctr $rounds
> beq Loop_xts_enc6x # did $len-=96 borrow?
>
> + xxlor 32+$eighty7, 2, 2 # 0x010101..87
> +
> addic. $len,$len,0x60
> beq Lxts_enc6x_zero
> cmpwi $len,0x20
> @@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
> li $x70,0x70
> mtspr 256,r0
>
> + xxlor 2, 32+$eighty7, 32+$eighty7
> + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
> + xxlor 1, 32+$eighty7, 32+$eighty7
> +
> + # Load XOR Lconsts.
> + mr $x70, r6
> + bl Lconsts
> + lxvw4x 0, $x40, r6 # load XOR contents
> + mr r6, $x70
> + li $x70,0x70
> +
> subi $rounds,$rounds,3 # -4 in total
>
> lvx $rndkey0,$x00,$key1 # load key schedule
> @@ -3194,64 +3231,64 @@ Load_xts_dec_key:
> vxor $twk0,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vand $tmp,$tmp,$eighty7
> vxor $out0,$in0,$twk0
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in1, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in1
>
> lvx_u $in1,$x10,$inp
> vxor $twk1,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in1,$in1,$in1,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out1,$in1,$twk1
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in2, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in2
>
> lvx_u $in2,$x20,$inp
> andi. $taillen,$len,15
> vxor $twk2,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in2,$in2,$in2,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out2,$in2,$twk2
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in3, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in3
>
> lvx_u $in3,$x30,$inp
> sub $len,$len,$taillen
> vxor $twk3,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in3,$in3,$in3,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out3,$in3,$twk3
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in4, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in4
>
> lvx_u $in4,$x40,$inp
> subi $len,$len,0x60
> vxor $twk4,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in4,$in4,$in4,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out4,$in4,$twk4
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in5, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in5
>
> lvx_u $in5,$x50,$inp
> addi $inp,$inp,0x60
> vxor $twk5,$tweak,$rndkey0
> vsrab $tmp,$tweak,$seven # next tweak value
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> le?vperm $in5,$in5,$in5,$leperm
> vand $tmp,$tmp,$eighty7
> vxor $out5,$in5,$twk5
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in0, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in0
>
> vxor v31,v31,$rndkey0
> mtctr $rounds
> @@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
> lvx v25,$x10,$key_ # round[4]
> bdnz Loop_xts_dec6x
>
> + xxlor 32+$eighty7, 1, 1 # 0x010101..87
> +
> subic $len,$len,96 # $len-=96
> vxor $in0,$twk0,v31 # xor with last round key
> vncipher $out0,$out0,v24
> @@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
> vaddubm $tweak,$tweak,$tweak
> vncipher $out2,$out2,v24
> vncipher $out3,$out3,v24
> - vsldoi $tmp,$tmp,$tmp,15
> vncipher $out4,$out4,v24
> vncipher $out5,$out5,v24
>
> @@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
> vand $tmp,$tmp,$eighty7
> vncipher $out0,$out0,v25
> vncipher $out1,$out1,v25
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in1, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in1
> vncipher $out2,$out2,v25
> vncipher $out3,$out3,v25
> vxor $in1,$twk1,v31
> @@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
>
> and r0,r0,$len
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vncipher $out0,$out0,v26
> vncipher $out1,$out1,v26
> vand $tmp,$tmp,$eighty7
> vncipher $out2,$out2,v26
> vncipher $out3,$out3,v26
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in2, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in2
> vncipher $out4,$out4,v26
> vncipher $out5,$out5,v26
>
> @@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
> vaddubm $tweak,$tweak,$tweak
> vncipher $out0,$out0,v27
> vncipher $out1,$out1,v27
> - vsldoi $tmp,$tmp,$tmp,15
> vncipher $out2,$out2,v27
> vncipher $out3,$out3,v27
> vand $tmp,$tmp,$eighty7
> @@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
> vncipher $out5,$out5,v27
>
> addi $key_,$sp,$FRAME+15 # rewind $key_
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in3, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in3
> vncipher $out0,$out0,v28
> vncipher $out1,$out1,v28
> vxor $in3,$twk3,v31
> @@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
> vncipher $out2,$out2,v28
> vncipher $out3,$out3,v28
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vncipher $out4,$out4,v28
> vncipher $out5,$out5,v28
> lvx v24,$x00,$key_ # re-pre-load round[1]
> @@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
>
> vncipher $out0,$out0,v29
> vncipher $out1,$out1,v29
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in4, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in4
> vncipher $out2,$out2,v29
> vncipher $out3,$out3,v29
> vxor $in4,$twk4,v31
> @@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
> vncipher $out5,$out5,v29
> lvx v25,$x10,$key_ # re-pre-load round[2]
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
>
> vncipher $out0,$out0,v30
> vncipher $out1,$out1,v30
> vand $tmp,$tmp,$eighty7
> vncipher $out2,$out2,v30
> vncipher $out3,$out3,v30
> - vxor $tweak,$tweak,$tmp
> + xxlor 32+$in5, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in5
> vncipher $out4,$out4,v30
> vncipher $out5,$out5,v30
> vxor $in5,$twk5,v31
> @@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
> vncipherlast $out0,$out0,$in0
> lvx_u $in0,$x00,$inp # load next input block
> vaddubm $tweak,$tweak,$tweak
> - vsldoi $tmp,$tmp,$tmp,15
> vncipherlast $out1,$out1,$in1
> lvx_u $in1,$x10,$inp
> vncipherlast $out2,$out2,$in2
> @@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
> vncipherlast $out4,$out4,$in4
> le?vperm $in2,$in2,$in2,$leperm
> lvx_u $in4,$x40,$inp
> - vxor $tweak,$tweak,$tmp
> + xxlor 10, 32+$in0, 32+$in0
> + xxlor 32+$in0, 0, 0
> + vpermxor $tweak, $tweak, $tmp, $in0
> + xxlor 32+$in0, 10, 10
> vncipherlast $out5,$out5,$in5
> le?vperm $in3,$in3,$in3,$leperm
> lvx_u $in5,$x50,$inp
> @@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
> mtctr $rounds
> beq Loop_xts_dec6x # did $len-=96 borrow?
>
> + xxlor 32+$eighty7, 2, 2 # 0x010101..87
> +
> addic. $len,$len,0x60
> beq Lxts_dec6x_zero
> cmpwi $len,0x20
> --
> 2.31.1