crypto fix for unwinder warnings

From: Josh Poimboeuf
Date: Mon Aug 28 2017 - 18:12:17 EST


Hi Peter,

Can you test whether this patch fixes the unwinder warning you
mentioned on IRC?
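
For context: with CONFIG_FRAME_POINTER enabled, the frame-pointer
unwinder expects %rbp to hold a valid frame pointer for the whole
function.  Several of the x86 crypto asm routines borrow %rbp as a
scratch register, so a stack trace taken from an interrupt or NMI in
that window comes out bogus and triggers the warning.  The patch below
moves each routine off %rbp onto another register (%r12, %r13, %r15,
or a spare temp) and saves/restores that register instead.

Here's a minimal standalone sketch of the pattern, not taken from the
patch itself (the ctx_first_word_* symbol names are made up purely for
illustration):

	.text
	.globl	ctx_first_word_bad
ctx_first_word_bad:
	movq	%rbp, %r11	# stash %rbp in a scratch register ...
	movq	%rdi, %rbp	# ... and borrow it as a pointer temp:
	movl	(%rbp), %eax	# frame-pointer unwinding is unreliable here
	movq	%r11, %rbp	# restore %rbp before returning
	ret

	.globl	ctx_first_word_good
ctx_first_word_good:
	pushq	%r12		# callee-saved, but not the frame pointer
	movq	%rdi, %r12	# use %r12 as the pointer temp instead
	movl	(%r12), %eax
	popq	%r12
	ret

The SHA transforms which previously used a general-purpose register to
save the pre-alignment %rsp now keep that save in %rbp (where it also
serves as a frame pointer), freeing %r12 for their round-constant
table or digest registers.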


diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..344e4b332904 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,8 +33,9 @@
#define s3 ((16 + 2 + (3 * 256)) * 4)

/* register macros */
-#define CTX %rdi
+#define CTX %r12
#define RIO %rsi
+#define TMP %r11

#define RX0 %rax
#define RX1 %rbx
@@ -56,12 +57,12 @@
#define RX2bh %ch
#define RX3bh %dh

-#define RT0 %rbp
+#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

-#define RT0d %ebp
+#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d
@@ -120,13 +121,14 @@

ENTRY(__blowfish_enc_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- movq %rbp, %r11;
+ movq %r12, TMP;

+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;

@@ -142,7 +144,7 @@ ENTRY(__blowfish_enc_blk)
round_enc(14);
add_roundkey_enc(16);

- movq %r11, %rbp;
+ movq TMP, %r12;

movq %r10, RIO;
test %cl, %cl;
@@ -157,12 +159,13 @@ ENDPROC(__blowfish_enc_blk)

ENTRY(blowfish_dec_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- movq %rbp, %r11;
+ movq %r12, TMP;

+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;

@@ -181,7 +184,7 @@ ENTRY(blowfish_dec_blk)
movq %r10, RIO;
write_block();

- movq %r11, %rbp;
+ movq TMP, %r12;

ret;
ENDPROC(blowfish_dec_blk)
@@ -298,18 +301,19 @@ ENDPROC(blowfish_dec_blk)

ENTRY(__blowfish_enc_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
pushq %rcx;

+ movq %rdi, CTX;
preload_roundkey_enc(0);

- movq %rsi, %r11;
+ movq %rsi, TMP;
movq %rdx, RIO;

read_block4();
@@ -324,39 +328,40 @@ ENTRY(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();

- popq %rbp;
- movq %r11, RIO;
+ popq %r12;
+ movq TMP, RIO;

- test %bpl, %bpl;
+ test %r12b, %r12b;
jnz .L__enc_xor4;

write_block4();

popq %rbx;
- popq %rbp;
+ popq %r12;
ret;

.L__enc_xor4:
xor_block4();

popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
ENDPROC(__blowfish_enc_blk_4way)

ENTRY(blowfish_dec_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
- preload_roundkey_dec(17);

- movq %rsi, %r11;
+ movq %rdi, CTX;
+ movq %rsi, TMP;
movq %rdx, RIO;

+ preload_roundkey_dec(17);
read_block4();

round_dec4(17);
@@ -369,11 +374,11 @@ ENTRY(blowfish_dec_blk_4way)
round_dec4(3);
add_preloaded_roundkey4();

- movq %r11, RIO;
+ movq TMP, RIO;
write_block4();

popq %rbx;
- popq %rbp;
+ popq %r12;

ret;
ENDPROC(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
#define RCD1bh %dh

#define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

#define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
* %rdx: src
* %rcx: bool xor
*/
- movq %rbp, RRBP;
+ movq %r12, RR12;

movq %rcx, RXOR;
movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)

enc_outunpack(mov, RT1);

- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;

.L__enc_xor:
enc_outunpack(xor, RT1);

- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(__camellia_enc_blk)

@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
movl $24, RXORd;
cmovel RXORd, RT2d; /* max */

- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;

@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)

dec_outunpack();

- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(camellia_dec_blk)

@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
*/
pushq %rbx;

- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)

enc_outunpack2(mov, RT2);

- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;

.L__enc2_xor:
enc_outunpack2(xor, RT2);

- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;
ENDPROC(__camellia_enc_blk_2way)
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
cmovel RXORd, RT2d; /* max */

movq %rbx, RXOR;
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;

@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)

dec_outunpack2();

- movq RRBP, %rbp;
+ movq RR12, %r12;
movq RXOR, %rbx;
ret;
ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4a8806234ea..86107c961bb4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
16-way AVX cast5
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
@@ -70,8 +70,8 @@

#define RTMP %xmm15

-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi

@@ -226,7 +226,7 @@
.align 16
__cast5_enc_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: blocks 1 and 2
* RR1: blocks 3 and 4
* RL2: blocks 5 and 6
@@ -246,9 +246,11 @@ __cast5_enc_blk16:
* RR4: encrypted blocks 15 and 16
*/

- pushq %rbp;
+ pushq %r15;
pushq %rbx;

+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -283,7 +285,7 @@ __cast5_enc_blk16:

.L__skip_enc:
popq %rbx;
- popq %rbp;
+ popq %r15;

vmovdqa .Lbswap_mask, RKM;

@@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)
.align 16
__cast5_dec_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: encrypted blocks 1 and 2
* RR1: encrypted blocks 3 and 4
* RL2: encrypted blocks 5 and 6
@@ -318,9 +320,11 @@ __cast5_dec_blk16:
* RR4: decrypted blocks 15 and 16
*/

- pushq %rbp;
+ pushq %r15;
pushq %rbx;

+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -356,7 +360,7 @@ __cast5_dec_blk16:

vmovdqa .Lbswap_mask, RKM;
popq %rbx;
- popq %rbp;
+ popq %r15;

outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)

ENTRY(cast5_ecb_enc_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;

vmovdqu (0*4*4)(%rdx), RL1;
@@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/

FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
movq %rsi, %r11;

vmovdqu (0*4*4)(%rdx), RL1;
@@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;

@@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);

+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_cbc_dec_16way)

ENTRY(cast5_ctr_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;

@@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);

+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
8-way AVX cast6
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
@@ -70,8 +70,8 @@

#define RTMP %xmm15

-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi

@@ -264,15 +264,17 @@
.align 8
__cast6_enc_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/

- pushq %rbp;
+ pushq %r15;
pushq %rbx;

+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
QBAR(11);

popq %rbx;
- popq %rbp;
+ popq %r15;

vmovdqa .Lbswap_mask, RKM;

@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/

- pushq %rbp;
+ pushq %r15;
pushq %rbx;

+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
QBAR(0);

popq %rbx;
- popq %rbp;
+ popq %r15;

vmovdqa .Lbswap_mask, RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)

ENTRY(cast6_ecb_enc_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)

store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_enc_8way)

ENTRY(cast6_ecb_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)

store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_dec_8way)

ENTRY(cast6_cbc_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;

@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)

store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
* %rcx: iv (little endian, 128bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;

@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)

store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;

/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_enc_8way)
@@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;

+ movq %rdi, CTX;
movq %rsi, %r11;

/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..d69900d8713b 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
#define RW2bh %ch

#define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
#define RT2 %r14
#define RT3 %rdx

#define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
#define RT2d %r14d
#define RT3d %edx

@@ -177,12 +177,12 @@ ENTRY(des3_ede_x86_64_crypt_blk)
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi;

read_block(%rdx, RL0, RR0);
initial_permutation(RL0, RR0);
@@ -241,6 +241,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
round1(32+15, RL0, RR0, dummy2);

final_permutation(RR0, RL0);
+
+ popq %rsi;
write_block(%rsi, RR0, RL0);

popq %r15;
@@ -248,7 +250,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;

ret;
ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,12 +433,12 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
* %rdx: src (3 blocks)
*/

- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi;

/* load input */
movl 0 * 4(%rdx), RL0d;
@@ -520,6 +521,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR2d;
bswapl RL2d;

+ popq %rsi;
movl RR0d, 0 * 4(%rsi);
movl RL0d, 1 * 4(%rsi);
movl RR1d, 2 * 4(%rsi);
@@ -532,7 +534,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;

ret;
ENDPROC(des3_ede_x86_64_crypt_blk_3way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1eab79c9ac48..2b22b69d5976 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -89,7 +89,7 @@
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
-#define REG_T1 %ebp
+#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
@@ -637,7 +637,7 @@ _loop3:
ENTRY(\name)

push %rbx
- push %rbp
+ push %r11
push %r12
push %r13
push %r14
@@ -673,7 +673,7 @@ _loop3:
pop %r14
pop %r13
pop %r12
- pop %rbp
+ pop %r11
pop %rbx

ret
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
-#define REG_D %ebp
+#define REG_D %r12d
#define REG_E %edx

#define REG_T1 %eax
@@ -74,10 +74,10 @@
ENTRY(\name)

push %rbx
- push %rbp
push %r12
+ push %rbp
+ mov %rsp, %rbp

- mov %rsp, %r12
sub $64, %rsp # allocate workspace
and $~15, %rsp # align stack

@@ -99,10 +99,9 @@
xor %rax, %rax
rep stosq

- mov %r12, %rsp # deallocate workspace
-
- pop %r12
+ mov %rbp, %rsp # deallocate workspace
pop %rbp
+ pop %r12
pop %rbx
ret

diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e08888a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@ SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx

@@ -350,13 +350,13 @@ a = TMP_
ENTRY(sha256_transform_avx)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ movq %rsp, %rbp

- mov %rsp, %r12
subq $STACK_SIZE, %rsp # allocate stack space
and $~15, %rsp # align stack pointer

@@ -452,13 +452,12 @@ loop2:

done_hash:

- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx
ret
ENDPROC(sha256_transform_avx)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..0325b5db3f83 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -99,7 +99,7 @@ e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP


-TBL = %rbp
+TBL = %r12 # clobbered by T1
SRND = CTX # SRND is same register as CTX

a = %eax
@@ -531,7 +531,6 @@ STACK_SIZE = _RSP + _RSP_SIZE
ENTRY(sha256_transform_rorx)
.align 32
pushq %rbx
- pushq %rbp
pushq %r12
pushq %r13
pushq %r14
@@ -568,8 +567,6 @@ ENTRY(sha256_transform_rorx)
mov CTX, _CTX(%rsp)

loop0:
- lea K256(%rip), TBL
-
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -597,18 +594,22 @@ last_block_enter:

.align 16
loop1:
+ lea K256(%rip), TBL
vpaddd 0*32(TBL, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32

+ lea K256(%rip), TBL
vpaddd 1*32(TBL, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32

+ lea K256(%rip), TBL
vpaddd 2*32(TBL, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32

+ lea K256(%rip), TBL
vpaddd 3*32(TBL, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
@@ -619,11 +620,17 @@ loop1:

loop2:
## Do last 16 rounds with no scheduling
+ lea K256(%rip), TBL
vpaddd 0*32(TBL, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
+
+ lea K256(%rip), TBL
vpaddd 1*32(TBL, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+
+ lea K256(%rip), TBL
+
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND

@@ -676,9 +683,6 @@ loop3:
ja done_hash

do_last_block:
- #### do last block
- lea K256(%rip), TBL
-
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -718,7 +722,6 @@ done_hash:
popq %r14
popq %r13
popq %r12
- popq %rbp
popq %rbx
ret
ENDPROC(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 39b83c93e7fd..c6c05ed2c16a 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -95,7 +95,7 @@ SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx

@@ -356,13 +356,13 @@ a = TMP_
ENTRY(sha256_transform_ssse3)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ mov %rsp, %rbp

- mov %rsp, %r12
subq $STACK_SIZE, %rsp
and $~15, %rsp

@@ -462,13 +462,12 @@ loop2:

done_hash:

- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx

ret
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..ed93dabb71bf 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -81,7 +81,7 @@ d = %r8
e = %rdx
y3 = %rsi

-TBL = %rbp
+TBL = %r12 # clobbered by y4

a = %rax
b = %rbx
@@ -101,6 +101,7 @@ y4 = %r12
# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
+TBL_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
RSPSAVE_SIZE = 1*8
@@ -108,7 +109,8 @@ GPRSAVE_SIZE = 6*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
+frame_TBL = frame_SRND + SRND_SIZE
+frame_INP = frame_TBL + TBL_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_RSPSAVE = frame_INPEND + INPEND_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
@@ -601,7 +603,7 @@ ENTRY(sha512_transform_rorx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
- lea K512(%rip), TBL
+ movq $K512, frame_TBL(%rsp)

## byte swap first 16 dwords
COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
@@ -616,39 +618,46 @@ loop0:

.align 16
loop1:
+ mov frame_TBL(%rsp), TBL
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 1*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 2*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 3*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
FOUR_ROUNDS_AND_SCHED

+ addq $(4*32), frame_TBL(%rsp)
subq $1, frame_SRND(%rsp)
jne loop1

movq $2, frame_SRND(%rsp)
loop2:
+ mov frame_TBL(%rsp), TBL
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
+
+ mov frame_TBL(%rsp), TBL
vpaddq 1*32(TBL), Y_1, XFER
vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
DO_4ROUNDS

vmovdqa Y_2, Y_0
vmovdqa Y_3, Y_1

+ add $(2*32), frame_TBL(%rsp)
subq $1, frame_SRND(%rsp)
jne loop2

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index b3f49d286348..73b471da3622 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
#define RT %xmm14
#define RR %xmm15

-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %r13
+#define RID1d %r13d
#define RID2 %rsi
#define RID2d %esi

@@ -259,7 +259,7 @@ __twofish_enc_blk8:

vmovdqu w(CTX), RK1;

- pushq %rbp;
+ pushq %r13;
pushq %rbx;
pushq %rcx;

@@ -282,7 +282,7 @@ __twofish_enc_blk8:

popq %rcx;
popq %rbx;
- popq %rbp;
+ popq %r13;

outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:

vmovdqu (w+4*4)(CTX), RK1;

- pushq %rbp;
+ pushq %r13;
pushq %rbx;

inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
vmovdqu (w)(CTX), RK1;

popq %rbx;
- popq %rbp;
+ popq %r13;

outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);