[PATCH 11/12] x86/crypto: Fix RBP usage in sha512-avx2-asm.S

From: Josh Poimboeuf
Date: Tue Aug 29 2017 - 14:07:12 EST


Using RBP as a temporary register breaks the frame pointer convention and
therefore breaks stack traces when unwinding from an interrupt in the
crypto code.
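
For reference, the convention the unwinder depends on is the standard
frame pointer prologue/epilogue, where %rbp always points at a saved-%rbp
/ return-address pair; a minimal sketch:

	push	%rbp			# save the caller's frame pointer
	mov	%rsp, %rbp		# %rbp -> saved %rbp; return address at 8(%rbp)
	...
	pop	%rbp			# restore before returning

If an interrupt hits while %rbp instead holds a pointer into the K512
table, the unwinder follows a bogus frame chain and the resulting stack
trace is garbage.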

Use R12 instead of RBP for the TBL register. Since R12 is also used as
another temporary register (T1), it gets clobbered in each round of
computation, so the TBL value needs to be freshly reloaded into R12 each
time it's used. And since the TBL value itself is advanced as the loops
walk through the K512 constants, the up-to-date copy is kept on the stack
at the new frame_TBL offset.
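
The resulting pattern (illustrative, using the register and stack-slot
names from this patch) is a spill at the top of loop0 followed by a
reload right before each use:

	movq	$K512, frame_TBL(%rsp)	# spill the table address once per block
	...
	mov	frame_TBL(%rsp), TBL	# reload into %r12 right before use
	vpaddq	(TBL), Y_0, XFER	# TBL still holds the table pointer here
	FOUR_ROUNDS_AND_SCHED		# clobbers %r12 (T1); TBL is stale afterwards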

Also remove the unused y4 variable.

Reported-by: Eric Biggers <ebiggers3@xxxxxxxxx>
Reported-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
---
arch/x86/crypto/sha512-avx2-asm.S | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..37cfc2004abd 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -81,7 +81,7 @@ d = %r8
e = %rdx
y3 = %rsi

-TBL = %rbp
+TBL = %r12 # clobbered by T1

a = %rax
b = %rbx
@@ -96,11 +96,10 @@ y0 = %r13
y1 = %r14
y2 = %r15

-y4 = %r12
-
# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
+TBL_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
RSPSAVE_SIZE = 1*8
@@ -108,7 +107,8 @@ GPRSAVE_SIZE = 6*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
+frame_TBL = frame_SRND + SRND_SIZE
+frame_INP = frame_TBL + TBL_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_RSPSAVE = frame_INPEND + INPEND_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
@@ -601,7 +601,7 @@ ENTRY(sha512_transform_rorx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
- lea K512(%rip), TBL
+ movq $K512, frame_TBL(%rsp)

## byte swap first 16 dwords
COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
@@ -616,39 +616,46 @@ loop0:

.align 16
loop1:
+ mov frame_TBL(%rsp), TBL
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 1*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 2*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

+ mov frame_TBL(%rsp), TBL
vpaddq 3*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
FOUR_ROUNDS_AND_SCHED

+ addq $(4*32), frame_TBL(%rsp)
subq $1, frame_SRND(%rsp)
jne loop1

movq $2, frame_SRND(%rsp)
loop2:
+ mov frame_TBL(%rsp), TBL
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
+
+ mov frame_TBL(%rsp), TBL
vpaddq 1*32(TBL), Y_1, XFER
vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
DO_4ROUNDS

vmovdqa Y_2, Y_0
vmovdqa Y_3, Y_1

+ add $(2*32), frame_TBL(%rsp)
subq $1, frame_SRND(%rsp)
jne loop2

--
2.13.5