[PATCH] powerpc32: use stmw/lmw for non-volatile registers save/restore

From: Christophe Leroy
Date: Mon May 23 2016 - 04:46:43 EST


lmw/stmw cost one extra cycle (two for lmw on some ppc) and imply
serialising, but they reduce the number of instructions, and hence the
amount of instruction fetching, compared to the equivalent sequence of
individual lwz/stw. That means less pressure on the instruction cache
and fewer fetch delays on slow memory. When we transfer 20 registers,
it is worth it.
gcc uses stmw/lmw at function entry/exit to save/restore the
non-volatile registers, so let's do it the same way.
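
To illustrate (expansion sketch only, not part of the generated diff):
on ppc32, SAVE_NVGPRS(base) currently expands to 19 individual stores
and becomes a single store-multiple:

	/* before: one stw per non-volatile register, r13..r31 */
	stw	13, GPR0+4*13(base)
	stw	14, GPR0+4*14(base)
	...
	stw	31, GPR0+4*31(base)

	/* after: one stmw covering r13..r31 */
	stmw	13, GPR0+4*13(base)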

On powerpc64 we can't use lmw/stmw as they only handle 32-bit words, so
move longjmp() and setjmp() from misc.S to misc_64.S, and add a 32-bit
version in misc_32.S that uses stmw/lmw.
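
For reference, the jmp_buf layout is unchanged (sketch below, derived
from the code, not part of the diff); the 32-bit setjmp just fills the
CR/r13-r31 part with a single stmw instead of 20 individual stw:

	/* offset  0: LR (return address)	*/
	/* offset  4: r1 (stack pointer)	*/
	/* offset  8: r2			*/
	/* offset 12: CR (copied via r12)	*/
	/* offset 16: r13..r31			*/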

Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxx>
---
This patch goes on top of "powerpc: inline current_stack_pointer()", or
requires a trivial manual merge in arch/powerpc/kernel/misc.S.

arch/powerpc/include/asm/ppc_asm.h | 6 ++--
arch/powerpc/kernel/misc.S | 61 --------------------------------------
arch/powerpc/kernel/misc_32.S | 22 ++++++++++++++
arch/powerpc/kernel/misc_64.S | 61 ++++++++++++++++++++++++++++++++++++++
4 files changed, 85 insertions(+), 65 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 2b31632..e29b649 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -82,10 +82,8 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
#else
#define SAVE_GPR(n, base) stw n,GPR0+4*(n)(base)
#define REST_GPR(n, base) lwz n,GPR0+4*(n)(base)
-#define SAVE_NVGPRS(base) SAVE_GPR(13, base); SAVE_8GPRS(14, base); \
- SAVE_10GPRS(22, base)
-#define REST_NVGPRS(base) REST_GPR(13, base); REST_8GPRS(14, base); \
- REST_10GPRS(22, base)
+#define SAVE_NVGPRS(base) stmw 13, GPR0+4*13(base)
+#define REST_NVGPRS(base) lmw 13, GPR0+4*13(base)
#endif

#define SAVE_2GPRS(n, base) SAVE_GPR(n, base); SAVE_GPR(n+1, base)
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 7ce26d4..9de71d8 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -53,64 +53,3 @@ _GLOBAL(add_reloc_offset)

.align 3
2: PPC_LONG 1b
-
-_GLOBAL(setjmp)
- mflr r0
- PPC_STL r0,0(r3)
- PPC_STL r1,SZL(r3)
- PPC_STL r2,2*SZL(r3)
- mfcr r0
- PPC_STL r0,3*SZL(r3)
- PPC_STL r13,4*SZL(r3)
- PPC_STL r14,5*SZL(r3)
- PPC_STL r15,6*SZL(r3)
- PPC_STL r16,7*SZL(r3)
- PPC_STL r17,8*SZL(r3)
- PPC_STL r18,9*SZL(r3)
- PPC_STL r19,10*SZL(r3)
- PPC_STL r20,11*SZL(r3)
- PPC_STL r21,12*SZL(r3)
- PPC_STL r22,13*SZL(r3)
- PPC_STL r23,14*SZL(r3)
- PPC_STL r24,15*SZL(r3)
- PPC_STL r25,16*SZL(r3)
- PPC_STL r26,17*SZL(r3)
- PPC_STL r27,18*SZL(r3)
- PPC_STL r28,19*SZL(r3)
- PPC_STL r29,20*SZL(r3)
- PPC_STL r30,21*SZL(r3)
- PPC_STL r31,22*SZL(r3)
- li r3,0
- blr
-
-_GLOBAL(longjmp)
- PPC_LCMPI r4,0
- bne 1f
- li r4,1
-1: PPC_LL r13,4*SZL(r3)
- PPC_LL r14,5*SZL(r3)
- PPC_LL r15,6*SZL(r3)
- PPC_LL r16,7*SZL(r3)
- PPC_LL r17,8*SZL(r3)
- PPC_LL r18,9*SZL(r3)
- PPC_LL r19,10*SZL(r3)
- PPC_LL r20,11*SZL(r3)
- PPC_LL r21,12*SZL(r3)
- PPC_LL r22,13*SZL(r3)
- PPC_LL r23,14*SZL(r3)
- PPC_LL r24,15*SZL(r3)
- PPC_LL r25,16*SZL(r3)
- PPC_LL r26,17*SZL(r3)
- PPC_LL r27,18*SZL(r3)
- PPC_LL r28,19*SZL(r3)
- PPC_LL r29,20*SZL(r3)
- PPC_LL r30,21*SZL(r3)
- PPC_LL r31,22*SZL(r3)
- PPC_LL r0,3*SZL(r3)
- mtcrf 0x38,r0
- PPC_LL r0,0(r3)
- PPC_LL r1,SZL(r3)
- PPC_LL r2,2*SZL(r3)
- mtlr r0
- mr r3,r4
- blr
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index d9c912b..de419e9 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -1086,3 +1086,25 @@ relocate_new_kernel_end:
relocate_new_kernel_size:
.long relocate_new_kernel_end - relocate_new_kernel
#endif
+
+_GLOBAL(setjmp)
+ mflr r0
+ stw r0, 0(r3)
+ stw r1, 4(r3)
+ stw r2, 8(r3)
+ mfcr r12
+ stmw r12, 12(r3)
+ li r3, 0
+ blr
+
+_GLOBAL(longjmp)
+ lwz r0, 0(r3)
+ lwz r1, 4(r3)
+ lwz r2, 8(r3)
+ lmw r12, 12(r3)
+ mtcrf 0x38, r12
+ mtlr r0
+ mr. r3, r4
+ bnelr
+ li r3, 1
+ blr
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index f28754c..7e25249 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -701,3 +701,64 @@ _GLOBAL(kexec_sequence)
li r5,0
blr /* image->start(physid, image->start, 0); */
#endif /* CONFIG_KEXEC */
+
+_GLOBAL(setjmp)
+ mflr r0
+ PPC_STL r0,0(r3)
+ PPC_STL r1,SZL(r3)
+ PPC_STL r2,2*SZL(r3)
+ mfcr r0
+ PPC_STL r0,3*SZL(r3)
+ PPC_STL r13,4*SZL(r3)
+ PPC_STL r14,5*SZL(r3)
+ PPC_STL r15,6*SZL(r3)
+ PPC_STL r16,7*SZL(r3)
+ PPC_STL r17,8*SZL(r3)
+ PPC_STL r18,9*SZL(r3)
+ PPC_STL r19,10*SZL(r3)
+ PPC_STL r20,11*SZL(r3)
+ PPC_STL r21,12*SZL(r3)
+ PPC_STL r22,13*SZL(r3)
+ PPC_STL r23,14*SZL(r3)
+ PPC_STL r24,15*SZL(r3)
+ PPC_STL r25,16*SZL(r3)
+ PPC_STL r26,17*SZL(r3)
+ PPC_STL r27,18*SZL(r3)
+ PPC_STL r28,19*SZL(r3)
+ PPC_STL r29,20*SZL(r3)
+ PPC_STL r30,21*SZL(r3)
+ PPC_STL r31,22*SZL(r3)
+ li r3,0
+ blr
+
+_GLOBAL(longjmp)
+ PPC_LCMPI r4,0
+ bne 1f
+ li r4,1
+1: PPC_LL r13,4*SZL(r3)
+ PPC_LL r14,5*SZL(r3)
+ PPC_LL r15,6*SZL(r3)
+ PPC_LL r16,7*SZL(r3)
+ PPC_LL r17,8*SZL(r3)
+ PPC_LL r18,9*SZL(r3)
+ PPC_LL r19,10*SZL(r3)
+ PPC_LL r20,11*SZL(r3)
+ PPC_LL r21,12*SZL(r3)
+ PPC_LL r22,13*SZL(r3)
+ PPC_LL r23,14*SZL(r3)
+ PPC_LL r24,15*SZL(r3)
+ PPC_LL r25,16*SZL(r3)
+ PPC_LL r26,17*SZL(r3)
+ PPC_LL r27,18*SZL(r3)
+ PPC_LL r28,19*SZL(r3)
+ PPC_LL r29,20*SZL(r3)
+ PPC_LL r30,21*SZL(r3)
+ PPC_LL r31,22*SZL(r3)
+ PPC_LL r0,3*SZL(r3)
+ mtcrf 0x38,r0
+ PPC_LL r0,0(r3)
+ PPC_LL r1,SZL(r3)
+ PPC_LL r2,2*SZL(r3)
+ mtlr r0
+ mr r3,r4
+ blr
--
2.1.0