[PATCH -tip] Revert x86, mem: Optimize memcpy by avoiding memory false dependece

From: Mathieu Desnoyers
Date: Thu Jul 08 2010 - 02:27:41 EST


* Ma, Ling (ling.ma@xxxxxxxxx) wrote:
> Hi Mathieu
>
> I did git revert as bellow
> Revert "x86, mem: Optimize memcpy by avoiding memory false dependece"
>
> This reverts commit 77ff5e49c20bb72a99a12bfb00dbf90b2302f087.
> And I found the machine will crash with your config file
>
> Could you revert the patch : 77ff5e49c20bb72a99a12bfb00dbf90b2302f087 on your
> machine, and check it again ?

Hi Ling,

I just successfully booted my machine 3 times in a row with a revert of commit
a1e5278e40f16a4611264f8da9e557c16cb6f6ed from -tip. This is actually the same as
reverting commit 77ff5e49c20bb72a99a12bfb00dbf90b2302f087, as
a1e5278e40f16a4611264f8da9e557c16cb6f6ed is simply a merge of that commit (the
diffs are exactly the same).

Without this revert, the machine gets to:

Loading Linux 2.6.35-rc4-tip-trace+ ...

and reboots all by itself. I've let it reboot 3 times in a row just to make
sure.

So I am therefore certain that the faulty commit is
"x86, mem: Optimize memcpy by avoiding memory false dependece"
identified by commit id 77ff5e49c20bb72a99a12bfb00dbf90b2302f087, merged by
commit a1e5278e40f16a4611264f8da9e557c16cb6f6ed into -tip.

For the sake of the commit log, the affected machine is an Intel Xeon E5405.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
CC: "Ma, Ling" <ling.ma@xxxxxxxxx>
CC: "mingo@xxxxxxx" <mingo@xxxxxxx>
CC: "H. Peter Anvin" <hpa@xxxxxxxxx>
---
arch/x86/lib/memcpy_64.S | 156 ++++++++++++++++-------------------------------
1 file changed, 54 insertions(+), 102 deletions(-)

Index: linux.trees.git/arch/x86/lib/memcpy_64.S
===================================================================
--- linux.trees.git.orig/arch/x86/lib/memcpy_64.S 2010-07-08 02:11:58.000000000 -0400
+++ linux.trees.git/arch/x86/lib/memcpy_64.S 2010-07-08 02:20:48.000000000 -0400
@@ -40,132 +40,84 @@
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- movq %rdi, %rax

/*
- * Use 32bit CMP here to avoid long NOP padding.
+ * Put the number of full 64-byte blocks into %ecx.
+ * Tail portion is handled at the end:
*/
- cmp $0x20, %edx
- jb .Lhandle_tail
+ movq %rdi, %rax
+ movl %edx, %ecx
+ shrl $6, %ecx
+ jz .Lhandle_tail

+ .p2align 4
+.Lloop_64:
/*
- * We check whether memory false dependece could occur,
- * then jump to corresponding copy mode.
+ * We decrement the loop index here - and the zero-flag is
+ * checked at the end of the loop (instructions inbetween do
+ * not change the zero flag):
*/
- cmp %dil, %sil
- jl .Lcopy_backward
- subl $0x20, %edx
-.Lcopy_forward_loop:
- subq $0x20, %rdx
+ decl %ecx

/*
- * Move in blocks of 4x8 bytes:
+ * Move in blocks of 4x16 bytes:
*/
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq 2*8(%rsi), %r10
- movq 3*8(%rsi), %r11
- leaq 4*8(%rsi), %rsi
-
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, 2*8(%rdi)
- movq %r11, 3*8(%rdi)
- leaq 4*8(%rdi), %rdi
- jae .Lcopy_forward_loop
- addq $0x20, %rdx
- jmp .Lhandle_tail
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r8, 1*8(%rdi)

-.Lcopy_backward:
- /*
- * Calculate copy position to tail.
- */
- addq %rdx, %rsi
- addq %rdx, %rdi
- subq $0x20, %rdx
- /*
- * At most 3 ALU operations in one cycle,
- * so append NOPS in the same 16bytes trunk.
- */
- .p2align 4
-.Lcopy_backward_loop:
- subq $0x20, %rdx
- movq -1*8(%rsi), %r8
- movq -2*8(%rsi), %r9
- movq -3*8(%rsi), %r10
- movq -4*8(%rsi), %r11
- leaq -4*8(%rsi), %rsi
- movq %r8, -1*8(%rdi)
- movq %r9, -2*8(%rdi)
- movq %r10, -3*8(%rdi)
- movq %r11, -4*8(%rdi)
- leaq -4*8(%rdi), %rdi
- jae .Lcopy_backward_loop
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r10
+ movq %r9, 2*8(%rdi)
+ movq %r10, 3*8(%rdi)
+
+ movq 4*8(%rsi), %r11
+ movq 5*8(%rsi), %r8
+ movq %r11, 4*8(%rdi)
+ movq %r8, 5*8(%rdi)
+
+ movq 6*8(%rsi), %r9
+ movq 7*8(%rsi), %r10
+ movq %r9, 6*8(%rdi)
+ movq %r10, 7*8(%rdi)
+
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+
+ jnz .Lloop_64

- /*
- * Calculate copy position to head.
- */
- addq $0x20, %rdx
- subq %rdx, %rsi
- subq %rdx, %rdi
.Lhandle_tail:
- cmpq $16, %rdx
- jb .Lless_16bytes
+ movl %edx, %ecx
+ andl $63, %ecx
+ shrl $3, %ecx
+ jz .Lhandle_7

- /*
- * Move data from 16 bytes to 31 bytes.
- */
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq -2*8(%rsi, %rdx), %r10
- movq -1*8(%rsi, %rdx), %r11
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, -2*8(%rdi, %rdx)
- movq %r11, -1*8(%rdi, %rdx)
- retq
.p2align 4
-.Lless_16bytes:
- cmpq $8, %rdx
- jb .Lless_8bytes
- /*
- * Move data from 8 bytes to 15 bytes.
- */
- movq 0*8(%rsi), %r8
- movq -1*8(%rsi, %rdx), %r9
- movq %r8, 0*8(%rdi)
- movq %r9, -1*8(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_8bytes:
- cmpq $4, %rdx
- jb .Lless_3bytes
+.Lloop_8:
+ decl %ecx
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx, %ecx
+ andl $7, %ecx
+ jz .Lend

- /*
- * Move data from 4 bytes to 7 bytes.
- */
- movl (%rsi), %ecx
- movl -4(%rsi, %rdx), %r8d
- movl %ecx, (%rdi)
- movl %r8d, -4(%rdi, %rdx)
- retq
.p2align 4
-.Lless_3bytes:
- cmpl $0, %edx
- je .Lend
- /*
- * Move data from 1 bytes to 3 bytes.
- */
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
- decl %edx
+ decl %ecx
jnz .Lloop_1

.Lend:
- retq
+ ret
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/