[RFC PATCH 2] [X86/mem] Handle unaligned case by avoiding store crossing cache line
From: ling . ma
Date:  Thu Oct 14 2010 - 15:42:32 EST
From: Ma Ling <ling.ma@xxxxxxxxx>
In this patch we reduce the penalty of crossing a cache line
on some CPU architectures. There are two cache-line-crossing cases,
read and write, but write is the more expensive one because of
missing cache-way prediction and read-for-ownership operations
on some architectures, so here we avoid unaligned stores.
Another reason is that shifting registers would cause more penalty
in the decode stages, so we tolerate unaligned reads.
Signed-off-by: Ma Ling <ling.ma@xxxxxxxxx>
---
In this version we add detailed comments.
 arch/x86/lib/memcpy_64.S |   67 ++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..9ec6694 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -43,9 +43,9 @@ ENTRY(memcpy)
 	movq %rdi, %rax
 
 	/*
-	 * Use 32bit CMP here to avoid long NOP padding.
+	 * Use 64bit CMP here so that .Lcopy_forward_loop is 16-byte aligned.
 	 */
-	cmp  $0x20, %edx
+	cmp  $0x28, %rdx
 	jb .Lhandle_tail
 
 	/*
@@ -54,7 +54,23 @@ ENTRY(memcpy)
 	 */
 	cmp  %dil, %sil
 	jl .Lcopy_backward
-	subl $0x20, %edx
+
+	/*
+	 * Copy the first 8 bytes with one (possibly unaligned) store, then
+	 * round the destination up to the next 8-byte boundary. The loop's
+	 * stores may overlap that first store but are 8-byte aligned, so they
+	 * never cross a cache line (its size is a multiple of 8 bytes).
+	 */
+	movq (%rsi), %rcx
+	movq %rdi, %r8
+	addq $8, %rdi
+	andq $-8, %rdi
+	movq %rcx, (%r8)
+	subq %rdi, %r8
+	addq %r8, %rdx
+	subq %r8, %rsi
+	
+	subq $0x20, %rdx
 .Lcopy_forward_loop:
 	subq $0x20,	%rdx
 
@@ -74,20 +90,31 @@ ENTRY(memcpy)
 	leaq 4*8(%rdi),	%rdi
 	jae  .Lcopy_forward_loop
 	addq $0x20,	%rdx
-	jmp  .Lhandle_tail
+	jmp   .Lless_32bytes
 
+	.p2align 4
 .Lcopy_backward:
 	/*
 	 * Calculate copy position to tail.
 	 */
 	addq %rdx,	%rsi
 	addq %rdx,	%rdi
-	subq $0x20,	%rdx
+
 	/*
-	 * At most 3 ALU operations in one cycle,
-	 * so append NOPS in the same 16bytes trunk.
+	 * Copy the last 8 bytes with one (possibly unaligned) store, then
+	 * round the destination end down to an 8-byte boundary. The loop's
+	 * stores may overlap that last store but are 8-byte aligned, so they
+	 * never cross a cache line (its size is a multiple of 8 bytes).
 	 */
-	.p2align 4
+	movq -8(%rsi), %rcx
+	movq %rdi, %r8
+	andq $-8, %rdi
+	movq %rcx, -8(%r8)
+	subq %rdi, %r8
+	subq %r8, %rdx
+	subq %r8, %rsi
+
+	subq $0x20,	%rdx
 .Lcopy_backward_loop:
 	subq $0x20,	%rdx
 	movq -1*8(%rsi),	%r8
@@ -108,7 +135,31 @@ ENTRY(memcpy)
 	addq $0x20,	%rdx
 	subq %rdx,	%rsi
 	subq %rdx,	%rdi
+	jmp   .Lless_32bytes
+
+	.p2align 4
 .Lhandle_tail:
+
+	cmpq $32,	%rdx
+	jb   .Lless_32bytes
+
+	/*
+	 * Copy 32 to 39 bytes using overlapping 8-byte moves.
+	 */
+	movq 0*8(%rsi), %rcx
+	movq 1*8(%rsi),	%r8
+	movq -3*8(%rsi, %rdx),	%r9
+	movq -2*8(%rsi, %rdx),	%r10
+	movq -1*8(%rsi, %rdx),	%r11
+	movq %rcx,	0*8(%rdi)
+	movq %r8,	1*8(%rdi)
+	movq %r9,	-3*8(%rdi, %rdx)
+	movq %r10,	-2*8(%rdi, %rdx)
+	movq %r11,	-1*8(%rdi, %rdx)
+	retq
+
+	.p2align 4
+.Lless_32bytes:
 	cmpq $16,	%rdx
 	jb   .Lless_16bytes
 
-- 
1.6.5.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/