RE: [PATCH RFC] [X86] Optimize memcpy by avoiding memory false dependence

From: Ma, Ling
Date: Wed Jun 30 2010 - 05:00:13 EST


Hi Ingo

We collected the comparison results below with the attached micro-benchmark on Core i7
(built with: gcc -O2 -o memcpy-kernel memcpy-kernel.c).

LAT: Len  127, alignment  4/16: improvement: 2X
LAT: Len  127, alignment  0/16: improvement: 2X
LAT: Len 1024, alignment  4/16: improvement: 1.5X
LAT: Len 1024, alignment  0/ 0: no change
LAT: Len 4096, alignment  4/16: improvement: 1.6X
LAT: Len 4096, alignment  0/ 8: improvement: 1.37X
LAT: Len 8192, alignment 16/ 0: no change
LAT: Len 8192, alignment  0/16: improvement: 1.45X

Any comments from you?

Thanks
Ling

> -----Original Message-----
> From: Ma, Ling
> Sent: Tuesday, June 29, 2010 3:24 AM
> To: mingo@xxxxxxx
> Cc: hpa@xxxxxxxxx; tglx@xxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Ma,
> Ling
> Subject: [PATCH RFC] [X86] Optimize memcpy by avoiding memory false
> dependence
>
> From: Ma Ling <ling.ma@xxxxxxxxx>
>
> All read operations after the allocation stage can run speculatively,
> while all writes execute in program order; a read may run before an
> older write if their addresses differ, otherwise it has to wait until
> the write commits. However, the CPU does not compare every address
> bit, so a read can fail to recognize that its address differs from an
> older write's even when the two are in different pages. For example,
> if %rsi is 0xf004 and %rdi is 0xe008, the following sequence incurs a
> large latency:
> 1. movq (%rsi), %rax
> 2. movq %rax, (%rdi)
> 3. movq 8(%rsi), %rax
> 4. movq %rax, 8(%rdi)
>
> If %rsi and %rdi really were in the same memory page, there would be a
> true read-after-write dependence: instruction 2 writes offset 0x008 and
> instruction 3 reads offset 0x00c, so the two accesses partially overlap.
> In fact they are in different pages and there is no real dependence,
> but because the CPU does not compare every address bit it may assume
> they are in the same page. Instruction 3 then has to wait for
> instruction 2 to move its data from the write buffer into the cache
> before loading it back, and the time the read spends waiting is
> comparable to an mfence instruction. We can avoid this by reordering
> the operations as follows:
>
> 1. movq 8(%rsi), %rax
> 2. movq %rax, 8(%rdi)
> 3. movq (%rsi), %rax
> 4. movq %rax, (%rdi)
>
> Instruction 3 now reads offset 0x004 while instruction 2 writes offset
> 0x010, so there is no dependence at all.
> With this ordering we measured a 1.83x speedup on Core 2 compared with
> the original instruction sequence.
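>
> For illustration, a minimal C sketch of this partial-address check,
> assuming the CPU compares only the low 12 address bits (the page
> offset) when it disambiguates a load against older stores; the exact
> number of bits compared is implementation specific, and the helper
> name is purely illustrative:
>
> 	#include <stdint.h>
>
> 	/* Could a 'size'-byte load at 'load' be mistaken for aliasing an
> 	 * older 'size'-byte store at 'store' if only bits 0..11 of the
> 	 * addresses are compared? */
> 	static int may_false_alias(uintptr_t store, uintptr_t load,
> 				   unsigned int size)
> 	{
> 		uintptr_t s = store & 0xfff, l = load & 0xfff;
>
> 		return l < s + size && s < l + size;
> 	}
>
> 	/* Original order: store to 0xe008, then load from 0xf004 + 8:
> 	 *	may_false_alias(0xe008, 0xf00c, 8) == 1	-> stall
> 	 * Reordered: store to 0xe008 + 8, then load from 0xf004:
> 	 *	may_false_alias(0xe010, 0xf004, 8) == 0	-> no stall */
>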
> In this patch we first handle small sizes (less than 32 bytes, 0x20),
> then jump to the appropriate copy mode. Based on our micro-benchmark,
> for small copies of 1 to 127 bytes we see up to a 2X improvement, and
> up to a 1.5X improvement for 1024 bytes on Core i7.
> (These numbers come from our own micro-benchmark; we will run further
> tests according to your requirements.)
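>
> As a reading aid, a rough and simplified C sketch of the control flow
> the assembly below follows (memcpy_sketch is illustrative only; the
> real tail handling uses overlapping 8-, 4- and 1-byte moves rather
> than a byte loop, and the 32-byte memcpy call stands in for the
> 4x8-byte move block):
>
> 	#include <stddef.h>
> 	#include <stdint.h>
> 	#include <string.h>
>
> 	static void *memcpy_sketch(void *dst, const void *src, size_t len)
> 	{
> 		unsigned char *d = dst;
> 		const unsigned char *s = src;
>
> 		if (len >= 32) {
> 			/* cmp %dil, %sil; jl: compare only the low bytes of
> 			 * the pointers (signed) to pick a copy direction that
> 			 * keeps stores behind the following loads' offsets. */
> 			if ((signed char)(uintptr_t)s < (signed char)(uintptr_t)d) {
> 				d += len;		/* copy backward */
> 				s += len;
> 				while (len >= 32) {
> 					len -= 32;
> 					d -= 32;
> 					s -= 32;
> 					memcpy(d, s, 32);
> 				}
> 				d -= len;		/* back to the head */
> 				s -= len;
> 			} else {
> 				while (len >= 32) {	/* copy forward */
> 					memcpy(d, s, 32);
> 					d += 32;
> 					s += 32;
> 					len -= 32;
> 				}
> 			}
> 		}
> 		while (len--)				/* 0..31 byte tail */
> 			*d++ = *s++;
> 		return dst;
> 	}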
>
> Thanks
> Ling
>
> ---
> arch/x86/lib/memcpy_64.S | 158 ++++++++++++++++++++++++++++++----------------
> 1 files changed, 103 insertions(+), 55 deletions(-)
>
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index f82e884..5902438 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -40,84 +40,132 @@
> ENTRY(__memcpy)
> ENTRY(memcpy)
> CFI_STARTPROC
> + movq %rdi, %rax
>
> /*
> - * Put the number of full 64-byte blocks into %ecx.
> - * Tail portion is handled at the end:
> + * Use 32bit CMP here to avoid long NOP padding.
> */
> - movq %rdi, %rax
> - movl %edx, %ecx
> - shrl $6, %ecx
> - jz .Lhandle_tail
> + cmp $0x20, %edx
> + jb .Lhandle_tail
>
> - .p2align 4
> -.Lloop_64:
> /*
> - * We decrement the loop index here - and the zero-flag is
> - * checked at the end of the loop (instructions inbetween do
> - * not change the zero flag):
> + * We check whether memory false dependence could occur,
> + * then jump to corresponding copy mode.
> */
> - decl %ecx
> + cmp %dil, %sil
> + jl .Lcopy_backward
> + subl $0x20, %edx
> +.Lcopy_forward_loop:
> + subq $0x20, %rdx
>
> /*
> - * Move in blocks of 4x16 bytes:
> + * Move in blocks of 4x8 bytes:
> */
> - movq 0*8(%rsi), %r11
> - movq 1*8(%rsi), %r8
> - movq %r11, 0*8(%rdi)
> - movq %r8, 1*8(%rdi)
> -
> - movq 2*8(%rsi), %r9
> - movq 3*8(%rsi), %r10
> - movq %r9, 2*8(%rdi)
> - movq %r10, 3*8(%rdi)
> -
> - movq 4*8(%rsi), %r11
> - movq 5*8(%rsi), %r8
> - movq %r11, 4*8(%rdi)
> - movq %r8, 5*8(%rdi)
> -
> - movq 6*8(%rsi), %r9
> - movq 7*8(%rsi), %r10
> - movq %r9, 6*8(%rdi)
> - movq %r10, 7*8(%rdi)
> -
> - leaq 64(%rsi), %rsi
> - leaq 64(%rdi), %rdi
> -
> - jnz .Lloop_64
> + movq 0*8(%rsi), %r8
> + movq 1*8(%rsi), %r9
> + movq 2*8(%rsi), %r10
> + movq 3*8(%rsi), %r11
> + leaq 4*8(%rsi), %rsi
> +
> + movq %r8, 0*8(%rdi)
> + movq %r9, 1*8(%rdi)
> + movq %r10, 2*8(%rdi)
> + movq %r11, 3*8(%rdi)
> + leaq 4*8(%rdi), %rdi
> + jae .Lcopy_forward_loop
> + addq $0x20, %rdx
> + jmp .Lhandle_tail
> +
> +.Lcopy_backward:
> + /*
> + * Calculate copy position to tail.
> + */
> + addq %rdx, %rsi
> + addq %rdx, %rdi
> + subq $0x20, %rdx
> + /*
> + * At most 3 ALU operations in one cycle,
> + * so append NOPs in the same 16-byte chunk.
> + */
> + .p2align 4
> +.Lcopy_backward_loop:
> + subq $0x20, %rdx
> + movq -1*8(%rsi), %r8
> + movq -2*8(%rsi), %r9
> + movq -3*8(%rsi), %r10
> + movq -4*8(%rsi), %r11
> + leaq -4*8(%rsi), %rsi
> + movq %r8, -1*8(%rdi)
> + movq %r9, -2*8(%rdi)
> + movq %r10, -3*8(%rdi)
> + movq %r11, -4*8(%rdi)
> + leaq -4*8(%rdi), %rdi
> + jae .Lcopy_backward_loop
>
> + /*
> + * Calculate copy position to head.
> + */
> + addq $0x20, %rdx
> + subq %rdx, %rsi
> + subq %rdx, %rdi
> .Lhandle_tail:
> - movl %edx, %ecx
> - andl $63, %ecx
> - shrl $3, %ecx
> - jz .Lhandle_7
> + cmpq $16, %rdx
> + jb .Lless_16bytes
>
> + /*
> + * Move data from 16 bytes to 31 bytes.
> + */
> + movq 0*8(%rsi), %r8
> + movq 1*8(%rsi), %r9
> + movq -2*8(%rsi, %rdx), %r10
> + movq -1*8(%rsi, %rdx), %r11
> + movq %r8, 0*8(%rdi)
> + movq %r9, 1*8(%rdi)
> + movq %r10, -2*8(%rdi, %rdx)
> + movq %r11, -1*8(%rdi, %rdx)
> + retq
> .p2align 4
> -.Lloop_8:
> - decl %ecx
> - movq (%rsi), %r8
> - movq %r8, (%rdi)
> - leaq 8(%rdi), %rdi
> - leaq 8(%rsi), %rsi
> - jnz .Lloop_8
> -
> -.Lhandle_7:
> - movl %edx, %ecx
> - andl $7, %ecx
> - jz .Lend
> +.Lless_16bytes:
> + cmpq $8, %rdx
> + jb .Lless_8bytes
> + /*
> + * Move data from 8 bytes to 15 bytes.
> + */
> + movq 0*8(%rsi), %r8
> + movq -1*8(%rsi, %rdx), %r9
> + movq %r8, 0*8(%rdi)
> + movq %r9, -1*8(%rdi, %rdx)
> + retq
> + .p2align 4
> +.Lless_8bytes:
> + cmpq $4, %rdx
> + jb .Lless_3bytes
>
> + /*
> + * Move data from 4 bytes to 7 bytes.
> + */
> + movl (%rsi), %ecx
> + movl -4(%rsi, %rdx), %r8d
> + movl %ecx, (%rdi)
> + movl %r8d, -4(%rdi, %rdx)
> + retq
> .p2align 4
> +.Lless_3bytes:
> + cmpl $0, %edx
> + je .Lend
> + /*
> + * Move data from 1 bytes to 3 bytes.
> + */
> .Lloop_1:
> movb (%rsi), %r8b
> movb %r8b, (%rdi)
> incq %rdi
> incq %rsi
> - decl %ecx
> + decl %edx
> jnz .Lloop_1
>
> .Lend:
> - ret
> + retq
> CFI_ENDPROC
> ENDPROC(memcpy)
> ENDPROC(__memcpy)
> --
> 1.6.5.2

/* memcpy-kernel.c: attached micro-benchmark */
#include <stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define MAXSAMPLESTPT 1000
#define MAXCOPYSIZE (1024 * 1024 * 100)
#define ORIG 0
#define NEW 1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))

#define HP_TIMING_TOTAL(total_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      total_time += tmptime; \
    } \
  while (0)

#define HP_TIMING_BEST(best_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      if (best_time > tmptime) \
        best_time = tmptime; \
    } \
  while (0)
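
/*
 * Note: _dl_hp_timing_overhead above is declared but never assigned, so
 * the macros effectively subtract zero.  A minimal sketch of how it
 * could be calibrated (take the best of many back-to-back RDTSC pairs);
 * this helper is illustrative and is not called anywhere in the harness:
 */
static void
calibrate_timing_overhead (void)
{
  hp_timing_t start, stop, best = ~ (hp_timing_t) 0;
  int i;

  for (i = 0; i < MAXSAMPLESTPT; ++i)
    {
      HP_TIMING_NOW (start);
      HP_TIMING_NOW (stop);
      if (stop - start < best)
        best = stop - start;
    }
  _dl_hp_timing_overhead = best;
}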


void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test (char *dst, char *src, size_t len)
{
  hp_timing_t start __attribute__ ((unused));
  hp_timing_t stop __attribute__ ((unused));
  hp_timing_t best_time = ~ (hp_timing_t) 0;
  size_t i;

  for (i = 0; i < repeat_one_test; ++i)
    {
      HP_TIMING_NOW (start);
      do_memcpy (dst, src, len);
      HP_TIMING_NOW (stop);
      HP_TIMING_BEST (best_time, start, stop);
    }

  printf ("\t%zu", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);

  printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);
  do_memcpy = memcpy_orig;
  do_one_test (s2, s1, len);
  do_memcpy = memcpy_new;
  do_one_test (s2, s1, len);

  putchar ('\n');
}

static void
test_init (void)
{
  int i;

  buf1 = valloc (MAXCOPYSIZE);
  buf2 = valloc (MAXCOPYSIZE);

  for (i = 0; i < MAXCOPYSIZE; i += 64)
    buf1[i] = buf2[i] = i & 0xff;
}

/* Leftover memset helpers; they are not called by this benchmark. */
void memset_c(char *dst, char *src, int len)
{
__asm__("mov %rdx, %rcx");
__asm__("shr $3, %rcx");
__asm__("rep stosq");
}
void memset_2(char *dst, char *src, int len)
{
__asm__("sub $128, %rdx");
__asm__("1:");
__asm__("sub $128, %rdx");
__asm__("movdqa %xmm0, (%rdi)");
__asm__("movdqa %xmm0, 16(%rdi)");
__asm__("movdqa %xmm0, 32(%rdi)");
__asm__("movdqa %xmm0, 48(%rdi)");
__asm__("movdqa %xmm0, 64(%rdi)");
__asm__("movdqa %xmm0, 80(%rdi)");
__asm__("movdqa %xmm0, 96(%rdi)");
__asm__("movdqa %xmm0, 112(%rdi)");
__asm__("jae 1b");

}

/* The proposed memcpy from the patch above, expressed as basic asm. */
void memcpy_new(char *dst, char *src, int len)
{

__asm__("movq %rdi, %rax");
__asm__("cmp $0x20, %edx");
__asm__("jb 1f");

/*
* We check whether memory false dependence could occur,
* then jump to corresponding copy mode.
*/
__asm__("cmp %dil, %sil");
__asm__("jl 2f");
__asm__("subl $0x20, %edx");
__asm__("3:");
__asm__("subq $0x20, %rdx");

/*
* Move in blocks of 4x8 bytes:
*/
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq 2*8(%rsi), %r10");
__asm__("movq 3*8(%rsi), %r11");
__asm__("leaq 4*8(%rsi), %rsi");

__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, 2*8(%rdi)");
__asm__("movq %r11, 3*8(%rdi)");
__asm__("leaq 4*8(%rdi), %rdi");
__asm__("jae 3b");
__asm__("addq $0x20, %rdx");
__asm__("jmp 1f");

__asm__("2:");
/*
* Calculate copy position to tail.
*/
__asm__("addq %rdx, %rsi");
__asm__("addq %rdx, %rdi");
__asm__("subq $0x20, %rdx");
__asm__(".p2align 4");
__asm__("4:");
__asm__("subq $0x20, %rdx");
__asm__("movq -1*8(%rsi), %r8");
__asm__("movq -2*8(%rsi), %r9");
__asm__("movq -3*8(%rsi), %r10");
__asm__("movq -4*8(%rsi), %r11");
__asm__("leaq -4*8(%rsi), %rsi");
__asm__("movq %r8, -1*8(%rdi)");
__asm__("movq %r9, -2*8(%rdi)");
__asm__("movq %r10, -3*8(%rdi)");
__asm__("movq %r11, -4*8(%rdi)");
__asm__("leaq -4*8(%rdi), %rdi");
__asm__("jae 4b");

/*
* Calculate copy position to head.
*/
__asm__("addq $0x20, %rdx");
__asm__("subq %rdx, %rsi");
__asm__("subq %rdx, %rdi");
__asm__("1:");
__asm__("cmpq $16, %rdx");
__asm__("jb 5f");

/*
* Move data from 16 bytes to 31 bytes.
*/
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq -2*8(%rsi, %rdx), %r10");
__asm__("movq -1*8(%rsi, %rdx), %r11");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, -2*8(%rdi, %rdx)");
__asm__("movq %r11, -1*8(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("5:");
__asm__("cmpq $8, %rdx");
__asm__("jb 6f");
/*
* Move data from 8 bytes to 15 bytes.
*/
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq -1*8(%rsi, %rdx), %r9");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, -1*8(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("6:");
__asm__("cmpq $4, %rdx");
__asm__("jb 7f");

/*
* Move data from 4 bytes to 7 bytes.
*/
__asm__("movl (%rsi), %ecx");
__asm__("movl -4(%rsi, %rdx), %r8d");
__asm__("movl %ecx, (%rdi)");
__asm__("movl %r8d, -4(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("7:");
__asm__("cmpl $0, %edx");
__asm__("je 8f");
/*
* Move data from 1 bytes to 3 bytes.
*/
__asm__("9:");
__asm__("movb (%rsi), %r8b");
__asm__("movb %r8b, (%rdi)");
__asm__("incq %rdi");
__asm__("incq %rsi");
__asm__("decl %edx");
__asm__("jnz 9b");

__asm__("8:");
}

/* The pre-patch kernel memcpy loop (arch/x86/lib/memcpy_64.S), expressed
 * as basic asm for comparison. */
void memcpy_orig(char *dst, char *src, int len)
{
__asm__("mov %rdi, %rax");
__asm__("movl %edx, %ecx");
__asm__("shrl $6, %ecx");
__asm__("jz 2f");
__asm__("mov $0x80, %r8d "); /*aligned case for loop 1 */
__asm__("1:");
__asm__("decl %ecx");
__asm__("mov 0*8(%rsi), %r11");
__asm__("mov 1*8(%rdi), %r8");
__asm__("mov %r11, 0*8(%rdi)");
__asm__("mov %r8, 1*8(%rdi)");
__asm__("mov 2*8(%rsi), %r9");
__asm__("mov 3*8(%rdi), %r10");
__asm__("mov %r9, 2*8(%rdi)");
__asm__("mov %r10, 3*8(%rdi)");
__asm__("mov 4*8(%rsi), %r11");
__asm__("mov 5*8(%rdi), %r8");
__asm__("mov %r11, 4*8(%rdi)");
__asm__("mov %r8, 5*8(%rdi)");
__asm__("mov 6*8(%rsi), %r9");
__asm__("mov 7*8(%rdi), %r10");
__asm__("mov %r9, 6*8(%rdi)");
__asm__("mov %r10, 7*8(%rdi)");
__asm__("leaq 64(%rsi), %rsi");
__asm__("leaq 64(%rdi), %rdi");
__asm__("jnz 1b");
__asm__("2:");
__asm__("movl %edx, %ecx");
__asm__("andl $63, %ecx");
__asm__("shrl $3, %ecx");
__asm__("jz 4f");
__asm__("3:");
__asm__("decl %ecx");
__asm__("mov (%rsi), %r8");
__asm__("mov %r8, (%rdi)");
__asm__("leaq 8(%rdi), %rdi");
__asm__("leaq 8(%rsi), %rsi");
__asm__("jnz 3b");
__asm__("4:");
__asm__("movl %edx, %ecx");
__asm__("andl $7, %ecx");
__asm__("jz 6f");
__asm__("5:");
__asm__("movb (%rsi), %r8b");
__asm__("movb %r8b, (%rdi)");
__asm__("incq %rdi");
__asm__("incq %rsi");
__asm__("decl %ecx");
__asm__("jnz 5b");
__asm__("6:");
}
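
/*
 * Note on the two routines above: they use basic asm statements and rely
 * on the SysV AMD64 ABI keeping dst/src/len live in %rdi/%rsi/%rdx and on
 * gcc -O2 not reusing the registers they clobber.  That happens to hold
 * for this benchmark, but a more defensive style is extended asm with
 * explicit operands and clobbers.  A minimal sketch (a plain rep movsb
 * reference copy, not one of the routines being compared):
 */
static void memcpy_rep_movsb(char *dst, const char *src, size_t len)
{
	/* %rdi/%rsi/%rcx are tied to the C operands and updated in place;
	 * the "memory" clobber tells the compiler memory is read/written. */
	__asm__ volatile ("rep movsb"
			  : "+D" (dst), "+S" (src), "+c" (len)
			  :
			  : "memory");
}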


int main(void)
{
  int i;

  test_init();
  printf ("%23s", "");
  printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");
  for (i = 0; i < 128; i++) {
    do_test(0, 0, i);
    do_test(0, 8, i);
    do_test(4, 16, i);
    do_test(0, 16, i);
  }
  do_test(4, 16, 1024);
  do_test(0, 0, 1024);
  do_test(4, 16, 4096);
  do_test(0, 8, 4096);
  do_test(0, 16, 4096);
  do_test(0, 64, 4096);
  do_test(0, 0, 4096);
  do_test(16, 0, 4096*2);
  do_test(0, 16, 4096*2);
  return 0;
}