RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast string.

From: Ma, Ling
Date: Thu Nov 12 2009 - 02:43:25 EST


Hi H. Peter Anvin

After running the test program in my attachment (memcpy.c) on a Nehalem platform,
when the copy size is less than 1024 bytes the memcpy_c function shows a very big
regression compared with the original memcpy function. I think we have to combine
the original memcpy and memcpy_c for Nehalem and other modern CPUs, so memcpy_new
is on the right track.

Thanks
Ling

>-----Original Message-----
>From: H. Peter Anvin [mailto:hpa@xxxxxxxxx]
>Sent: 2009年11月12日 13:27
>To: Ma, Ling
>Cc: Cyrill Gorcunov; Ingo Molnar; Ingo Molnar; Thomas Gleixner; linux-kernel
>Subject: Re: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast
>string.
>
>On 11/11/2009 08:49 PM, Ma, Ling wrote:
>> Hi All
>> The attachment is latest memcpy.c, please update by
>> "cc -o memcpy memcpy.c -O2 -m64".
>
>OK... given that there seems to be no point since the actual code we're
>talking about modifying doesn't ever actually get executed on the real
>kernel, we can just drop this, right?
>
> -hpa
>
>--
>H. Peter Anvin, Intel Open Source Technology Center
>I work for Intel. I don't speak on their behalf.

#include<stdio.h>
#include <stdlib.h>


/* A time-stamp-counter reading (or a difference of two readings), in TSC ticks. */
typedef unsigned long long int hp_timing_t;

/* Number of back-to-back copies timed by one measurement. */
#define MAXSAMPLESTPT 1000
/* Size in bytes of each of the two benchmark buffers. */
#define MAXCOPYSIZE (1024 * 32)
/* Variant indices; not referenced in the visible part of this file. */
#define ORIG 0
#define NEW 1

/* Copy source (buf1) and copy destination (buf2); allocated by test_init(). */
static char *buf1 = NULL;
static char *buf2 = NULL;

/*
 * Tick count subtracted from every interval by HP_TIMING_TOTAL and
 * HP_TIMING_BEST.  Nothing in this file assigns it, so it stays at its
 * zero initial value.
 */
hp_timing_t _dl_hp_timing_overhead;

/*
 * Store the current TSC value in Var.
 *
 * RDTSC returns the counter split across EDX:EAX, so both halves are read
 * as 32-bit values and recombined.  __asm__ is used rather than asm because
 * the plain keyword is not available when compiling with -std=c99/-std=c11.
 */
#define HP_TIMING_NOW(Var) \
({ unsigned int _lo, _hi; \
   __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
   (Var) = ((hp_timing_t) _hi << 32) | _lo; })

/* Diff = End - Start. */
#define HP_TIMING_DIFF(Diff, Start, End) ((Diff) = ((End) - (Start)))

/*
 * Add the interval [start, end], minus the measurement overhead, to
 * total_time.  Arguments are parenthesized so that expressions such as
 * `a ? b : c` can be passed safely.
 */
#define HP_TIMING_TOTAL(total_time, start, end) \
do \
  { \
    hp_timing_t tmptime; \
    HP_TIMING_DIFF (tmptime, (start) + _dl_hp_timing_overhead, (end)); \
    (total_time) += tmptime; \
  } \
while (0)

/*
 * Lower best_time to the interval [start, end], minus the measurement
 * overhead, if that interval is shorter than the current best_time.
 */
#define HP_TIMING_BEST(best_time, start, end) \
do \
  { \
    hp_timing_t tmptime; \
    HP_TIMING_DIFF (tmptime, (start) + _dl_hp_timing_overhead, (end)); \
    if ((best_time) > tmptime) \
      (best_time) = tmptime; \
  } \
while (0)


/*
 * The three copy routines being compared.  Each copies len bytes from src
 * to dst and is implemented further down with x86-64 inline assembly.
 */
void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
/*
 * Routine used by the next measurement: do_tpt_test() points it at each
 * variant in turn and do_one_throughput() calls through it.
 */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time MAXSAMPLESTPT back-to-back calls of the routine currently selected
 * in do_memcpy and print the elapsed TSC ticks as one tab-prefixed column.
 *
 * dst, src: destination and source passed unchanged to every call.
 * len:      number of bytes copied per call (narrowed to the int the copy
 *           routines take).
 *
 * Only one interval is measured, so the printed "best" time is simply the
 * total for all MAXSAMPLESTPT calls minus _dl_hp_timing_overhead.
 */
static void
do_one_throughput (char *dst, char *src, size_t len)
{
	hp_timing_t start;
	hp_timing_t stop;
	hp_timing_t best_time = ~(hp_timing_t) 0;
	unsigned int leaf = 0;
	size_t i;

	/*
	 * CPUID is a serializing instruction: earlier instructions complete
	 * before the first TSC read.  Leaf 0 is requested explicitly so the
	 * instruction does not run with whatever happened to be in EAX.
	 */
	__asm__ __volatile__ ("cpuid" : "+a" (leaf) : : "ebx", "ecx", "edx");

	HP_TIMING_NOW (start);
	for (i = 0; i < MAXSAMPLESTPT; ++i) {
		do_memcpy (dst, src, (int) len);
	}
	HP_TIMING_NOW (stop);
	HP_TIMING_BEST (best_time, start, stop);

	/* hp_timing_t is unsigned long long, so print it as such. */
	printf ("\t%llu", (unsigned long long) best_time);
}

/*
 * Print one result row: the copy length and the two alignments, followed by
 * one timing column per copy variant (memcpy_orig, memcpy_new, memcpy_c, in
 * that order).
 *
 * align1: byte offset into buf1 at which the copy source starts.
 * align2: byte offset into buf2 at which the copy destination starts.
 * len:    number of bytes copied per call.
 */
static void
do_tpt_test (size_t align1, size_t align2, size_t len)
{
	void (*const variants[]) (char *dst, char *src, int len) = {
		memcpy_orig,
		memcpy_new,
		memcpy_c
	};
	char *from = buf1 + align1;
	char *to = buf2 + align2;
	size_t v;

	printf ("TPT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);

	/* Select each variant in turn and time it on the same buffers. */
	for (v = 0; v < sizeof variants / sizeof variants[0]; ++v) {
		do_memcpy = variants[v];
		do_one_throughput (to, from, len);
	}

	putchar ('\n');
}

/*
 * Allocate the two MAXCOPYSIZE-byte benchmark buffers and write one byte in
 * every 64-byte stride of each (presumably so the memory is touched before
 * any timing starts).
 *
 * Exits the program with a diagnostic if either allocation fails.  The
 * function used to rely on implicit int, which is not valid C99 or later;
 * it never returned a value, so it is declared void.
 */
static void test_init(void)
{
	int i;

	/* valloc() returns page-aligned memory. */
	buf1 = valloc(MAXCOPYSIZE);
	buf2 = valloc(MAXCOPYSIZE);
	if (buf1 == NULL || buf2 == NULL) {
		fprintf(stderr, "test_init: cannot allocate two %d-byte buffers\n",
			MAXCOPYSIZE);
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < MAXCOPYSIZE; i += 64) {
		buf1[i] = buf2[i] = (char) (i & 0xff);
	}
}

/*
 * Copy len bytes from src to dst with the string instructions:
 * len / 8 quadwords via "rep movsq", then the remaining len % 8 bytes via
 * "rep movsb".  len is treated as an unsigned 32-bit count.
 *
 * The instruction sequence is unchanged, but it now lives in a single
 * extended-asm statement.  The operands pin dst/src/len to RDI/RSI/EDX and
 * the clobber list names everything else that is modified, instead of
 * assuming the arguments are still in their incoming registers between
 * separate basic-asm statements.
 */
void memcpy_c(char *dst, char *src, int len)
{
	__asm__ __volatile__ (
		"movq %%rdi, %%rax\n\t"
		"movl %%edx, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"		/* ecx = number of quadwords */
		"andl $7, %%edx\n\t"		/* edx = trailing byte count */
		"rep movsq\n\t"
		"movl %%edx, %%ecx\n\t"
		"rep movsb"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rax", "rcx", "memory", "cc");
}
/*
 * Copy len bytes from src to dst, choosing the strategy by size
 * (len is treated as an unsigned 32-bit count):
 *
 *   len <  64    : quadword loop (label 3) then byte loop (label 5);
 *   64 <= len < 0x400 : 64-byte unrolled loop (label 1), then the tail loops;
 *   len >= 0x400 : "rep movsq" plus "rep movsb" for the last len % 8 bytes
 *                  (label 7).
 *
 * Fixes relative to the previous version:
 *   - every load in the unrolled loop now reads from the source (%rsi);
 *     the odd quadwords used to be read from the destination (%rdi), so
 *     half of each 64-byte block was never copied;
 *   - the bare "retq" is replaced by a jump to the end of the asm, so the
 *     compiler-generated epilogue always runs;
 *   - the code is one extended-asm statement with dst/src/len pinned to
 *     RDI/RSI/EDX and all scratch registers listed as clobbers.
 */
void memcpy_new(char *dst, char *src, int len)
{
	__asm__ __volatile__ (
		"movq %%rdi, %%rax\n\t"
		"movl %%edx, %%ecx\n\t"
		"shrl $6, %%ecx\n\t"		/* ecx = number of 64-byte blocks */
		"jz 2f\n\t"

		"cmpl $0x400, %%edx\n\t"
		"jae 7f\n"

		/*
		 * 64-byte unrolled loop.  The zero flag set by decl is still
		 * valid at the jnz: movq and leaq do not modify flags.
		 */
	"1:\n\t"
		"decl %%ecx\n\t"

		"movq 0*8(%%rsi), %%r11\n\t"
		"movq 1*8(%%rsi), %%r8\n\t"
		"movq %%r11, 0*8(%%rdi)\n\t"
		"movq %%r8, 1*8(%%rdi)\n\t"

		"movq 2*8(%%rsi), %%r9\n\t"
		"movq 3*8(%%rsi), %%r10\n\t"
		"movq %%r9, 2*8(%%rdi)\n\t"
		"movq %%r10, 3*8(%%rdi)\n\t"

		"movq 4*8(%%rsi), %%r11\n\t"
		"movq 5*8(%%rsi), %%r8\n\t"
		"movq %%r11, 4*8(%%rdi)\n\t"
		"movq %%r8, 5*8(%%rdi)\n\t"

		"movq 6*8(%%rsi), %%r9\n\t"
		"movq 7*8(%%rsi), %%r10\n\t"
		"movq %%r9, 6*8(%%rdi)\n\t"
		"movq %%r10, 7*8(%%rdi)\n\t"

		"leaq 64(%%rsi), %%rsi\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"

		"jnz 1b\n"

		/* Remaining whole quadwords: (len % 64) / 8. */
	"2:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $63, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"jz 4f\n"

	"3:\n\t"
		"decl %%ecx\n\t"
		"movq (%%rsi), %%r8\n\t"
		"movq %%r8, (%%rdi)\n\t"
		"leaq 8(%%rdi), %%rdi\n\t"
		"leaq 8(%%rsi), %%rsi\n\t"
		"jnz 3b\n"

		/* Remaining bytes: len % 8. */
	"4:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $7, %%ecx\n\t"
		"jz 6f\n"

	"5:\n\t"
		"movb (%%rsi), %%r8b\n\t"
		"movb %%r8b, (%%rdi)\n\t"
		"incq %%rdi\n\t"
		"incq %%rsi\n\t"
		"decl %%ecx\n\t"
		"jnz 5b\n"

	"6:\n\t"
		"jmp 8f\n"

		/*
		 * Large copies: string instructions.  "rep movsq" leaves the
		 * flags alone, so the jz tests the result of the andl.
		 */
	"7:\n\t"
		"movl %%edx, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"andl $7, %%edx\n\t"
		"rep movsq\n\t"
		"jz 8f\n\t"
		"movl %%edx, %%ecx\n\t"
		"rep movsb\n"

	"8:"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rax", "rcx", "r8", "r9", "r10", "r11", "memory", "cc");
}
/*
 * Copy len bytes from src to dst with plain loads and stores
 * (len is treated as an unsigned 32-bit count):
 * a 64-byte unrolled loop (label 1), then a quadword loop for
 * (len % 64) / 8 quadwords (label 3), then a byte loop for len % 8 bytes
 * (label 5).
 *
 * Fixes relative to the previous version:
 *   - every load in the unrolled loop now reads from the source (%rsi);
 *     the odd quadwords used to be read from the destination (%rdi), so
 *     half of each 64-byte block was never copied;
 *   - the code is one extended-asm statement with dst/src/len pinned to
 *     RDI/RSI/EDX and all scratch registers listed as clobbers, instead of
 *     separate basic-asm statements that assume the incoming registers.
 */
void memcpy_orig(char *dst, char *src, int len)
{
	__asm__ __volatile__ (
		"movq %%rdi, %%rax\n\t"
		"movl %%edx, %%ecx\n\t"
		"shrl $6, %%ecx\n\t"		/* ecx = number of 64-byte blocks */
		"jz 2f\n\t"

		/*
		 * "aligned case for loop 1": the value is never used (r8 is
		 * overwritten inside the loop); kept so the instruction stream
		 * ahead of label 1 is the same as before.
		 */
		"mov $0x80, %%r8d\n"

		/*
		 * 64-byte unrolled loop.  The zero flag set by decl is still
		 * valid at the jnz: movq and leaq do not modify flags.
		 */
	"1:\n\t"
		"decl %%ecx\n\t"

		"movq 0*8(%%rsi), %%r11\n\t"
		"movq 1*8(%%rsi), %%r8\n\t"
		"movq %%r11, 0*8(%%rdi)\n\t"
		"movq %%r8, 1*8(%%rdi)\n\t"

		"movq 2*8(%%rsi), %%r9\n\t"
		"movq 3*8(%%rsi), %%r10\n\t"
		"movq %%r9, 2*8(%%rdi)\n\t"
		"movq %%r10, 3*8(%%rdi)\n\t"

		"movq 4*8(%%rsi), %%r11\n\t"
		"movq 5*8(%%rsi), %%r8\n\t"
		"movq %%r11, 4*8(%%rdi)\n\t"
		"movq %%r8, 5*8(%%rdi)\n\t"

		"movq 6*8(%%rsi), %%r9\n\t"
		"movq 7*8(%%rsi), %%r10\n\t"
		"movq %%r9, 6*8(%%rdi)\n\t"
		"movq %%r10, 7*8(%%rdi)\n\t"

		"leaq 64(%%rsi), %%rsi\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"

		"jnz 1b\n"

		/* Remaining whole quadwords: (len % 64) / 8. */
	"2:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $63, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"jz 4f\n"

	"3:\n\t"
		"decl %%ecx\n\t"
		"movq (%%rsi), %%r8\n\t"
		"movq %%r8, (%%rdi)\n\t"
		"leaq 8(%%rdi), %%rdi\n\t"
		"leaq 8(%%rsi), %%rsi\n\t"
		"jnz 3b\n"

		/* Remaining bytes: len % 8. */
	"4:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $7, %%ecx\n\t"
		"jz 6f\n"

	"5:\n\t"
		"movb (%%rsi), %%r8b\n\t"
		"movb %%r8b, (%%rdi)\n\t"
		"incq %%rdi\n\t"
		"incq %%rsi\n\t"
		"decl %%ecx\n\t"
		"jnz 5b\n"

	"6:"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rax", "rcx", "r8", "r9", "r10", "r11", "memory", "cc");
}


/*
 * Benchmark driver: set up the buffers, print the column header, then print
 * one timing row per copy length -- every length from 0 to 63, followed by
 * 1023, 1024 and 2048 -- all with both buffers at offset 0.
 *
 * Returns 0.  (main must return int; it was previously declared void.)
 */
int main(void)
{
	int len;

	test_init();

	/* 23 blank columns, then the tab-separated variant names. */
	printf ("%23s", "");
	printf ("\t%s\t%s\t%s\n", "memcpy_orig", "memcpy_new", "memcpy_c");

	for (len = 0; len < 64; ++len) {
		do_tpt_test(0, 0, (size_t) len);
	}
	do_tpt_test(0, 0, 1023);
	do_tpt_test(0, 0, 1024);
	do_tpt_test(0, 0, 2048);

	return 0;
}